## 【例6.1】k-means聚类的代码示例（图 6.3，图 6.4）。

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Synthetic 2-D data: 300 points scattered around 4 tight blob centres.
X, y_true = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.5,
    random_state=0,
)

# Fit k-means with k=4 (fit() returns the estimator itself) and keep the
# cluster index assigned to every sample.
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
labels = kmeans.labels_

# Silhouette score of the resulting partition.
score = silhouette_score(X, labels)
print("Silhouette Score:", score)

# Plot the samples coloured by cluster, with the centroids on top.
# X and the centroid array both have two columns, so unpacking the transpose
# yields the x and y coordinates.
centers = kmeans.cluster_centers_
plt.scatter(*X.T, c=labels, cmap='viridis', edgecolors='k', s=50)
plt.scatter(*centers.T, c='red', marker='X', s=200, alpha=0.75, label='Centroids')
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("KMeans Clustering")
plt.legend()
plt.savefig('kmeans1.png', dpi=300)
plt.show()


In [None]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# Build a sample data set: 300 points generated around 4 blob centres.
X, y = make_blobs(n_samples=300, centers=4, random_state=42)

# Base estimator handed to the elbow search. n_init is set explicitly so the
# result does not depend on the installed scikit-learn version's default.
model = KMeans(n_init=10, random_state=42)

# Elbow method: fit the model for a range of k and plot the distortion.
visualizer = KElbowVisualizer(model, k=(1, 11), metric='distortion', timings=False)
visualizer.fit(X)

# First call writes the elbow plot to disk, second call displays it.
visualizer.show(outpath='elbow_visualization.jpg')
visualizer.show()

# Cluster with the k picked by the elbow heuristic. The visualizer leaves
# elbow_value_ as None when it cannot locate an elbow; fall back to 4 (the
# number of centres the data was generated with) instead of crashing KMeans.
best_k = visualizer.elbow_value_
if best_k is None:
    best_k = 4
    print('No elbow detected; falling back to k=4')
print(f'{best_k=}')
# Same n_init as the model used during the elbow search.
kmeans = KMeans(n_clusters=best_k, n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X)

# Plot the clustering on its own figure so it is never drawn onto the elbow
# axes when the backend keeps figures open after show().
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis', edgecolor='k')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', marker='X', s=200, label='Centroids')
plt.title('K-Means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()

# Save first, then display: show() may release the figure, after which
# savefig would write an empty image.
plt.savefig('kmeans_clustering.png')
plt.show()

# Silhouette plot for the fitted k-means model, again on a fresh figure.
plt.figure()
silhouette_visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick')
silhouette_visualizer.fit(X)
# Save the silhouette plot, then display it.
silhouette_visualizer.show(outpath='silhouette_visualization.png')
silhouette_visualizer.show()




In [None]:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# Sample data set: 300 points generated around 4 blob centres.
X, y = make_blobs(n_samples=300, centers=4, random_state=42)

# K-means estimator that the elbow visualizer will refit for each k.
model = KMeans(random_state=42)

# Elbow method on the distortion metric, without the timing curve.
visualizer = KElbowVisualizer(
    model,
    k=(1, 11),
    metric='distortion',
    timings=False,
)
visualizer.fit(X)
visualizer.show()


In [None]:
# This cell relies on X, KMeans and the elbow `visualizer` from the cell above.
# NOTE(review): the elbow search appears to leave `model` configured and
# fitted for the last k it tried, so reusing `model` here would draw the
# silhouette plot for that k instead of the selected one - confirm against
# the installed yellowbrick version. A fresh KMeans is fitted instead.
# The fallback to 4 (the number of generated blob centres) covers both "no
# elbow detected" and a re-run of this cell, after which `visualizer` has
# already been rebound to the silhouette visualizer.
best_k = getattr(visualizer, 'elbow_value_', None) or 4
visualizer = SilhouetteVisualizer(KMeans(n_clusters=best_k, random_state=42), colors='yellowbrick')
visualizer.fit(X)
visualizer.show()


## 【例6.2】层次聚类案例代码。

In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering, KMeans
import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# Generate a random data set: 300 points around 4 blob centres.
X, y = make_blobs(n_samples=300, centers=4, random_state=42)

# Split the data into 4 clusters with agglomerative (hierarchical) clustering.
model = AgglomerativeClustering(n_clusters=4)
y_pred = model.fit_predict(X)

# Plot the samples coloured by their hierarchical cluster.
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.title("Agglomerative Clustering")
plt.show()

# Elbow plot. Note that it is computed with KMeans, not with the
# agglomerative model above. The estimator is seeded so that repeated runs
# pick the same elbow.
plt.figure()
visualizer = KElbowVisualizer(KMeans(n_init=10, random_state=42), k=(2, 10))
visualizer.fit(X)
visualizer.show()

# Silhouette plot for the k chosen by the elbow heuristic. elbow_value_ is
# None when no elbow is found; fall back to 4, the number of generated centres.
best_k = visualizer.elbow_value_
if best_k is None:
    best_k = 4
plt.figure()
silhouette_visualizer = SilhouetteVisualizer(KMeans(n_clusters=best_k, n_init=10, random_state=42))
silhouette_visualizer.fit(X)
silhouette_visualizer.show()



In [None]:

from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt

# Random data set: 300 points around 4 blob centres.
X, y = make_blobs(n_samples=300, centers=4, random_state=42)

# Agglomerative clustering into 4 clusters; labels_ holds the cluster index
# assigned to each sample during fit().
model = AgglomerativeClustering(n_clusters=4)
y_pred = model.fit(X).labels_

# Scatter plot of the two feature columns, coloured by cluster.
plt.scatter(*X.T, c=y_pred)
plt.title("Agglomerative Clustering")
plt.show()


In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Random data set: 300 points around 4 blob centres.
X, y = make_blobs(n_samples=300, centers=4, random_state=42)

# Agglomerative clustering into 4 clusters.
model = AgglomerativeClustering(n_clusters=4)
y_pred = model.fit(X).labels_

# A four-entry colormap (red, green, blue, yellow), one colour per cluster.
colors = ListedColormap([
    '#FF0000',
    '#00FF00',
    '#0000FF',
    '#FFFF00',
])

# Scatter plot of the samples coloured through the custom colormap.
plt.scatter(*X.T, c=y_pred, cmap=colors)
plt.title("Agglomerative Clustering")
plt.show()


## 【例6.3】AP算法案例代码

In [None]:
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

# NOTE: X is not created in this cell; it must already exist as a 2-D sample
# array (for example from one of the make_blobs cells above).

# Run Affinity Propagation and collect labels plus exemplar indices.
# cluster_centers_indices is read from the model fitted here; previously it
# was never assigned in this cell, so the plot loop either raised NameError
# or silently used stale indices left over from another cell.
affinity_propagation = AffinityPropagation().fit(X)
labels = affinity_propagation.labels_
cluster_centers_indices = affinity_propagation.cluster_centers_indices_
# One cluster per exemplar, so the loop below can never index past the
# exemplar array.
n_clusters_ = len(cluster_centers_indices)

# Silhouette score of the partition (Euclidean distance).
silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
print(f'Silhouette Score: {silhouette_score}')

# Visualize the result: members as dots, the exemplar as a large marker, and
# a line from every member to its exemplar. Colours repeat after 7 clusters.
colors = cycle('bgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    cluster_center = X[cluster_centers_indices[k]]
    plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    for x in X[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

plt.title('Affinity Propagation clustering')
plt.show()


In [None]:
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs
import networkx as nx
import matplotlib.pyplot as plt

# Small social-network example graph (Zachary's karate club).
G = nx.karate_club_graph()

# Dense adjacency matrix as a plain ndarray. nx.to_numpy_matrix was removed
# in NetworkX 3.0, and recent scikit-learn versions reject np.matrix input.
adjacency_matrix = nx.to_numpy_array(G)

# Cluster the nodes with Affinity Propagation.
# NOTE(review): with the default affinity each adjacency row is treated as a
# feature vector, not as precomputed similarities - confirm this is intended.
affinity_propagation = AffinityPropagation(damping=0.5, preference=-10).fit(adjacency_matrix)

# Per-node cluster labels and the indices of the exemplar nodes.
labels = affinity_propagation.labels_
cluster_centers_indices = affinity_propagation.cluster_centers_indices_

# Lay the graph out and draw it once, coloured by cluster. The original drew
# the graph a second time on top, which hid the centres and node labels.
pos = nx.spring_layout(G)
plt.figure(figsize=(10, 8))
colors = [labels[node] for node in G.nodes()]
nx.draw(G, pos, with_labels=True, node_color=colors, cmap='viridis', node_size=800, font_size=8)

# Ring the exemplar nodes in red. A single scatter call gives a single legend
# entry; the guard covers a run that produced no exemplars.
if len(cluster_centers_indices) > 0:
    center_xy = np.array([pos[index] for index in cluster_centers_indices])
    plt.scatter(center_xy[:, 0], center_xy[:, 1], s=1100, facecolors='none',
                edgecolors='red', linewidths=3, label='Cluster Center')
    plt.legend()

plt.title("Social Network Clustering using Affinity Propagation")
plt.show()


In [None]:

from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.datasets import make_blobs

# Sample data set: 300 random points around 3 blob centres.
X, y = make_blobs(n_samples=300, centers=3, random_state=42)

# Cluster with Affinity Propagation using the default parameters.
clustering = AffinityPropagation()
clustering.fit(X)

# Per-sample labels and the indices of the exemplar (centre) samples.
labels = clustering.labels_
cluster_centers_indices = clustering.cluster_centers_indices_

# One cluster per exemplar.
n_clusters_ = len(cluster_centers_indices)

# Report the clustering.
print(f'聚类的数量: {n_clusters_}')
print(f'簇中心的索引: {cluster_centers_indices}')
print(f'每个样本的簇标签: {labels}')

# Evaluate the partition with the silhouette score (Euclidean distance).
silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
print(f'Silhouette Score: {silhouette_score}')


In [None]:
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.metrics.pairwise import pairwise_distances
import matplotlib.pyplot as plt

# Simulated social-network users: 100 users with two random features each.
np.random.seed(42)
num_users = 100
user_features = np.random.rand(num_users, 2)

# Pairwise Euclidean distances (0 = identical, large = dissimilar).
distance_matrix = pairwise_distances(user_features, metric='euclidean')

# Convert distances into similarities with a Gaussian (RBF) kernel, as the
# scikit-learn SpectralClustering documentation recommends for distance
# matrices. The original passed raw distances with
# affinity='nearest_neighbors', which treats each distance row as a feature
# vector. The mean distance is used as the kernel width.
delta = distance_matrix.mean()
similarity_matrix = np.exp(-distance_matrix ** 2 / (2.0 * delta ** 2))

# Spectral clustering on the precomputed similarity (affinity) matrix.
num_clusters = 3
spectral = SpectralClustering(n_clusters=num_clusters, affinity='precomputed', random_state=42)
user_labels = spectral.fit_predict(similarity_matrix)

# Plot the users in feature space, coloured by cluster; save, then show.
plt.scatter(user_features[:, 0], user_features[:, 1], c=user_labels, cmap='viridis', s=50)
plt.title('Spectral Clustering of Social Network Users')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.savefig('spectral.png', dpi=300)
plt.show()



In [None]:
import matplotlib.pyplot as plt
from itertools import cycle

# Imported here so the cell runs on its own; it previously depended on an
# earlier cell having imported these names.
from sklearn import metrics
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs

# Sample data set: 300 random points around 3 blob centres.
X, y = make_blobs(n_samples=300, centers=3, random_state=42)

# Cluster with Affinity Propagation.
clustering = AffinityPropagation().fit(X)

# Indices of the exemplar (centre) samples and the per-sample labels.
cluster_centers_indices = clustering.cluster_centers_indices_
labels = clustering.labels_

# One cluster per exemplar.
n_clusters_ = len(cluster_centers_indices)

# Report the clustering.
print(f'聚类的数量: {n_clusters_}')
print(f'簇中心的索引: {cluster_centers_indices}')
print(f'每个样本的簇标签: {labels}')

# Evaluate the partition with the silhouette score (Euclidean distance).
silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
print(f'Silhouette Score: {silhouette_score}')

# Visualize: members as dots, the exemplar as a large marker, and a line from
# every member to its exemplar. Colours repeat after 7 clusters.
colors = cycle('bgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    cluster_center = X[cluster_centers_indices[k]]
    plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    for x in X[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

plt.title('Affinity Propagation clustering')
plt.show()


## 【例6.4】ICA算法案例代码

In [None]:

from sklearn.decomposition import FastICA
import numpy as np
import matplotlib.pyplot as plt

# Seeded generator so the noise source, and therefore the figure, is
# reproducible like the other examples in this notebook.
rng = np.random.RandomState(0)

# Three source signals sampled at 2000 points: a sine wave, a square wave
# and Gaussian noise.
t = np.linspace(0, 10, 2000)
s1 = np.sin(2 * t)
s2 = np.sign(np.sin(3 * t))
s3 = rng.randn(2000)
S = np.c_[s1, s2, s3]

# Mixing matrix; each observed channel is a linear combination of the sources.
A = np.array([[1, 1, 1], [0.5, 2, 1.0], [1.5, 1.0, 2.0]])
X = np.dot(S, A.T)  # mixed (observed) signals

# Recover three independent components with FastICA (seeded as well).
ica = FastICA(n_components=3, random_state=0)
S_ = ica.fit_transform(X)

# Plot the original sources, the mixtures and the recovered signals.
plt.figure()
plt.subplot(3, 1, 1)
plt.plot(S)
plt.title('Original signals')
plt.subplot(3, 1, 2)
plt.plot(X)
plt.title('Mixed signals')
plt.subplot(3, 1, 3)
plt.plot(S_)
plt.title('ICA recovered signals')
plt.tight_layout()
plt.show()


##  【例6.5】NMF算法案例代码。

In [None]:


from sklearn.decomposition import NMF
from sklearn.datasets import load_iris

# Load the iris data set and take its feature matrix.
iris = load_iris()
X = iris.data

# Factorize X with NMF into two components: W is the transformed data
# returned by fit_transform, H the fitted components.
nmf = NMF(n_components=2)
W = nmf.fit_transform(X)
H = nmf.components_

# Print both factors, W first, one after the other.
print(W, H, sep='\n')


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import NMF
from sklearn.datasets import load_iris

# Load the iris data set and take its feature matrix.
iris = load_iris()
X = iris.data

# Factorize X with NMF into two components (W: transformed data, H: components).
nmf = NMF(n_components=2)
W = nmf.fit_transform(X)
H = nmf.components_

# Print both factors with a heading each.
print("W (Transformed Data):")
print(W)
print("\nH (Components):")
print(H)

# One figure with two side-by-side panels.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
left_panel, right_panel = axes

# Left panel: samples in the 2-D NMF space, coloured by iris class.
left_panel.scatter(*W.T, c=iris.target, cmap='viridis', edgecolor='k')
left_panel.set(
    title='NMF Transformed Data (W Matrix)',
    xlabel='NMF Feature 1',
    ylabel='NMF Feature 2',
)

# Right panel: the component matrix H shown as an image.
right_panel.imshow(H, cmap='viridis', aspect='auto', interpolation='nearest')
right_panel.set(
    title='NMF Components (H Matrix)',
    xlabel='Features',
    ylabel='NMF Components',
)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Toy shopping data: quantity of each product bought by nine customers.
data = {
    'Milk': [2, 5, 8, 1, 3, 6, 9, 4, 7],
    'Bread': [3, 6, 9, 2, 4, 7, 1, 5, 8]
}

df = pd.DataFrame(data)

# Standardize both columns before clustering.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# K-means on the standardized data, splitting the customers into two groups.
kmeans = KMeans(n_clusters=2, random_state=42)
df['Cluster'] = kmeans.fit_predict(scaled_data)

# The centroids live in standardized space, but the plot axes use the raw
# quantities. Map them back to the original units so they land among the
# points they describe.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)

# Plot customers coloured by cluster, with the centroids in raw units.
plt.scatter(df['Milk'], df['Bread'], c=df['Cluster'], cmap='viridis', edgecolor='k')
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='X', s=200, label='Centroids')
plt.title('Customer Segmentation based on Purchase Behavior')
plt.xlabel('Milk Quantity')
plt.ylabel('Bread Quantity')
plt.legend()
plt.savefig('bread.png', dpi=300)
plt.show()


## 【例6.6】孤立森林算法案例。

In [None]:
import numpy as np
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

# Simulated shopping amounts: 1000 normal values followed by 20 anomalies.
np.random.seed(42)
normal_data = np.random.normal(loc=100, scale=20, size=1000)
anomalies = np.random.normal(loc=300, scale=50, size=20)

# Stack both groups and reshape to a single-column 2-D array.
shopping_data = np.concatenate([normal_data, anomalies]).reshape(-1, 1)

# Fit an Isolation Forest; contamination is the expected share of anomalies.
clf = IsolationForest(contamination=0.02, random_state=42).fit(shopping_data)

# Predict an anomaly flag for every sample.
predictions = clf.predict(shopping_data)

# Plot amount against sample index, coloured by the prediction.
sample_index = range(len(shopping_data))
plt.scatter(sample_index, shopping_data, c=predictions, cmap='viridis')
plt.title('Isolation Forest for Anomaly Detection')
plt.xlabel('Data Point Index')
plt.ylabel('Shopping Amount')
plt.savefig('iso1.png', dpi=300)
plt.show()


In [None]:

from sklearn.ensemble import IsolationForest
import numpy as np

# Data set with inliers and outliers: two Gaussian clusters shifted to
# (+2, +2) and (-2, -2), plus 20 uniformly scattered outliers.
rng = np.random.RandomState(42)
X = 0.3 * rng.randn(100, 2)
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
X = np.concatenate([X + 2, X - 2, X_outliers])

# Fit the Isolation Forest, sharing the same random generator.
clf = IsolationForest(random_state=rng, contamination='auto')
clf.fit(X)

# Predict: -1 marks an anomaly, 1 marks a normal point.
y_pred = clf.predict(X)
print(y_pred)


##  【例6.7】DBSCAN算法案例。

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

# Simulated location data: 300 2-D points around 4 centres. Each row of X
# stands for one user's coordinates; the true labels are discarded.
X, _ = make_blobs(n_samples=300, centers=4, random_state=42, cluster_std=1.0)

# Cluster with DBSCAN; labels_ holds the assignment computed by fit().
dbscan = DBSCAN(eps=0.5, min_samples=5)
labels = dbscan.fit(X).labels_

# Plot the points coloured by cluster label; save, then show.
plt.scatter(*X.T, c=labels, cmap='viridis', edgecolor='k')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.title('DBSCAN Clustering of User Locations')
plt.savefig('dbscan.png', dpi=300)
plt.show()


In [None]:

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Random data: 1000 points around 8 blob centres.
X, y = make_blobs(n_samples=1000, centers=8, random_state=42)

# Build the DBSCAN model.
dbscan = DBSCAN(eps=0.5, min_samples=5)

# Fit the model; labels_ holds the cluster label assigned to every sample.
y_pred = dbscan.fit(X).labels_

# Plot the points coloured by cluster label.
plt.scatter(*X.T, c=y_pred, cmap='viridis')
plt.title('DBSCAN Clustering')
plt.show()


##  【例6.8】轮廓系数计算案例代码。

In [None]:

from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# 300 points around 4 blob centres.
X, y_true = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.60,
    random_state=0,
)
# Fit k-means with k=4 (fit() returns the estimator itself).
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
# Silhouette score of the fitted labelling.
score = silhouette_score(X, kmeans.labels_)
print(score)


## 【例6.9】轮廓系数可视化案例代码。

In [None]:

 
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.datasets import make_blobs

# Data set generated around 3 centres, 300 samples in total.
X, y = make_blobs(n_samples=300, centers=3, random_state=42)

# Build and fit the k-means model.
# NOTE(review): the model uses 4 clusters while the data has 3 centres -
# presumably deliberate, to show a poorly matched k; confirm.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
model.fit(X)

# Wrap the model in the silhouette visualizer and fit it on the same data.
visualizer = SilhouetteVisualizer(model)
visualizer.fit(X)

# Render the plot.
visualizer.show()


##  【例6.10】互信息案例代码。
互信息：使用sklearn.metrics.adjusted_mutual_info_score()函数计算。函数需要输入真实标签和聚类标签。例如：

In [None]:

from sklearn.metrics import adjusted_mutual_info_score
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# 300 points around 4 blob centres, with ground-truth labels.
X, y_true = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.60,
    random_state=0,
)
# Fit k-means with k=4 (fit() returns the estimator itself).
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
# Adjusted mutual information between true labels and cluster labels.
score = adjusted_mutual_info_score(y_true, kmeans.labels_)
print(score)


## 【例6.11】互信息可视化案例代码。

In [None]:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cluster import KMeans
from sklearn.metrics import mutual_info_score
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.features import JointPlotVisualizer
import matplotlib.pyplot as plt
# Generate a 6-class classification data set with 15 features.
X, y = make_classification(n_samples=1000, n_features=15, n_informative=10, n_classes=6, random_state=42)

# Search for the best k with the elbow method. The estimator is seeded so
# that repeated runs pick the same elbow.
model = KMeans(n_init=10, random_state=42)
visualizer = KElbowVisualizer(model, k=(2, 12))
visualizer.fit(X)
visualizer.show()

# Fit the final k-means model. elbow_value_ is None when no elbow is found;
# fall back to 6, the number of generated classes, instead of crashing KMeans.
k = visualizer.elbow_value_
if k is None:
    k = 6
print(k)
model = KMeans(k, random_state=42, n_init=10)
model.fit(X)

# Predict the cluster of every sample.
labels_pred = model.predict(X)

# Mutual information between the true classes and the predicted clusters.
score = mutual_info_score(y, labels_pred)
print(score)

# Joint plot of feature columns 0 and 1, on a fresh figure so it is not drawn
# onto the elbow plot.
# NOTE(review): `feature` is not a documented JointPlotVisualizer parameter
# as far as I can tell and is probably ignored - confirm or remove.
plt.figure()
jointplot = JointPlotVisualizer(columns=[0, 1], feature=y, title=f'Mutual Information {score=}')

# Fit the joint plot and render it; fit() alone never displays the figure.
jointplot.fit(X)
jointplot.show()


## 【例6.12】CH指数计算案例代码。

In [None]:

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score

# Generate random clustering sample data: 300 points around 4 centres.
X, y = make_blobs(n_samples=300, centers=4, random_state=42)

# Cluster with k-means (k=4).
kmeans = KMeans(n_clusters=4, random_state=42)
labels = kmeans.fit_predict(X)

# Compute the Calinski-Harabasz (CH) index. This example is the CH-index
# calculation, but the code previously computed the Davies-Bouldin index,
# which belongs to example 6.14.
ch_score = calinski_harabasz_score(X, labels)

print(f"CH指数: {ch_score}")


## 【例6.13】CH指数与簇数可视化案例代码。

In [None]:

import matplotlib.pyplot as plt
from sklearn.metrics import calinski_harabasz_score

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# Random clustering sample data: 300 points around 4 centres.
X, y = make_blobs(n_samples=300, centers=4, random_state=42)

# CH index of a k-means partition for every cluster count from 2 to 10.
cluster_numbers = range(2, 11)
ch_scores = [
    calinski_harabasz_score(
        X,
        KMeans(n_clusters=n_clusters, random_state=42).fit_predict(X),
    )
    for n_clusters in cluster_numbers
]

# Use the SimHei font for the Chinese labels and keep the minus sign
# rendering correctly with that font.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Line plot of the CH index against the number of clusters.
plt.plot(cluster_numbers, ch_scores, marker='o')
plt.title('CH指数随簇数量变化')
plt.xlabel('簇数量')
plt.ylabel('CH指数')
plt.show()


##  【例6.14】DB指数计算案例代码。

In [None]:

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

# Random clustering sample data: 300 points around 4 centres.
X, y = make_blobs(n_samples=300, centers=4, random_state=42)

# Davies-Bouldin index of a k-means partition for every cluster count from
# 2 to 10; db_scores[i] belongs to cluster_numbers[i].
cluster_numbers = range(2, 11)
db_scores = [
    davies_bouldin_score(
        X,
        KMeans(n_clusters=n_clusters, random_state=42).fit_predict(X),
    )
    for n_clusters in cluster_numbers
]


##  【例6.15】DB指数可视化案例代码。

In [None]:

from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
# Use the SimHei font for the Chinese labels and keep the minus sign
# rendering correctly with that font.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Line plot of the DB index against the number of clusters. cluster_numbers
# and db_scores are not defined in this cell; they come from the DB-index
# calculation cell (example 6.14).
plt.plot(cluster_numbers, db_scores, marker='o')
plt.title('DB指数随簇数量变化')
plt.xlabel('簇数量')
plt.ylabel('DB指数')
plt.show()
