# In [5]:
import copy
import itertools as it
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import silhouette_score

# Fixed random_state passed to every KMeans fit so reruns are reproducible.
SEED = 4933

# Candidate feature columns; every combination of two or more of them is
# clustered further down in this script.
all_columns = ['ko', 'cao', 'cuo', 'alo', 'sio']

# Input table; the script reads the columns listed above plus a `type` column.
data = pd.read_csv("../data_after_cleaning.csv")

# Display name for each value of the `type` column, used in output file names
# (0: "lead-barium class", 1: "high-potassium class").
type_name = {
    0 : "铅钡类",
    1 : "高钾类",
}

# SimHei is selected so the Chinese axis/legend labels can be rendered.
# That font has no glyph for the Unicode minus sign (U+2212), so negative
# tick labels (silhouette scores may be negative) would show as empty boxes;
# axes.unicode_minus=False makes matplotlib use the ASCII hyphen instead.
sns.set(font='SimHei', style='white', rc={'axes.unicode_minus': False})

def to_str(conbi):
    """Join the items of a combination into a single ``'+'``-separated label.

    Args:
        conbi: Iterable of items (typically column names). Items that are
            not strings are converted with ``str``.

    Returns:
        The items joined by ``'+'``, e.g. ``('ko', 'cao') -> 'ko+cao'``.
        An empty iterable yields the empty string.
    """
    # str.join handles the empty and single-item cases uniformly, and map(str)
    # lets every item (not only the first) be a non-string value.
    return '+'.join(map(str, conbi))

def _silhouette_scores(frame, columns, k_stop, seed):
    """Fit KMeans on ``frame[columns]`` and score each clustering.

    Args:
        frame: DataFrame holding the samples to cluster.
        columns: Names of the feature columns to cluster on.
        k_stop: Exclusive upper bound of the exponent ``k``; the fit for
            exponent ``k`` requests ``2**k`` clusters (capped, see below).
        seed: ``random_state`` handed to KMeans.

    Returns:
        List of ``(n_clusters, silhouette)`` tuples, one per distinct
        cluster count that was actually fitted.
    """
    X = frame[list(columns)]
    # The cluster count is capped at n_samples - 1, the largest number of
    # labels for which the silhouette coefficient is defined.
    max_clusters = len(X) - 1
    scores = []
    for k in range(2, k_stop):
        # NOTE(review): the 2**k schedule is kept from the original code;
        # confirm whether a linear sweep (n_clusters = k) was intended.
        n_clusters = min(2**k, max_clusters)
        labels = KMeans(n_clusters=n_clusters, random_state=seed).fit(X).labels_
        # Mean silhouette coefficient: the larger the value, the closer the
        # samples within a cluster and the farther apart different clusters
        # are, i.e. the better the clustering.
        scores.append((n_clusters, silhouette_score(X, labels)))
        if n_clusters == max_clusters:
            # Every larger k would be capped to the same cluster count and,
            # with a fixed seed, would only repeat this exact fit.
            break
    return scores


for type_ in range(2):
    data1 = data[data.type == type_]
    if len(data1) < 3:
        # With fewer than 3 samples no cluster count in [2, n_samples - 1]
        # exists, so neither KMeans nor the silhouette score can be computed.
        print(f'type {type_}: only {len(data1)} sample(s), skipped')
        continue
    for chosen_num in range(2, len(all_columns) + 1):
        results = []
        for conbi in it.combinations(all_columns, chosen_num):
            label = to_str(conbi)
            # Record the cluster count that was really fitted, so the
            # "k 值" axis matches what KMeans was asked to produce.
            results.extend(
                (label, n_clusters, sil)
                for n_clusters, sil in _silhouette_scores(
                    data1, conbi, 2**chosen_num, SEED
                )
            )
        draw_data = pd.DataFrame(results, columns=['成分组合', 'k 值', '轮廓系数'])

        # A fresh figure per plot: drawing on the implicit current axes would
        # carry the lines of earlier iterations into every later image.
        fig, ax = plt.subplots()
        sns.lineplot(x="k 值", y="轮廓系数", hue='成分组合', data=draw_data, ax=ax)
        ax.legend(loc=2, bbox_to_anchor=(1.05, 1.0), borderaxespad=0.)

        out_path = f'./k_means/{type_name[type_]}_{chosen_num}成分.png'
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        # bbox_inches='tight' keeps the legend, which sits outside the axes,
        # inside the saved image.
        fig.savefig(out_path, dpi=400, bbox_inches='tight')
        plt.close(fig)
