In [1]:
import re
import math
import warnings
import tqdm
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import fsolve
from sympy import symbols, Eq, solve
from matplotlib.pyplot import MultipleLocator
from sklearn.preprocessing import MinMaxScaler, Binarizer
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
warnings.filterwarnings("ignore")

# Load the feature matrix once; it does not change between search rounds,
# so re-reading the workbook inside the loop only added I/O.
data7 = pd.read_excel('2b数据处理结果.xlsx')

# Feature columns start at positional column 13. Rows 101-130 are left out of
# the clustering input (the reason is not visible in this notebook -- TODO confirm).
feature_frame = data7.iloc[:, 13:]
data_p1 = feature_frame.iloc[:101]
data_p2 = feature_frame.iloc[131:]
data = pd.concat([data_p1, data_p2], axis=0, ignore_index=True)

# Hyper-parameter grid for KMeans.
# Only values KMeans accepts are listed: `None` is not valid for `n_clusters`,
# `init` or `max_iter`, so those candidates could only fail inside the grid
# search (silently, because warnings are suppressed) and waste CV fits.
# `random_state=None` IS valid and is what makes repeated rounds differ.
param_grid = {
    'n_clusters': [3, 4, 5],              # candidate numbers of clusters
    'init': ['k-means++', 'random'],      # centroid initialisation strategies
    'max_iter': [1, 5, 10, 25, 50, 100],  # iteration caps
    'random_state': [None, 0, 1, 2],
}

# Number of times the whole grid search is repeated.
N_SEARCH_ROUNDS = 100

# Best result seen so far across all rounds (ranked by silhouette score).
# An empty ndarray (rather than a list) keeps `.tolist()` usable downstream
# even when no round improves on the initial score.
best_labels_best = np.array([], dtype=int)
silhouette_best = 0
ch_index_best = 0

for search_round in range(N_SEARCH_ROUNDS):
    # NOTE(review): without a `scoring` argument GridSearchCV ranks candidates
    # with KMeans.score on the held-out fold; consider a silhouette-based
    # scorer if the silhouette score is the real selection criterion.
    grid_search = GridSearchCV(estimator=KMeans(), param_grid=param_grid, cv=5)
    grid_search.fit(data)

    print("Best Parameters:", grid_search.best_params_)

    # With the default refit=True the best estimator has already been fitted
    # on the full data set, so its labels can be used directly.
    best_kmeans_model = grid_search.best_estimator_
    best_labels = best_kmeans_model.labels_

    # Internal cluster-quality metrics for this round's winner.
    silhouette = silhouette_score(data, best_labels)
    ch_index = calinski_harabasz_score(data, best_labels)

    if silhouette > silhouette_best:
        # float() works for both NumPy scalars and plain Python floats,
        # unlike `.copy()`, which only exists on NumPy scalars.
        silhouette_best = float(silhouette)
        best_labels_best = best_labels.copy()
        ch_index_best = float(ch_index)
        print(f"Clustering Algorithm: K-Means (Tuned), Clustering Labels: {best_labels}, Silhouette coefficient: {silhouette}, Calinski-Harabasz index: {ch_index}")



Best Parameters: {'init': 'k-means++', 'max_iter': 5, 'n_clusters': 5, 'random_state': 2}
Clustering Algorithm: K-Means (Tuned), Clustering Labels: [1 4 1 4 4 3 0 2 4 2 3 4 2 4 0 1 2 4 4 4 1 0 2 2 1 1 1 1 1 0 1 1 1 1 0 1 1
 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 3 1 1 3 1 0 3 3 3 1 1
 3 1 1 2 2 1 0 1 1 1 1 3 1 2 3 1 0 3 2 1 1 3 1 1 1 1 1 4 0 4 0 0 0 2 1 4 4
 0 4 0 2 2 4 4 4 3 2 3 0 2 4 4 0 1 1 1], Silhouette coefficient: 0.5951914211495353, Calinski-Harabasz index: 50.374236800495346
Best Parameters: {'init': 'k-means++', 'max_iter': 5, 'n_clusters': 5, 'random_state': 2}
Best Parameters: {'init': 'k-means++', 'max_iter': 1, 'n_clusters': 5, 'random_state': None}


KeyboardInterrupt: 

In [None]:
# Normalisation.
# The ratio columns are first converted to absolute values by multiplying them
# with the matching volume column; the numeric columns are then min-max scaled.
# `data5` must already exist in the kernel (it is not defined in the visible cells).
# NOTE: this cell writes back into `data5`, so running it twice applies the
# volume multiplication twice -- re-create `data5` before re-running.

# Volume columns used as multipliers.
hm_volume = data5['HM_volume']
ed_volume = data5['ED_volume']

# Columns whose name contains 'HM_' / 'ED_', excluding the volume columns
# themselves. They are excluded by name rather than by position so a different
# column order cannot silently rescale the wrong columns.
hm_columns = [col for col in data5.filter(like='HM_').columns if col != 'HM_volume']
ed_columns = [col for col in data5.filter(like='ED_').columns if col != 'ED_volume']

# Multiply each selected column row-wise by its volume column.
data5[hm_columns] = data5[hm_columns].mul(hm_volume, axis=0)
data5[ed_columns] = data5[ed_columns].mul(ed_volume, axis=0)

# Min-max scale every column except '性别' (sex).
scaler = MinMaxScaler()
columns_to_normalize = [col for col in data5.columns if col != '性别']
data5[columns_to_normalize] = scaler.fit_transform(data5[columns_to_normalize])

In [None]:
# Report the best clustering kept by the search loop.
print(
    "Clustering Algorithm: K-Means (Tuned), "
    f"Clustering Labels: {best_labels_best}, "
    f"Silhouette coefficient: {silhouette_best}, "
    f"Calinski-Harabasz index: {ch_index_best}"
)




In [None]:
import json

# Persist the best cluster labels as JSON so that other scripts can reload
# them with json.load().
# 0.694  -- NOTE(review): unexplained value left by the author, possibly an
# earlier silhouette score; confirm or remove.

# np.asarray(...) accepts both an ndarray and the plain list that
# `best_labels_best` starts out as, so this cell cannot fail on `.tolist()`.
# The payload gets its own name so the feature DataFrame `data` is not overwritten.
labels_payload = {"best_labels_130": np.asarray(best_labels_best).tolist()}

with open("best_labels_130.json", "w") as json_file:
    json.dump(labels_payload, json_file)

In [None]:
# Display the frame loaded from '2b数据处理结果.xlsx' for inspection.
data7