In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from skopt import BayesSearchCV

import lightgbm as lgb
from skopt import BayesSearchCV
from sklearn.metrics import accuracy_score
from lightgbm import early_stopping

import pandas as pd
import numpy as np

In [4]:
# Load data
DataStep7 = pd.read_feather('./Data/V2-DataStep7.feather')

# Define X and y
X = DataStep7.drop(columns=['Cluster'])
y = DataStep7['Cluster']

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the numeric and categorical columns
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),           # Normalize numeric columns
        ('cat', OneHotEncoder(), categorical_cols)  # One-hot encode categorical columns
    ])

# Create the complete pipeline with zero-variance feature removal
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('variance_threshold', VarianceThreshold())           # Remove zero-variance columns
])

# Fit the pipeline to the training data
pipeline.fit(X_train)

# Transform the training and test sets
X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)


# KNN

In [5]:
# Initialize the KNN model
knn = KNeighborsClassifier()

# Define the parameter grid
param_grid = {
    'n_neighbors': np.arange(10, 101, 10),        # Range of k values
    'weights': ['uniform', 'distance'],     # Weighting methods
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']  # Algorithms for finding neighbors
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', n_jobs=16)  # 10-fold cross-validation
grid_search.fit(X_train_transformed, y_train)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the model on the test set using the best found parameters
best_knn = grid_search.best_estimator_
test_score = best_knn.score(X_test_transformed, y_test)
print("Test Accuracy:", test_score)

Best Parameters: {'algorithm': 'auto', 'n_neighbors': 100, 'weights': 'distance'}
Best Score: 0.7924316541281593
Test Accuracy: 0.8002532001688001


# Logistic regression

In [6]:
# 定义参数网格
param_grid = {
    'solver': ['liblinear', 'lbfgs', 'saga'],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.05, 0.1, 0.5, 1]
}

# 创建逻辑回归模型
lr = LogisticRegression(max_iter=1000)

# 使用网格搜索进行超参数调优
grid_search = GridSearchCV(lr, param_grid, cv=10, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train_transformed, y_train)

# 输出最佳参数和准确率
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best accuracy: {grid_search.best_score_}")

# 使用最佳参数进行预测
best_model = grid_search.best_estimator_
print(f"Test accuracy: {best_model.score(X_test_transformed, y_test)}")

Best parameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
Best accuracy: 0.7976015930755976
Test accuracy: 0.8015192010128007


420 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "e:\Users\admin\miniconda3\envs\tf2\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\Users\admin\miniconda3\envs\tf2\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "e:\Users\admin\miniconda3\envs\tf2\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^

# SVM

In [13]:
param_space = {
    'kernel': ['rbf', 'sigmoid'],  # Kernel functions
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Regularization parameter (log-uniform search space)
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1]   # Gamma parameter (log-uniform search space for numerical values)
}

# Create the SVM model
svm = SVC()

# Perform Bayesian hyperparameter search
bayes_search = BayesSearchCV(
    estimator=svm,
    search_spaces=param_space,
    n_iter=50,  # Number of iterations for Bayesian search
    cv=10,      # 10-fold cross-validation
    n_jobs=-1,  # Use all available CPU cores
    scoring='accuracy',  # Optimize for accuracy
    verbose=0
)

# Fit the model
bayes_search.fit(X_train_transformed, y_train)

# Output the best parameters and accuracy
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best accuracy: {bayes_search.best_score_}")

# Use the best model to evaluate on the test set
best_model = bayes_search.best_estimator_
print(f"Test accuracy: {best_model.score(X_test_transformed, y_test)}")

Best parameters: OrderedDict([('C', 10), ('gamma', 0.001), ('kernel', 'sigmoid')])
Best accuracy: 0.7983049763700218
Test accuracy: 0.8026445350963567


# Random forest

In [2]:
import numpy as np

In [3]:
np.arange(3, 17,2)

array([ 3,  5,  7,  9, 11, 13, 15])

In [7]:
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],  # 树的数量
    'criterion': ['gini', 'entropy'],  # 分裂标准
    'max_depth': np.arange(3, 17,2),  # 最大树深
    'min_samples_leaf': np.arange(3, 17,2),  # 叶节点最小样本数
    'min_samples_split': np.arange(3, 17,2),  # 节点最小分裂数
    'max_features': [None, 'sqrt', 'log2']  # 分裂时的最大特征数
}

# 创建随机森林模型
rf = RandomForestClassifier()

# 使用贝叶斯优化进行超参数调优
opt = BayesSearchCV(
    estimator=rf,
    search_spaces=param_grid,
    n_iter=200,  # 搜索的迭代次数
    cv=10,
    n_jobs=-1,
    scoring='accuracy'
)

opt.fit(X_train_transformed, y_train)

# 输出最佳参数
print(f"Best parameters: {opt.best_params_}")
print(f"Best accuracy: {opt.best_score_}")

# 使用最佳参数进行测试集评估
best_model = opt.best_estimator_
print(f"Test accuracy: {best_model.score(X_test_transformed, y_test)}")

  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters: OrderedDict([('criterion', 'entropy'), ('max_depth', 13), ('max_features', 'sqrt'), ('min_samples_leaf', 13), ('min_samples_split', 9), ('n_estimators', 900)])
Best accuracy: 0.7982346281463144
Test accuracy: 0.8044732029821353


# LightGBM


In [8]:
# 定义参数搜索空间
param_space = {
    'n_estimators': (100, 1000),  # 设置整数范围
    'max_depth': (3, 30),         # 设置整数范围
    'subsample': (0.7, 1.0),      # 设置连续值范围
    'colsample_bytree': (0.7, 1.0),
    'learning_rate': (1e-5, 1e-1, 'log-uniform'),  # log-uniform 分布
    'num_leaves': (10, 100)       # 设置整数范围
}

# 创建 LGBM 分类模型
lgbm = lgb.LGBMClassifier(verbose=10)

# 使用贝叶斯搜索进行超参数调优
bayes_search = BayesSearchCV(
    estimator=lgbm, 
    search_spaces=param_space, 
    n_iter=200,  # 搜索的迭代次数
    cv=10,       # 使用10折交叉验证
    n_jobs=-1,   # 使用所有可用的CPU核
    scoring='accuracy',
    random_state=42
)

# 分割训练集用于早期停止
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_transformed, y_train, test_size=0.2, random_state=42
)

# 进行贝叶斯优化调参
bayes_search.fit(
    X_train_split, y_train_split,
    eval_set=[(X_val_split, y_val_split)],  # 验证集
    callbacks=[early_stopping(stopping_rounds=50)],  # 使用回调实现 early stopping
)

# 输出最佳参数
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best accuracy on CV: {bayes_search.best_score_}")

# 使用最佳参数进行测试集评估
best_model = bayes_search.best_estimator_

# 在测试集上评估模型
test_accuracy = best_model.score(X_test_transformed, y_test)
print(f"Test accuracy: {test_accuracy}")

# 也可以预测概率并计算准确率
y_test_pred = best_model.predict(X_test_transformed)
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {accuracy}")


  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Number of positive: 16006, number of negative: 6741
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000000
[LightGBM] [Debug] init for col-wise cost 0.000004 seconds, init for row-wise cost 0.000263 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000449 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2183
[LightGBM] [Info] Number of data points in the train set: 22747, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.703653 -> initscore=0.864755
[LightGBM] [Info] Start training from score 0.864755
[LightGBM] [Debug] Trained a tree with leaves = 8 and depth = 3
Training until validation scores don't improve for 50 rounds
[LightGBM] [Debug] Trained a tree with leaves = 8 and depth = 3
[LightGBM] [Debug] Trained a tree with leaves = 8 and depth = 3
[LightGBM] [Debug] Trained a tree with leaves = 8 and depth = 3
[LightGBM] [Deb