In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, matthews_corrcoef
from sklearn.preprocessing import StandardScaler
from rdkit import Chem
from rdkit.Chem import AllChem
# 导入各模型
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC  # 注意：原问题中提到 SVR，但由于任务为分类，这里采用 SVC
import lightgbm as lgb
from catboost import CatBoostClassifier

In [2]:
data = pd.read_csv(r'D:\分类数据\去重催化分类内部数据集.csv', encoding='gbk')

In [3]:
def smiles_to_features(smiles_list):
    features = []
    for smi in smiles_list:
        # 跳过无效的 SMILES（例如 NaN 或 float）
        if isinstance(smi, str):
            mol = Chem.MolFromSmiles(smi)
            if mol is not None:
                fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
                features.append(fp)
            else:
                features.append(np.zeros(2048))  # 对无效的 SMILES 使用零向量
        else:
            features.append(np.zeros(2048))  # 对非字符串值使用零向量
    return np.array(features)
X = smiles_to_features(data['smiles'])

[16:14:43] Explicit valence for atom # 16 N, 4, is greater than permitted
[16:14:50] Explicit valence for atom # 21 N, 4, is greater than permitted


In [5]:
from sklearn.preprocessing import LabelEncoder
# 标签编码
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['labels'])


# 输出编码后每个整数对应的原始标签
for i, label in enumerate(label_encoder.classes_):
    print(f"编码 {i}: 标签 '{label}'")

编码 0: 标签 'α'
编码 1: 标签 'αβγδ'
编码 2: 标签 'β'
编码 3: 标签 'βδ'
编码 4: 标签 'γ'
编码 5: 标签 'δ'


In [7]:
from sklearn.utils.class_weight import compute_class_weight
# 计算类别权重
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(zip(np.unique(y), class_weights))

In [8]:
# 3. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# 标准化处理（部分模型对特征缩放敏感）
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------------------
# 2. 模型定义
# -------------------------------
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(random_state=42),
    "SVC": SVC(probability=True, random_state=42),  # 用SVC替代SVR，用于分类任务
    "KNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "MLP": MLPClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(random_state=42, verbose=0)
}

In [11]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
xgb_model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# 模型训练
xgb_model.fit(X_train_scaled, y_train)

# 在测试集上进行预测
y_pred = xgb_model.predict(X_test_scaled)

# -------------------------------
# 3. 模型评估
# -------------------------------
# 输出分类报告
print("========== XGBoost 分类报告 ==========")
print(classification_report(y_test, y_pred))

# 计算 Matthews 相关系数 (MCC)
mcc = matthews_corrcoef(y_test, y_pred)
print("XGBoost 模型 MCC:", mcc)

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.94      0.97      0.95      1571
           1       1.00      1.00      1.00        73
           2       0.90      0.65      0.76        55
           3       0.76      0.59      0.67        22
           4       0.86      0.71      0.78       144
           5       0.96      0.95      0.95      1312

    accuracy                           0.94      3177
   macro avg       0.90      0.81      0.85      3177
weighted avg       0.94      0.94      0.94      3177

XGBoost 模型 MCC: 0.9007697417514272


In [10]:


# -------------------------------
# 3. 模型训练与评估
# -------------------------------
results = []  # 用于存储各模型的MCC值

for name, model in models.items():
    # 对于大多数模型，使用标准化数据有助于提升性能
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    # 输出分类报告
    print(f"========== {name} ==========")
    print(classification_report(y_test, y_pred))
    
    # 计算 Matthews 相关系数 (MCC)
    mcc = matthews_corrcoef(y_test, y_pred)
    results.append({"Model": name, "MCC": mcc})

# -------------------------------
# 4. 输出MCC对比表
# -------------------------------
results_df = pd.DataFrame(results)
print("========== MCC Comparison Table ==========")
print(results_df)

              precision    recall  f1-score   support

           0       0.94      0.97      0.95      1571
           1       1.00      1.00      1.00        73
           2       0.89      0.62      0.73        55
           3       0.72      0.59      0.65        22
           4       0.89      0.69      0.78       144
           5       0.96      0.95      0.96      1312

    accuracy                           0.94      3177
   macro avg       0.90      0.80      0.84      3177
weighted avg       0.94      0.94      0.94      3177

              precision    recall  f1-score   support

           0       0.90      0.95      0.93      1571
           1       1.00      1.00      1.00        73
           2       0.78      0.64      0.70        55
           3       0.82      0.64      0.72        22
           4       0.90      0.60      0.72       144
           5       0.93      0.92      0.92      1312

    accuracy                           0.91      3177
   macro avg       0.89