In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the dataset and separate it positionally: column 0 is used as the
# target vector, every remaining column as a feature.
data = pd.read_csv('liion25.csv')
y = data.iloc[:, 0].values
X = data.iloc[:, 1:]
# Column labels are kept so the features can be named again later, after
# the matrix has been turned into a plain array.
feature_names = list(X.columns)

In [3]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Plain-array copy of the training features (no column labels), used for
# fitting the models.
X_train_val = X_train.to_numpy()

In [4]:
# Seed for every stochastic estimator below, so that a fresh
# "Restart & Run All" reproduces the same tuned models and metrics.
# Same value as the seed used for the train/test split.
RANDOM_STATE = 42

# One entry per candidate model:
#   'name'   - short label used in the results table
#   'pipe'   - scaler + classifier pipeline handed to the grid search
#   'params' - hyper-parameter grid; the 'classifier__' prefix routes each
#              key to the pipeline step named 'classifier'
model_configs = [
    {
        'name': 'GBDT',
        'pipe': Pipeline([
            ('scaler', StandardScaler()),
            # Seeded: subsample < 1.0 draws random row subsets per tree.
            ('classifier', GradientBoostingClassifier(random_state=RANDOM_STATE))
        ]),
        'params': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 5],
            'classifier__subsample': [0.8, 1.0],
            'classifier__min_samples_split': [10, 20]
        }
    },
    {
        'name': 'RF',
        'pipe': Pipeline([
            ('scaler', StandardScaler()),
            # Seeded: bootstrapping and feature sub-sampling are random.
            ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
        ]),
        'params': {
            'classifier__n_estimators': [50, 100, 150],
            'classifier__max_depth': [3, 5, 7, None],
            'classifier__max_features': ['sqrt', 'log2'],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4],
            'classifier__max_samples': [0.6, 0.8, None]
        }
    },
    {
        'name': 'SVM',
        'pipe': Pipeline([
            ('scaler', StandardScaler()),
            # probability=True enables predict_proba (needed for AUC); its
            # internal calibration shuffles the data, hence the seed.
            ('classifier', SVC(probability=True, random_state=RANDOM_STATE))
        ]),
        'params': {
            'classifier__C': [0.1, 1, 10],
            'classifier__kernel': ['rbf', 'linear'],
            'classifier__gamma': ['scale', 'auto']
        }
    },
    {
        'name': 'KNN',
        'pipe': Pipeline([
            ('scaler', StandardScaler()),
            # KNN is deterministic and takes no random_state.
            ('classifier', KNeighborsClassifier())
        ]),
        'params': {
            'classifier__n_neighbors': [3, 5, 7, 9],
            'classifier__weights': ['uniform', 'distance'],
            'classifier__p': [1, 2]
        }
    }
]

In [5]:
# Accumulators for the tuning loop: one metrics dict per model goes into
# `results`; `svm_linear_importance` stays None unless a linear-kernel SVM
# is selected.
results, svm_linear_importance = [], None

In [6]:
def evaluate(X_data, y_true, data_type, model=None):
    """Score a fitted classifier on one data split.

    Parameters
    ----------
    X_data : pandas.DataFrame or array-like
        Feature matrix. A DataFrame is converted to its underlying array,
        matching the plain array the models are fitted on.
    y_true : array-like
        True labels for the rows of ``X_data``.
    data_type : str
        Prefix for the returned metric keys, e.g. ``'Train'`` or ``'Test'``.
    model : estimator, optional
        Fitted estimator to score. When omitted, the module-level
        ``best_model`` (the most recently tuned one) is used, which keeps
        the original three-argument call working.

    Returns
    -------
    dict
        ``{data_type}_Accuracy``, ``{data_type}_Sensitivity`` (recall) and
        ``{data_type}_AUC``; the AUC is NaN when the model exposes no
        ``predict_proba``.
    """
    if model is None:
        model = best_model
    X_data = X_data.values if isinstance(X_data, pd.DataFrame) else X_data
    y_pred = model.predict(X_data)
    # Column 1 of predict_proba is used as the positive-class score.
    # NOTE(review): recall_score/roc_auc_score are called with defaults,
    # which assumes a binary target - confirm against the dataset.
    y_proba = model.predict_proba(X_data)[:, 1] if hasattr(model, 'predict_proba') else None

    return {
        f'{data_type}_Accuracy': accuracy_score(y_true, y_pred),
        f'{data_type}_Sensitivity': recall_score(y_true, y_pred),
        f'{data_type}_AUC': roc_auc_score(y_true, y_proba) if y_proba is not None else np.nan
    }


# Reset the accumulators here so this cell is idempotent: re-running it
# neither appends duplicate rows nor keeps a stale SVM importance.
results = []
svm_linear_importance = None

for config in model_configs:
    # 5-fold cross-validated grid search over this model's grid,
    # selecting by accuracy and using all available cores.
    grid = GridSearchCV(
        estimator=config['pipe'],
        param_grid=config['params'],
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    grid.fit(X_train_val, y_train)

    best_model = grid.best_estimator_

    # Only a linear-kernel SVM exposes coef_; use the normalised absolute
    # coefficients (on the scaled features) as a feature-importance proxy.
    if config['name'] == 'SVM' and grid.best_params_['classifier__kernel'] == 'linear':
        classifier = best_model.named_steps['classifier']
        importance = np.abs(classifier.coef_[0])
        total = importance.sum()
        svm_linear_importance = {
            'features': feature_names,
            # Guard against an all-zero coefficient vector (would give NaN).
            'importance': importance / total if total > 0 else importance
        }

    metrics = {
        'Model': config['name'],
        'Best_Params': grid.best_params_
    }
    metrics.update(evaluate(X_train, y_train, 'Train', model=best_model))
    metrics.update(evaluate(X_test, y_test, 'Test', model=best_model))

    results.append(metrics)

In [7]:
# Turn the list of per-model metric dicts into a table and print it as
# plain text, with each metric's Train/Test columns placed side by side.
result_df = pd.DataFrame(results)
print("\nmodel performance：")
print(
    result_df[
        ['Model', 'Best_Params']
        + [
            f'{split}_{metric}'
            for metric in ('Accuracy', 'Sensitivity', 'AUC')
            for split in ('Train', 'Test')
        ]
    ].to_string(index=False)
)



model performance：
Model                                                                                                                                                                                                Best_Params  Train_Accuracy  Test_Accuracy  Train_Sensitivity  Test_Sensitivity  Train_AUC  Test_AUC
 GBDT                                         {'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__min_samples_split': 20, 'classifier__n_estimators': 200, 'classifier__subsample': 1.0}        0.996124       0.769231           1.000000          0.808511   0.999961  0.899527
   RF {'classifier__max_depth': None, 'classifier__max_features': 'log2', 'classifier__max_samples': 0.8, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 50}        0.968992       0.876923           0.984293          0.872340   0.996288  0.920804
  SVM                                                                                 

In [8]:
# Report the linear-SVM feature ranking (largest normalised |coefficient|
# first), or explain why there is nothing to report.
if svm_linear_importance is not None:
    svm_features = svm_linear_importance['features']
    svm_importance = svm_linear_importance['importance']
    sorted_idx = np.argsort(svm_importance)[::-1]
    # Pad to the longest feature name so the value column lines up; a fixed
    # width of 15 is shorter than most of the names.
    name_width = max((len(str(name)) for name in svm_features), default=0)
    print("\n=== SVM linear importance ===")
    for idx in sorted_idx:
        print(f"{str(svm_features[idx]):<{name_width}}: {svm_importance[idx]:.4f}")
else:
    # svm_linear_importance is only filled when the grid search selects
    # the linear kernel for the SVM.
    print("\nSVM: the selected kernel is not linear, so no coefficient-based importance is available.")


=== SVM linear importance ===
MagpieData avg_dev GSvolume_pa: 0.0454
0-norm         : 0.0396
MagpieData minimum NUnfilled: 0.0302
MagpieData maximum GSvolume_pa: 0.0285
MagpieData range GSvolume_pa: 0.0210
MagpieData minimum GSvolume_pa: 0.0188
MagpieData range NValence: 0.0187
MagpieData maximum NValence: 0.0187
MagpieData mean NdUnfilled: 0.0181
mean simul. packing efficiency: 0.0173
MagpieData mean GSmagmom: 0.0169
MagpieData minimum AtomicWeight: 0.0164
MagpieData minimum Row: 0.0164
MagpieData minimum Number: 0.0164
MagpieData mean NpUnfilled: 0.0157
MagpieData avg_dev GSmagmom: 0.0154
MagpieData mean GSbandgap: 0.0140
MagpieData mean NfUnfilled: 0.0132
MagpieData avg_dev NfUnfilled: 0.0130
MagpieData avg_dev Electronegativity: 0.0127
MagpieData avg_dev GSbandgap: 0.0125
MagpieData mode SpaceGroupNumber: 0.0120
MagpieData avg_dev MeltingT: 0.0115
avg f valence electrons: 0.0111
MagpieData mean NfValence: 0.0111
2-norm         : 0.0111
mean abs simul. packing efficiency: 0.0110
Ma