In [None]:
import pandas as pd
from torch.utils.data import DataLoader
from xgboost import XGBClassifier 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import balanced_accuracy_score
import matplotlib.pyplot as plt

In [None]:
# Data loading: read the combined descriptor/fingerprint table, build the feature
# matrix and label vector, and make one stratified 80/20 train/test split.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_csv(r'''D:\A309-21-\课题\DILI Predict\DILI数据+模型\汇总-DILI\train\RDKit-ECFP_4data.csv''', low_memory=False)
label_column = 'DILIst.1'
# Features are every column from position 3 onward. The target column is dropped
# from that slice when it is present, so the label can never be fed to the model
# as a feature; errors='ignore' leaves the slice untouched when the target lies
# outside it.
# NOTE(review): the '.1' suffix suggests the CSV holds a second label-like column
# (e.g. 'DILIst') — confirm it is not located at position 3 or later as well.
X = data.iloc[:, 3:].drop(columns=[label_column], errors='ignore').values
y = data.loc[:, label_column].values
# `data_loader` and `dataset` are not used by the split below (X and y come from
# `data`); they are kept only because cells outside this view may reference them.
# NOTE(review): the DataLoader wraps the raw DataFrame directly — verify it is needed.
data_loader = DataLoader(dataset=data, batch_size=3, shuffle=True, num_workers=0, drop_last=False)
# Seeded so that the shuffled copy is identical on every run.
dataset = data.sample(frac=1, random_state=78).reset_index(drop=True)
# Stratified split with a fixed random_state so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=78, shuffle=True)

In [None]:
# Hyperparameter grid for the exhaustive search.
# NOTE(review): 10 * 5 * 16 * 6 * 5 * 5 * 5 = 600,000 combinations, i.e. 3,000,000
# model fits with cv=5 — consider a coarser grid or a randomized search.
param_grid = {
    'max_depth':range(10,60,5), 
    'learning_rate':[0.0001, 0.001, 0.01, 0.1, 0.3],       
    'n_estimators':range(80,160,5),            
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],       
    'gamma':[0.05, 0.1, 0.3, 0.5, 1],  
    'reg_alpha':[0, 0.0001, 0.001, 0.01, 0.1], 
    'reg_lambda':[0, 0.0001, 0.001, 0.01, 0.1]  
}
# NOTE(review): both random_state=78 and seed=1 are passed; which one takes effect
# may depend on the installed xgboost version — confirm and keep only one.
XGBoost = XGBClassifier(objective='binary:logistic', booster='gbtree', random_state=78, seed=1)
grid_search = GridSearchCV(estimator=XGBoost, param_grid=param_grid, scoring='precision_weighted', cv=5)
# Run the search on the training split, then report the best parameters and score.
grid_search.fit(X_train, y_train)
print("Best parameters: ", grid_search.best_params_)
print("Best precision weighted score: ", grid_search.best_score_)
# Cross-validate the selected estimator on the training split.
best_XGBoost = grid_search.best_estimator_
best_XGBoost_s = cross_val_score(best_XGBoost, X_train, y_train, cv=5, scoring="precision_weighted").mean()
# Fixed: this line previously formatted the undefined name `best_rfc_s`, which
# raised NameError; the score computed on the line above is the intended value.
print("Best XGBoost after GridSearch:{}".format(best_XGBoost_s))

# Evaluation on the held-out test set.
# GridSearchCV's default refit has already fitted best_estimator_ on the training
# split; the explicit fit is kept so this section does not rely on that.
best_XGBoost.fit(X_train, y_train)
# Probability of the positive class, used for the ROC AUC.
y_score_test = best_XGBoost.predict_proba(X_test)[:, 1] 
# Hard predictions, coerced to 0/1 integers.
y_pred_test = (best_XGBoost.predict(X_test) > 0.5).astype("int32")
TN, FP, FN, TP = confusion_matrix(y_test, y_pred_test).ravel()
SE = TP / (TP + FN)    # sensitivity (recall of the positive class)
SP = TN / (TN + FP)    # specificity
ACC = (TP + TN) / (TP + TN + FP + FN)                    
F1 = (TP / (2 * TP + FN + FP)) * 2
# The MCC denominator is computed in floating point so the four-way product cannot
# overflow a fixed-width integer, and a zero denominator (the model predicted a
# single class) yields 0.0 instead of nan.
mcc_denominator = (float(TP + FP) * float(TP + FN) * float(TN + FP) * float(TN + FN)) ** 0.5
MCC = (TP * TN - FP * FN) / mcc_denominator if mcc_denominator > 0 else 0.0
AUC = roc_auc_score(y_test, y_score_test)
BA = balanced_accuracy_score(y_test, y_pred_test)
# One-row summary table of the test-set metrics.
LR = pd.DataFrame({'SE': [SE], 'SP': [SP], 'ACC': [ACC], 'AUC': [AUC], 'F1': [F1], 'MCC': [MCC], 'BA': [BA]})
print(LR)
