In [None]:
import math
import os
import re
import warnings
from subprocess import check_output

warnings.simplefilter("ignore")

import pandas as pd
import numpy as np
from imblearn.over_sampling import BorderlineSMOTE, SVMSMOTE, KMeansSMOTE, ADASYN
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split, cross_val_score
import seaborn as sns
from scipy.stats import chi2_contingency
from joblib.logger import pprint
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

import lightgbm as lgb

%matplotlib inline

# 1.0 Data load

In [None]:
# Load the raw data sheet. The path is relative, so "Datasheet.xlsx" must be in
# the notebook's working directory.
all_df=pd.read_excel("Datasheet.xlsx")

In [None]:
# Sanity check: (rows, columns) of the loaded sheet, then a one-row preview of its columns.
print(all_df.shape)
all_df.head(1)

# Functions

In [None]:
def data_preprocess(all_df, exclude_ligands=None, holdout_ligands=None):
    """Split the raw sheet into a training frame and a held-out ("OOB") frame.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Full data sheet. Must contain a ``Ligand`` column.
    exclude_ligands : iterable, optional
        Ligand labels removed from the training frame. When omitted, the
        notebook-level variable ``train_exclude`` is used (original behaviour).
    holdout_ligands : iterable, optional
        Ligand labels that make up the OOB frame. When omitted, the
        notebook-level variable ``oob_ligands`` is used (original behaviour).

    Returns
    -------
    real_df : pandas.DataFrame
        Training rows with the first three columns dropped.
        NOTE(review): presumably those are identifier/label columns including
        ``Ligand`` -- confirm against the sheet layout.
    oob_all_df : pandas.DataFrame
        Held-out rows with every original column kept.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if exclude_ligands is None:
        exclude_ligands = train_exclude
    if holdout_ligands is None:
        holdout_ligands = oob_ligands

    ligand = all_df['Ligand']
    real_df = all_df[~ligand.isin(exclude_ligands)]
    oob_all_df = all_df[ligand.isin(holdout_ligands)]

    separator = '---------------------------------------------------------------'
    print('Train Ligands:\n', real_df.Ligand.value_counts())
    print(separator)
    print('OOB Ligands:\n', oob_all_df.Ligand.value_counts())
    print(separator)

    # Copy so that later column assignments on the result never act on a slice of all_df.
    real_df = real_df.iloc[:, 3:].copy()
    return real_df, oob_all_df

In [None]:
def smote_requirement(real_df, oob_all_df, smote_required = True,smote=1):
    """Label the training rows by ee% class and optionally oversample them.

    A binary ``class`` column is derived from ``'Output (ee)%'`` (1 when > 70,
    else 0) and used only as the resampling target. The returned ``X`` still
    contains ``'Output (ee)%'`` as its last column, so synthetic rows carry a
    synthetic regression target as well.

    Parameters
    ----------
    real_df : pandas.DataFrame
        Training frame containing ``'Output (ee)%'``. It is not modified.
    oob_all_df : pandas.DataFrame
        Held-out frame. Its first three columns are dropped and the last
        remaining column is returned as the OOB target.
    smote_required : bool
        When false, ``X``/``y`` are returned without resampling and ``smote``
        is ignored.
    smote : int
        1 = BorderlineSMOTE (borderline-2), 2 = SVMSMOTE, 3 = KMeansSMOTE,
        4 = ADASYN.

    Returns
    -------
    minority_df, X, y, oob_df, X_oob, y_oob

    Raises
    ------
    ValueError
        If resampling is requested with an unknown ``smote`` code.
    """
    # Validate before doing any work; previously an unknown code surfaced as an
    # unbound-variable error further down.
    if smote_required and smote not in (1, 2, 3, 4):
        raise ValueError(
            "smote must be 1 (Borderline-2), 2 (SVM), 3 (KMeans) or 4 (ADASYN); got %r" % (smote,)
        )

    # Work on a copy so the caller's frame does not silently gain a 'class' column.
    real_df = real_df.copy()
    real_df['class'] = np.where(real_df['Output (ee)%'] > 70, 1, 0)
    print('Real distribution (>70 is 1): \n', real_df['class'].value_counts())
    print('Real dataset: ', real_df.shape)

    minority_df = real_df[real_df['class'] == 0]
    X = real_df.iloc[:, :-1]   # everything except 'class' (target column included)
    y = real_df.iloc[:, -1]    # the 'class' label

    if smote_required:
        if smote == 1:
            sampler = BorderlineSMOTE(random_state=2, kind = 'borderline-2')
        elif smote == 2:
            sampler = SVMSMOTE(random_state=2)
        elif smote == 3:
            sampler = KMeansSMOTE(random_state=2)
        else:
            sampler = ADASYN(random_state=2)
        X, y = sampler.fit_resample(X, y)
        print('SMOTE distribution (>70 is 1): \n', y.value_counts())
        print('SMOTE dataset: ', X.shape)

    oob_df = oob_all_df.iloc[:, 3:]
    print('OOB dataset: ', oob_df.shape)
    X_oob = oob_df.iloc[:, :-1]
    y_oob = oob_df.iloc[:, -1]
    return minority_df, X, y, oob_df, X_oob, y_oob



In [None]:
def data_split_scaling(X, random_state):
    """Peel the target off the last column and make an 80/20 train/test split.

    Despite the name, no feature scaling is performed here.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose last column is the target and whose other columns are features.
    random_state : int
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    features = X.iloc[:, :-1]
    target = X.iloc[:, -1]
    feats_train, feats_test, target_train, target_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=random_state,
    )
    return feats_train, feats_test, target_train, target_test

In [None]:
def xgboost_model(X_train, X_test, y_train, y_test,
                  parameters_xgb, random_state, cv, early_stop , early_stop_rounds, X_oob , y_oob, oob_all_df ):
    """Grid-search an XGBRegressor and report train / test / OOB RMSE.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test split of the (possibly oversampled) data.
    parameters_xgb : dict
        Parameter grid handed to ``GridSearchCV``.
    random_state : int
        Seed for the regressor.
    cv : int
        Number of cross-validation folds.
    early_stop : bool
        When true, ``early_stopping_rounds`` and ``eval_set=[(X_test, y_test)]``
        are forwarded to ``fit``.
    early_stop_rounds : int
        Patience used when ``early_stop`` is true.
    X_oob, y_oob
        Held-out ligand features and targets.
    oob_all_df
        Unused; kept so existing call sites keep working.

    Returns
    -------
    tuple
        ``(xgb_grid, rmse_train, rmse_test, rmse_oob, best_params,
        prediction_oob, y_oob)``
    """
    def _rmse(y_true, y_pred):
        # sqrt(MSE) rather than mean_squared_error(..., squared=False): the
        # `squared` flag is deprecated/removed in recent scikit-learn releases.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    xgb1 = XGBRegressor(random_state=random_state)
    xgb_grid = GridSearchCV(xgb1,
                            parameters_xgb,
                            cv=cv,
                            n_jobs=-1,
                            verbose=True)
    if early_stop:
        # NOTE(review): recent XGBoost releases expect early_stopping_rounds on the
        # estimator rather than in fit(); left unchanged because the installed
        # version is not visible from here. Also note that eval_set is the test
        # split, so early stopping would tune on the data used for 'Test RMSE'.
        xgb_grid.fit(X_train, y_train, early_stopping_rounds=early_stop_rounds, eval_set=[(X_test, y_test)])
    else:
        xgb_grid.fit(X_train, y_train)
    print('Best model score: ', xgb_grid.best_score_)
    print('Best model parameters: ', xgb_grid.best_params_)

    # Predictions come from the best estimator refit by GridSearchCV.
    mse_train = _rmse(y_train, xgb_grid.predict(X_train))
    mse_test = _rmse(y_test, xgb_grid.predict(X_test))
    print('Train RMSE: ', mse_train)
    print('Test RMSE: ', mse_test)

    prediction_oob = xgb_grid.predict(X_oob)
    mse_oob = _rmse(y_oob, prediction_oob)
    print('OOB RMSE: ', mse_oob)

    return xgb_grid, mse_train, mse_test, mse_oob, xgb_grid.best_params_, prediction_oob, y_oob

In [None]:
def rf_model(X_train, X_test, y_train, y_test,
                  parameters_rf, random_state, cv, early_stop , early_stop_rounds, X_oob , y_oob, oob_all_df ):
    """Grid-search a RandomForestRegressor and report train / test / OOB RMSE.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test split of the (possibly oversampled) data.
    parameters_rf : dict
        Parameter grid handed to ``GridSearchCV``.
    random_state : int
        Seed for the forest.
    cv : int
        Number of cross-validation folds.
    early_stop, early_stop_rounds
        Accepted for signature parity with the boosting helpers. A random
        forest has no early stopping, so a true ``early_stop`` only prints a
        notice and a normal fit is performed.
    X_oob, y_oob
        Held-out ligand features and targets.
    oob_all_df
        Unused; kept so existing call sites keep working.

    Returns
    -------
    tuple
        ``(rf_grid, rmse_train, rmse_test, rmse_oob, best_params)``
    """
    def _rmse(y_true, y_pred):
        # sqrt(MSE) rather than mean_squared_error(..., squared=False): the
        # `squared` flag is deprecated/removed in recent scikit-learn releases.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    rf1 = RandomForestRegressor(random_state=random_state)
    rf_grid = GridSearchCV(rf1,
                           parameters_rf,
                           cv=cv,
                           n_jobs=-1,
                           verbose=True)
    if early_stop:
        # RandomForestRegressor.fit() accepts neither early_stopping_rounds nor
        # eval_set; forwarding them (as before) could only raise a TypeError.
        print('early_stop is not supported by RandomForestRegressor; fitting without it.')
    rf_grid.fit(X_train, y_train)
    print('Best model score: ', rf_grid.best_score_)
    print('Best model parameters: ', rf_grid.best_params_)

    # Predictions come from the best estimator refit by GridSearchCV.
    mse_train = _rmse(y_train, rf_grid.predict(X_train))
    mse_test = _rmse(y_test, rf_grid.predict(X_test))
    print('Train RMSE: ', mse_train)
    print('Test RMSE: ', mse_test)

    prediction_oob = rf_grid.predict(X_oob)
    mse_oob = _rmse(y_oob, prediction_oob)
    print('OOB RMSE: ', mse_oob)

    return rf_grid, mse_train, mse_test, mse_oob, rf_grid.best_params_

In [None]:
def lgb_model(X_train, X_test, y_train, y_test,
                  parameters_lgb, random_state, cv, early_stop , early_stop_rounds, X_oob , y_oob, oob_all_df ):
    """Grid-search an LGBMRegressor and report train / test / OOB RMSE.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test split of the (possibly oversampled) data.
    parameters_lgb : dict
        Parameter grid handed to ``GridSearchCV``.
    random_state : int
        Seed for the regressor.
    cv : int
        Number of cross-validation folds.
    early_stop : bool
        When true, fit with an early-stopping callback evaluated on
        ``(X_test, y_test)``.
    early_stop_rounds : int
        Patience of the early-stopping callback.
    X_oob, y_oob
        Held-out ligand features and targets.
    oob_all_df
        Unused; kept so existing call sites keep working.

    Returns
    -------
    tuple
        ``(lgb_grid, rmse_train, rmse_test, rmse_oob, best_params)``
    """
    def _rmse(y_true, y_pred):
        # sqrt(MSE) rather than mean_squared_error(..., squared=False): the
        # `squared` flag is deprecated/removed in recent scikit-learn releases.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    lgb1 = lgb.LGBMRegressor(random_state=random_state, verbose=-1)
    lgb_grid = GridSearchCV(lgb1,
                            parameters_lgb,
                            cv=cv,
                            n_jobs=-1,
                            verbose=True)
    if early_stop:
        # Use the callback API: the early_stopping_rounds fit() keyword is no
        # longer accepted by newer LightGBM releases, while the callback form
        # works on old and new versions alike.
        # NOTE(review): LightGBM reportedly does not early-stop with
        # boosting_type='dart' (the grids in this notebook) -- confirm.
        # eval_set is the test split, so stopping would tune on the data used
        # for 'Test RMSE'.
        lgb_grid.fit(X_train, y_train,
                     eval_set=[(X_test, y_test)],
                     callbacks=[lgb.early_stopping(stopping_rounds=early_stop_rounds)])
    else:
        lgb_grid.fit(X_train, y_train)
    print('Best model score: ', lgb_grid.best_score_)
    print('Best model parameters: ', lgb_grid.best_params_)

    # Predictions come from the best estimator refit by GridSearchCV.
    mse_train = _rmse(y_train, lgb_grid.predict(X_train))
    mse_test = _rmse(y_test, lgb_grid.predict(X_test))
    print('Train RMSE: ', mse_train)
    print('Test RMSE: ', mse_test)

    prediction_oob = lgb_grid.predict(X_oob)
    mse_oob = _rmse(y_oob, prediction_oob)
    print('OOB RMSE: ', mse_oob)

    return lgb_grid, mse_train, mse_test, mse_oob, lgb_grid.best_params_

# Modeling

In [None]:
# Ligands held out as the out-of-bag (OOB) evaluation set. data_preprocess reads
# both names as globals; train_exclude is an alias of the same list object.
oob_ligands = ['L13','L14', 'L15']
train_exclude = oob_ligands 

In [None]:
# Build the training frame (OOB ligands removed, first three columns dropped)
# and the full-width OOB frame.
real_df, oob_all_df = data_preprocess(all_df)

In [None]:
# Seeds for the repeated train/test splits: np.arange(start, end, step_size).
# NOTE(review): with step_size equal to end this yields a single seed (0), although
# the result frames/files below are labelled "100"; step_size = 10 would give 100
# seeds -- confirm which is intended.
start = 0
end = 1000 #1000
step_size = 1000

random_num = np.arange(start,end,step_size)
print(len(random_num))

## XGBoost

### Kmeans

In [None]:
# SMOTE variant codes understood by smote_requirement:
#   1 = Borderline-2, 2 = SVM, 3 = KMeans, 4 = ADASYN
minority_df, X, y, oob_df, X_oob, y_oob = smote_requirement(
    real_df,
    oob_all_df,
    smote_required=True,
    smote=3,  # KMeans-SMOTE
)

In [None]:
%%time
import tqdm.notebook as tq

columns = ['Random number', 'Train RMSE', 'Test RMSE', 'OOB RMSE', 'Hyp parameters']

# Search grid; it does not depend on the seed, so it is built once.
parameters_xgb = {'gamma':[0.5, 3],
          'objective':['reg:squarederror'],
          'learning_rate': [.03, 0.05],
          'max_depth': [5, 6],
          'min_child_weight': [4],
          'subsample': [0.3],
          'colsample_bytree': [0.7, 1.0],
          'n_estimators': [500],
          'reg_alpha': [ 0.5],
          'reg_lambda': [0]}

# Collect one record per seed and build the frame once at the end
# (DataFrame.append is gone in pandas 2.x and was quadratic anyway).
rows = []
for i in tq.tqdm(random_num):
    X_train, X_test, y_train, y_test = data_split_scaling(X, random_state=i)

    # xgboost_model returns more than five values (it also hands back the OOB
    # predictions and targets); the extras are not needed here.
    xgb_grid, mse_train, mse_test, mse_oob, parameters, *_ = xgboost_model(
        X_train, X_test, y_train, y_test, parameters_xgb,
        random_state=i, cv = 5, early_stop = False ,early_stop_rounds = 5 ,
        X_oob = X_oob, y_oob = y_oob, oob_all_df = oob_all_df)
    rows.append({
        'Random number': i,
        'Train RMSE': mse_train,
        'Test RMSE': mse_test,
        'OOB RMSE': mse_oob,
        'Hyp parameters': parameters
    })

    print("\n", i, " Done #################################################################################")

kmeans_xgb_100 = pd.DataFrame(rows, columns=columns)

In [None]:
# Preview the first per-seed result rows.
kmeans_xgb_100.head()

In [None]:
# Summary statistics of the collected metrics across seeds.
kmeans_xgb_100.describe()

In [None]:
# to_excel does not create missing folders, so make sure the target exists
# before writing the results of the (expensive) search.
os.makedirs('100 runs', exist_ok=True)
kmeans_xgb_100.to_excel(r'100 runs/kmeans_xgb_100_set2.xlsx', index=False)

### No SMOTE

In [None]:
# SMOTE variant codes understood by smote_requirement:
#   1 = Borderline-2, 2 = SVM, 3 = KMeans, 4 = ADASYN
# Baseline without oversampling: the smote code is passed but not used here.
minority_df, X, y, oob_df, X_oob, y_oob = smote_requirement(
    real_df,
    oob_all_df,
    smote_required=False,
    smote=3,
)

In [None]:
%%time
import tqdm.notebook as tq

columns = ['Random number', 'Train RMSE', 'Test RMSE', 'OOB RMSE', 'Hyp parameters']

# Search grid; it does not depend on the seed, so it is built once.
parameters_xgb = {'gamma':[0.5, 3],
          'objective':['reg:squarederror'],
          'learning_rate': [.03, 0.05],
          'max_depth': [5, 6],
          'min_child_weight': [4],
          'subsample': [0.3],
          'colsample_bytree': [0.7, 1.0],
          'n_estimators': [500],
          'reg_alpha': [ 0.5],
          'reg_lambda': [0]}

# Collect one record per seed and build the frame once at the end
# (DataFrame.append is gone in pandas 2.x and was quadratic anyway).
rows = []
for i in tq.tqdm(random_num):
    X_train, X_test, y_train, y_test = data_split_scaling(X, random_state=i)

    # xgboost_model returns more than five values (it also hands back the OOB
    # predictions and targets); the extras are not needed here.
    xgb_grid, mse_train, mse_test, mse_oob, parameters, *_ = xgboost_model(
        X_train, X_test, y_train, y_test, parameters_xgb,
        random_state=i, cv = 5, early_stop = False ,early_stop_rounds = 5 ,
        X_oob = X_oob, y_oob = y_oob, oob_all_df = oob_all_df)
    rows.append({
        'Random number': i,
        'Train RMSE': mse_train,
        'Test RMSE': mse_test,
        'OOB RMSE': mse_oob,
        'Hyp parameters': parameters
    })

    print("\n", i, " Done #################################################################################")

no_smote_xgb_100 = pd.DataFrame(rows, columns=columns)

In [None]:
# Preview the first per-seed result rows.
no_smote_xgb_100.head()

In [None]:
# Summary statistics of the collected metrics across seeds.
no_smote_xgb_100.describe()

In [None]:
# to_excel does not create missing folders, so make sure the target exists
# before writing the results of the (expensive) search.
os.makedirs('100 runs', exist_ok=True)
no_smote_xgb_100.to_excel(r'100 runs/no_smote_xgb_100_set2.xlsx', index=False)

## Random Forest

### Kmeans

In [None]:
# SMOTE variant codes understood by smote_requirement:
#   1 = Borderline-2, 2 = SVM, 3 = KMeans, 4 = ADASYN
minority_df, X, y, oob_df, X_oob, y_oob = smote_requirement(
    real_df,
    oob_all_df,
    smote_required=True,
    smote=3,  # KMeans-SMOTE
)

In [None]:
%%time
import tqdm.notebook as tq

columns = ['Random number', 'Train RMSE', 'Test RMSE', 'OOB RMSE', 'Hyp parameters']

# Search grid; it does not depend on the seed, so it is built once.
param_grid_rf = {
    'n_estimators': [100, 500],
    'max_depth': [5, 10],
    'min_samples_split': [ 10],
    'min_samples_leaf': [ 2, 4],
    'max_features': ['sqrt'],
    'bootstrap': [True, False]
}

# Collect one record per seed and build the frame once at the end
# (DataFrame.append is gone in pandas 2.x and was quadratic anyway).
rows = []
for i in tq.tqdm(random_num):
    X_train, X_test, y_train, y_test = data_split_scaling(X, random_state=i)

    rf_grid, mse_train, mse_test, mse_oob, parameters = rf_model(
        X_train, X_test, y_train, y_test, param_grid_rf,
        random_state=i, cv = 5, early_stop = False ,early_stop_rounds = 5 ,
        X_oob = X_oob, y_oob = y_oob, oob_all_df = oob_all_df)
    rows.append({
        'Random number': i,
        'Train RMSE': mse_train,
        'Test RMSE': mse_test,
        'OOB RMSE': mse_oob,
        'Hyp parameters': parameters
    })

    print("\n", i, " Done #################################################################################")

kmeans_rf_100 = pd.DataFrame(rows, columns=columns)

In [None]:
# Preview the first per-seed result rows.
kmeans_rf_100.head()

In [None]:
# Summary statistics of the collected metrics across seeds.
kmeans_rf_100.describe()

In [None]:
# to_excel does not create missing folders, so make sure the target exists
# before writing the results of the (expensive) search.
os.makedirs('100 runs', exist_ok=True)
kmeans_rf_100.to_excel(r'100 runs/kmeans_rf_100_set2.xlsx', index=False)

### No SMOTE

In [None]:
# SMOTE variant codes understood by smote_requirement:
#   1 = Borderline-2, 2 = SVM, 3 = KMeans, 4 = ADASYN
# Baseline without oversampling: the smote code is passed but not used here.
minority_df, X, y, oob_df, X_oob, y_oob = smote_requirement(
    real_df,
    oob_all_df,
    smote_required=False,
    smote=3,
)

In [None]:
%%time
import tqdm.notebook as tq

columns = ['Random number', 'Train RMSE', 'Test RMSE', 'OOB RMSE', 'Hyp parameters']

# Search grid; it does not depend on the seed, so it is built once.
param_grid_rf = {
    'n_estimators': [100, 500],
    'max_depth': [5, 10],
    'min_samples_split': [ 10],
    'min_samples_leaf': [ 2, 4],
    'max_features': ['sqrt'],
    'bootstrap': [True, False]
}

# Collect one record per seed and build the frame once at the end
# (DataFrame.append is gone in pandas 2.x and was quadratic anyway).
rows = []
for i in tq.tqdm(random_num):
    X_train, X_test, y_train, y_test = data_split_scaling(X, random_state=i)

    rf_grid, mse_train, mse_test, mse_oob, parameters = rf_model(
        X_train, X_test, y_train, y_test, param_grid_rf,
        random_state=i, cv = 5, early_stop = False ,early_stop_rounds = 5 ,
        X_oob = X_oob, y_oob = y_oob, oob_all_df = oob_all_df)
    rows.append({
        'Random number': i,
        'Train RMSE': mse_train,
        'Test RMSE': mse_test,
        'OOB RMSE': mse_oob,
        'Hyp parameters': parameters
    })

    print("\n", i, " Done #################################################################################")

no_smote_rf_100 = pd.DataFrame(rows, columns=columns)

In [None]:
# Preview the first per-seed result rows.
no_smote_rf_100.head()

In [None]:
# Summary statistics of the collected metrics across seeds.
no_smote_rf_100.describe()

In [None]:
# to_excel does not create missing folders, so make sure the target exists
# before writing the results of the (expensive) search.
os.makedirs('100 runs', exist_ok=True)
no_smote_rf_100.to_excel(r'100 runs/no_smote_rf_100_set2.xlsx', index=False)

## LGBM

### Kmeans

In [None]:
# SMOTE variant codes understood by smote_requirement:
#   1 = Borderline-2, 2 = SVM, 3 = KMeans, 4 = ADASYN
minority_df, X, y, oob_df, X_oob, y_oob = smote_requirement(
    real_df,
    oob_all_df,
    smote_required=True,
    smote=3,  # KMeans-SMOTE
)

In [None]:
%%time
import tqdm.notebook as tq

columns = ['Random number', 'Train RMSE', 'Test RMSE', 'OOB RMSE', 'Hyp parameters']

# Search grid; it does not depend on the seed, so it is built once.
param_grid_lgb = {
    'boosting_type': ['dart'],
    'num_leaves': [10,31],
    'learning_rate': [0.05, 0.3],
    'subsample': [0.3,0.8],
    'colsample_bytree': [0.5,0.8],
    'reg_alpha': [0.5],
    'reg_lambda': [0,0.5],
    'n_estimators': [500],
}

# Collect one record per seed and build the frame once at the end
# (DataFrame.append is gone in pandas 2.x and was quadratic anyway).
rows = []
for i in tq.tqdm(random_num):
    X_train, X_test, y_train, y_test = data_split_scaling(X, random_state=i)

    lgb_grid, mse_train, mse_test, mse_oob, parameters = lgb_model(
        X_train, X_test, y_train, y_test, param_grid_lgb,
        random_state=i, cv = 5, early_stop = False ,early_stop_rounds = 5 ,
        X_oob = X_oob, y_oob = y_oob, oob_all_df = oob_all_df)
    rows.append({
        'Random number': i,
        'Train RMSE': mse_train,
        'Test RMSE': mse_test,
        'OOB RMSE': mse_oob,
        'Hyp parameters': parameters
    })

    print("\n", i, " Done #################################################################################")

kmeans_lgb_100 = pd.DataFrame(rows, columns=columns)

In [None]:
# Preview the first per-seed result rows.
kmeans_lgb_100.head()

In [None]:
# Summary statistics of the collected metrics across seeds.
kmeans_lgb_100.describe()

In [None]:
# to_excel does not create missing folders, so make sure the target exists
# before writing the results of the (expensive) search.
os.makedirs('100 runs', exist_ok=True)
kmeans_lgb_100.to_excel(r'100 runs/kmeans_lgb_100_set2.xlsx', index=False)

### No SMOTE

In [None]:
# SMOTE variant codes understood by smote_requirement:
#   1 = Borderline-2, 2 = SVM, 3 = KMeans, 4 = ADASYN
# Baseline without oversampling: the smote code is passed but not used here.
minority_df, X, y, oob_df, X_oob, y_oob = smote_requirement(
    real_df,
    oob_all_df,
    smote_required=False,
    smote=3,
)

In [None]:
%%time
import tqdm.notebook as tq

columns = ['Random number', 'Train RMSE', 'Test RMSE', 'OOB RMSE', 'Hyp parameters']

# Search grid; it does not depend on the seed, so it is built once.
# NOTE(review): reg_alpha is [0, 0.5] here but [0.5] in the KMeans LGBM run --
# confirm the two grids are meant to differ.
param_grid_lgb = {
    'boosting_type': ['dart'],
    'num_leaves': [10,31],
    'learning_rate': [0.05, 0.3],
    'subsample': [0.3,0.8],
    'colsample_bytree': [0.5,0.8],
    'reg_alpha': [0,0.5],
    'reg_lambda': [0,0.5],
    'n_estimators': [500],
}

# Collect one record per seed and build the frame once at the end
# (DataFrame.append is gone in pandas 2.x and was quadratic anyway).
rows = []
for i in tq.tqdm(random_num):
    X_train, X_test, y_train, y_test = data_split_scaling(X, random_state=i)

    lgb_grid, mse_train, mse_test, mse_oob, parameters = lgb_model(
        X_train, X_test, y_train, y_test, param_grid_lgb,
        random_state=i, cv = 5, early_stop = False ,early_stop_rounds = 5 ,
        X_oob = X_oob, y_oob = y_oob, oob_all_df = oob_all_df)
    rows.append({
        'Random number': i,
        'Train RMSE': mse_train,
        'Test RMSE': mse_test,
        'OOB RMSE': mse_oob,
        'Hyp parameters': parameters
    })

    print("\n", i, " Done #################################################################################")

no_smote_lgb_100 = pd.DataFrame(rows, columns=columns)

In [None]:
# Preview the first per-seed result rows.
no_smote_lgb_100.head()

In [None]:
# Summary statistics of the collected metrics across seeds.
no_smote_lgb_100.describe()

In [None]:
# to_excel does not create missing folders, so make sure the target exists
# before writing the results of the (expensive) search.
os.makedirs('100 runs', exist_ok=True)
no_smote_lgb_100.to_excel(r'100 runs/no_smote_lgb_100_set2.xlsx', index=False)

# Predicted values for best run

In [None]:
# Re-run the KMeans-SMOTE + XGBoost pipeline for one fixed seed and keep the
# OOB predictions together with the true OOB targets.
oob_ligands = ['L13', 'L14', 'L15']
train_exclude = oob_ligands

real_df, oob_all_df = data_preprocess(all_df)

random_num = [660]

# SMOTE variant codes understood by smote_requirement:
#   1 = Borderline-2, 2 = SVM, 3 = KMeans, 4 = ADASYN
minority_df, X, y, oob_df, X_oob, y_oob = smote_requirement(
    real_df,
    oob_all_df,
    smote_required=True,
    smote=3,
)

columns = ['Random number', 'Train RMSE', 'Test RMSE', 'OOB RMSE', 'Hyp parameters']
# Note: this resets kmeans_xgb_100 to an empty frame; nothing is added to it in this cell.
kmeans_xgb_100 = pd.DataFrame(columns=columns)

for i in random_num:
    X_train, X_test, y_train, y_test = data_split_scaling(X, random_state=i)

    # Single-point grid: one fixed configuration, still scored with 5-fold CV.
    parameters_xgb = dict(
        gamma=[0.5],
        objective=['reg:squarederror'],
        learning_rate=[0.03],
        max_depth=[6],
        min_child_weight=[4],
        subsample=[0.3],
        colsample_bytree=[0.7],
        n_estimators=[500],
        reg_alpha=[0.5],
        reg_lambda=[0],
    )
    (xgb_grid, mse_train, mse_test, mse_oob,
     parameters, prediction_oob, y_oob) = xgboost_model(
        X_train, X_test, y_train, y_test, parameters_xgb,
        random_state=i, cv=5, early_stop=False, early_stop_rounds=5,
        X_oob=X_oob, y_oob=y_oob, oob_all_df=oob_all_df,
    )


# Shapley (SHAP) analysis

In [None]:
# Same held-out ligands as in the modeling section; data_preprocess reads both
# names as globals, and train_exclude is an alias of the same list object.
oob_ligands = ['L13','L14', 'L15']
train_exclude = oob_ligands 

In [None]:
# Rebuild the training frame and the full-width OOB frame for the SHAP section.
real_df, oob_all_df = data_preprocess(all_df)

In [None]:
# Seed grid 0, 10, ..., 990 (100 values).
# NOTE(review): random_num is replaced by [660] in the SHAP refit cell below, so this
# grid appears unused in this section -- confirm whether the cell is still needed.
start = 0
end = 1000 #1000
step_size = 10

random_num = np.arange(start,end,step_size)
print(len(random_num))

In [None]:
# SMOTE variant codes understood by smote_requirement:
#   1 = Borderline-2, 2 = SVM, 3 = KMeans, 4 = ADASYN
minority_df, X, y, oob_df, X_oob, y_oob = smote_requirement(
    real_df,
    oob_all_df,
    smote_required=True,
    smote=3,  # KMeans-SMOTE
)

In [None]:
%%time
import tqdm.notebook as tq

columns = ['Random number', 'Train RMSE', 'Test RMSE', 'OOB RMSE', 'Hyp parameters']
results = []
random_num = [660]
# One explicit seed for both the split and the model. Previously the split used a
# loop variable `i` left over from an earlier cell, and the whole list was passed
# as random_state.
seed = random_num[0]

X_train, X_test, y_train, y_test = data_split_scaling(X, random_state=seed)

# Single-point grid: one fixed configuration, still scored with 5-fold CV.
parameters_xgb = {
    'gamma':[0.5],
    'objective':['reg:squarederror'],
    'learning_rate': [0.03], #so called `eta` value
    'max_depth': [6],
    'min_child_weight': [4],
    'subsample': [0.3],
    'colsample_bytree': [0.7],
    'n_estimators': [500],
    'reg_alpha': [ 0.5],
    'reg_lambda': [0]
}

# xgboost_model returns more than five values (it also hands back the OOB
# predictions and targets); the extras are not needed here.
xgb_grid, mse_train, mse_test, mse_oob, parameters, *_ = xgboost_model(
    X_train, X_test, y_train, y_test, parameters_xgb,
    random_state=seed, cv=5, early_stop=False, early_stop_rounds=5,
    X_oob=X_oob, y_oob=y_oob, oob_all_df=oob_all_df)

results.append({
    'Random number': seed,
    'Train RMSE': mse_train,
    'Test RMSE': mse_test,
    'OOB RMSE': mse_oob,
    'Hyp parameters': parameters
})

kmeans_xgb_100 = pd.DataFrame(results, columns=columns)

In [None]:
import shap
# Fit the fixed configuration on the current split and explain it with SHAP.
# X_train / X_test / y_train / y_test, X_oob / y_oob and random_num come from the
# cells above.
columns = ['Random number', 'Train RMSE', 'Test RMSE', 'OOB RMSE', 'Hyp parameters']
seed = random_num[0]
results = []

parameters_xgb = {
    'gamma':[0.5],
    'objective':['reg:squarederror'],
    'learning_rate': [0.03],
    'max_depth': [6],
    'min_child_weight': [4],
    'subsample': [0.3],
    'colsample_bytree': [0.7],
    'n_estimators': [500],
    'reg_alpha': [ 0.5],
    'reg_lambda': [0]
}


xgb_regressor = xgb.XGBRegressor(random_state=seed)

grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=parameters_xgb,
                           scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_xgb = grid_search.best_estimator_

print("Best parameters found: ", grid_search.best_params_)

y_train_pred = best_xgb.predict(X_train)
y_test_pred = best_xgb.predict(X_test)

# sqrt(MSE) rather than squared=False: the flag is deprecated/removed in recent
# scikit-learn releases.
mse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
mse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
# Score the OOB ligands with the model fitted in this cell instead of reusing a
# value left behind by a different cell/model.
mse_oob = np.sqrt(mean_squared_error(y_oob, best_xgb.predict(X_oob)))
print(f"Train RMSE: {mse_train}")
print(f"Test RMSE: {mse_test}")
print(f"OOB RMSE: {mse_oob}")

explainer = shap.Explainer(best_xgb, X_train)
shap_values = explainer(X_test)

plt.figure(figsize=(10, 7))
shap.summary_plot(shap_values, X_test, show=False)

results.append({
    'Random number': seed,
    'Train RMSE': mse_train,
    'Test RMSE': mse_test,
    'OOB RMSE': mse_oob,
    'Hyp parameters': grid_search.best_params_
})

kmeans_xgb_100 = pd.DataFrame(results, columns=columns)
kmeans_xgb_100