In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import BorderlineSMOTE, SVMSMOTE, KMeansSMOTE, ADASYN
from matplotlib import pyplot as plt
import warnings
warnings.simplefilter("ignore")
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
import math
from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split, cross_val_score
import re
import seaborn as sns
from scipy.stats import chi2_contingency
from subprocess import check_output
from joblib.logger import pprint
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

%matplotlib inline


# 1.0 Data load

In [None]:
# Load the full datasheet into one DataFrame; the file name is resolved
# relative to the notebook's working directory.
all_df=pd.read_excel("Datasheet.xlsx")

In [None]:
# Sanity check on the load: print (rows, columns), then display the first row
# as the cell output.
print(all_df.shape)
all_df.head(1)

# Functions

In [None]:
def data_preprocess(all_df, train_exclude=None, oob_ligands=None):
    """Split the datasheet into a training frame and an out-of-bag (OOB) frame.

    Rows are routed by the value of the 'Ligand' column. The value counts of
    both subsets are printed for inspection.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Full datasheet; must contain a 'Ligand' column.
    train_exclude : list-like, optional
        Ligand labels to remove from the training frame. When omitted, the
        notebook-level variable ``train_exclude`` is used.
    oob_ligands : list-like, optional
        Ligand labels that make up the OOB frame. When omitted, the
        notebook-level variable ``oob_ligands`` is used.

    Returns
    -------
    real_df : pandas.DataFrame
        Training rows with the first three columns removed.
    oob_all_df : pandas.DataFrame
        OOB rows with every original column kept.

    Raises
    ------
    NameError
        If a ligand list is neither passed in nor defined at notebook level.
    """
    # Fall back to the notebook globals so existing `data_preprocess(all_df)`
    # calls keep working; explicit arguments make the dependency visible.
    try:
        if train_exclude is None:
            train_exclude = globals()['train_exclude']
        if oob_ligands is None:
            oob_ligands = globals()['oob_ligands']
    except KeyError as missing:
        raise NameError(
            f"data_preprocess needs '{missing.args[0]}': pass it as an argument "
            "or define it at notebook level"
        ) from None

    ligand = all_df['Ligand']
    real_df = all_df[~ligand.isin(train_exclude)]
    oob_all_df = all_df[ligand.isin(oob_ligands)]

    print('Train Ligands:\n', real_df['Ligand'].value_counts())
    print('---------------------------------------------------------------')
    print('OOB Ligands:\n', oob_all_df['Ligand'].value_counts())
    print('---------------------------------------------------------------')

    # Drop the first three columns from the training frame only. Return an
    # independent copy so that columns assigned to it later never write
    # through to (or warn about) the original datasheet.
    real_df = real_df.iloc[:, 3:].copy()
    return real_df, oob_all_df

In [None]:
def smote_requirement(real_df, oob_all_df, smote_required=True, smote=1):
    """Label the training data by an ee threshold and optionally oversample it.

    A binary 'class' column is written into ``real_df`` in place (1 where
    'Output (ee)%' > 70, else 0). The feature matrix ``X`` is every column
    except 'class', so it still contains 'Output (ee)%'; when oversampling is
    enabled the sampler therefore synthesises that column together with the
    features.

    Parameters
    ----------
    real_df : pandas.DataFrame
        Training frame containing an 'Output (ee)%' column. Modified in place:
        a 'class' column is added (or overwritten).
    oob_all_df : pandas.DataFrame
        Out-of-bag frame; its first three columns are dropped and its last
        column is treated as the target.
    smote_required : bool, default True
        Whether to oversample.
    smote : int, default 1
        Sampler choice, only read when ``smote_required`` is true:
        1 = BorderlineSMOTE (borderline-2), 2 = SVMSMOTE, 3 = KMeansSMOTE,
        4 = ADASYN.

    Returns
    -------
    minority_df : rows of ``real_df`` labelled 0 (taken before oversampling).
    X, y : features and class labels, resampled when requested.
    oob_df, X_oob, y_oob : OOB frame without its first three columns, and its
        split into all-but-last / last column.

    Raises
    ------
    ValueError
        If oversampling is requested with an unknown ``smote`` code.
    """
    # Factories are lazy so that only the selected sampler is instantiated.
    sampler_factories = {
        1: lambda: BorderlineSMOTE(random_state=2, kind='borderline-2'),
        2: lambda: SVMSMOTE(random_state=2),
        3: lambda: KMeansSMOTE(random_state=2),
        4: lambda: ADASYN(random_state=2),
    }
    # Validate before touching the data: previously an unknown code fell
    # through every branch and crashed later on an unbound `y_res`.
    if smote_required and smote not in sampler_factories:
        raise ValueError(
            f"smote must be one of {sorted(sampler_factories)} when "
            f"smote_required is True, got {smote!r}"
        )

    real_df['class'] = np.where(real_df['Output (ee)%'] > 70, 1, 0)
    print('Real distribution (>70 is 1): \n', real_df['class'].value_counts())
    print('Real dataset: ', real_df.shape)
    minority_df = real_df[real_df['class'] == 0]

    # Select the label by name rather than "last column" so a pre-existing
    # 'class' column in another position cannot be mistaken for a feature.
    X = real_df.drop(columns=['class'])
    y = real_df['class']

    if smote_required:
        sampler = sampler_factories[smote]()
        X, y = sampler.fit_resample(X, y)
        print('SMOTE distribution (>70 is 1): \n', y.value_counts())
        print('SMOTE dataset: ', X.shape)

    oob_df = oob_all_df.iloc[:, 3:]
    print('OOB dataset: ', oob_df.shape)
    X_oob = oob_df.iloc[:, :-1]
    y_oob = oob_df.iloc[:, -1]
    return minority_df, X, y, oob_df, X_oob, y_oob



In [None]:
def data_split_scaling(X, random_state):
    """Split a frame into train/test features and targets (80/20).

    The last column of ``X`` is used as the target and all preceding columns
    as features. Despite the name, no scaling is applied here.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose last column is the target.
    random_state : int or None
        Seed forwarded to ``train_test_split`` so the partition is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    features = X.iloc[:, :-1]
    target = X.iloc[:, -1]
    # Use the caller's seed; the previous version ignored this argument and
    # read the notebook-level `random_split` variable instead.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=random_state
    )
    print('X_train: \n', X_train.shape, '\nX_test: \n', X_test.shape, '\ny_train: \n', y_train.shape, '\ny_test: \n', y_test.shape)
    return X_train, X_test, y_train, y_test

In [None]:
def xgboost_model(X_train, X_test, y_train, y_test,
                  parameters_xgb, cv, early_stop , early_stop_rounds, X_oob , y_oob, oob_all_df ):
    """Grid-search an XGBoost regressor and report train / test / OOB RMSE.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test features and targets.
    parameters_xgb : dict
        Parameter grid handed to ``GridSearchCV``.
    cv : int or CV splitter
        Cross-validation setting handed to ``GridSearchCV``.
    early_stop : bool
        When true, fit with early stopping evaluated on ``(X_test, y_test)``.
    early_stop_rounds : int
        Early-stopping patience; only used when ``early_stop`` is true.
    X_oob, y_oob
        Features and targets of the held-out ligands.
    oob_all_df : pandas.DataFrame
        Full OOB frame, row-aligned with ``X_oob``; must contain 'Ligand' and
        'Output (ee)%' for the per-ligand breakdown.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    def _rmse(y_true, y_pred):
        # Root of the plain MSE. This avoids the `squared=False` keyword, which
        # recent scikit-learn releases deprecate, and keeps a single RMSE
        # definition for every report below.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    xgb_grid = GridSearchCV(XGBRegressor(random_state=0),
                            parameters_xgb,
                            cv=cv,
                            n_jobs=-1,
                            verbose=True)
    if early_stop:
        # NOTE(review): the test split doubles as the early-stopping
        # validation set, so the test RMSE below is not fully independent.
        # NOTE(review): newer xgboost releases appear to expect
        # `early_stopping_rounds` on the estimator rather than in fit() -
        # confirm against the installed version.
        xgb_grid.fit(X_train, y_train,
                     early_stopping_rounds=early_stop_rounds,
                     eval_set=[(X_test, y_test)])
    else:
        xgb_grid.fit(X_train, y_train)
    print('Best model score: ', xgb_grid.best_score_)
    print('Best model parameters: ', xgb_grid.best_params_)

    rmse_train = _rmse(y_train, xgb_grid.predict(X_train))
    rmse_test = _rmse(y_test, xgb_grid.predict(X_test))
    print('Train RMSE: ', rmse_train)
    print('Test RMSE: ', rmse_test)

    # Held-out ligands
    prediction_oob = xgb_grid.predict(X_oob)
    rmse_oob = _rmse(y_oob, prediction_oob)
    print('OOB RMSE: ', rmse_oob)

    # Attach predictions to a copy so the caller's OOB frame is left untouched.
    oob_df_predict = oob_all_df.copy()
    oob_df_predict['prediction'] = prediction_oob

    def _ligand_rmse(group):
        # The previous version took np.sqrt of a value that was already a
        # root, reporting RMSE ** 0.5 instead of RMSE.
        return pd.Series(dict(rmse=_rmse(group['Output (ee)%'], group['prediction'])))

    print('OOB RMSE at Ligand level: \n', oob_df_predict.groupby('Ligand').apply(_ligand_rmse).reset_index())

    return xgb_grid

# Preprocessing

In [None]:
# Ligands held out of training and used only for out-of-bag (OOB) evaluation.
oob_ligands = ["L13", "L14", "L15"]
# Second name for the very same list object: every OOB ligand is also
# excluded from the training frame.
train_exclude = oob_ligands

In [None]:
# Separate the training rows from the held-out OOB ligands.
real_df, oob_all_df = data_preprocess(all_df)

In [None]:
"""
1: Borderline 2
2: SVM
3: Kmeans
4: Adasyn

"""
# Label rows by the >70 threshold and oversample with option 3 (KMeansSMOTE).
minority_df, X, y, oob_df, X_oob, y_oob = smote_requirement(
    real_df,
    oob_all_df,
    smote_required=True,
    smote=3,
)

In [None]:
# Seed for the train/test partition.
random_split = 42
X_train, X_test, y_train, y_test = data_split_scaling(
    X,
    random_state=random_split,
)

In [None]:
%%time

# Hyper-parameter grid for the XGBoost regressor; single-element lists pin a
# value, multi-element lists are searched.
parameters_xgb = {
    'gamma': [3],
    'objective': ['reg:squarederror'],
    'learning_rate': [0.03, 0.05],
    'max_depth': [5, 6, 15],
    'min_child_weight': [4],
    'subsample': [0.3, 0.5, 0.9],
    'colsample_bytree': [0.7],
    'n_estimators': [500],
}

# Number of parameter combinations in the grid.
pg = ParameterGrid(parameters_xgb)
print(len(pg))

xgb_grid = xgboost_model(
    X_train, X_test, y_train, y_test,
    parameters_xgb,
    cv=5,
    early_stop=False,
    early_stop_rounds=5,
    X_oob=X_oob,
    y_oob=y_oob,
    oob_all_df=oob_all_df,
)