In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import BorderlineSMOTE, SVMSMOTE, KMeansSMOTE, ADASYN
from matplotlib import pyplot as plt
import warnings
warnings.simplefilter("ignore")
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from hyperopt.pyll.stochastic import sample
import math
from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split, cross_val_score
import re
import seaborn as sns
from scipy.stats import chi2_contingency
from subprocess import check_output
from joblib.logger import pprint
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
import lightgbm as lgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

%matplotlib inline

# 1.0 Data load

In [None]:
# Load the raw datasheet; the path is relative to the notebook's working directory.
all_df = pd.read_excel(io=r"Datasheet.xlsx")

In [None]:
# Sanity check of the load: print (rows, columns), then display the first record.
print(all_df.shape)
all_df.head(n=1)

# Functions

In [None]:
def data_preprocess(all_df, exclude_ligands=None, holdout_ligands=None):
    """Split the datasheet into a training frame and an out-of-bag (OOB) frame by ligand.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Full datasheet. Must contain a ``'Ligand'`` column.
    exclude_ligands : iterable, optional
        Ligand labels to drop from the training frame. When ``None`` the
        notebook-level ``train_exclude`` is used (the original behaviour).
    holdout_ligands : iterable, optional
        Ligand labels that make up the OOB frame. When ``None`` the
        notebook-level ``oob_ligands`` is used (the original behaviour).

    Returns
    -------
    real_df : pandas.DataFrame
        Rows whose ligand is not excluded, with the first three columns
        removed. Returned as an independent copy so callers can add columns
        to it without touching ``all_df``.
    oob_all_df : pandas.DataFrame
        Rows whose ligand is in the hold-out list, with all columns kept.
    """
    # Fall back to the notebook globals only when the caller did not pass the
    # lists explicitly, so the function is usable without that hidden state.
    if exclude_ligands is None:
        exclude_ligands = train_exclude
    if holdout_ligands is None:
        holdout_ligands = oob_ligands

    ligand = all_df['Ligand']
    real_df = all_df[~ligand.isin(exclude_ligands)]
    oob_all_df = all_df[ligand.isin(holdout_ligands)]

    separator = '---------------------------------------------------------------'
    print('Train Ligands:\n', real_df.Ligand.value_counts())
    print(separator)
    print('OOB Ligands:\n', oob_all_df.Ligand.value_counts())
    print(separator)

    # Drop the first three columns from the training frame only; the OOB frame
    # keeps them. NOTE(review): presumably these are identifier/metadata
    # columns - confirm against the datasheet layout.
    real_df = real_df.iloc[:, 3:].copy()
    return real_df, oob_all_df

In [None]:
def smote_requirement(real_df, oob_all_df, smote_required=True, smote=1):
    """Label the training data by an ee threshold, optionally oversample it, and slice the OOB set.

    Parameters
    ----------
    real_df : pandas.DataFrame
        Training frame containing an ``'Output (ee)%'`` column. A binary
        ``'class'`` column (1 where ``'Output (ee)%' > 70``, else 0) is written
        into this frame in place, so the caller's object is modified.
    oob_all_df : pandas.DataFrame
        Out-of-bag frame; its first three columns are dropped here.
    smote_required : bool, default True
        When truthy, resample ``(X, y)`` with the oversampler chosen by ``smote``.
    smote : int, default 1
        Oversampler code: 1 = BorderlineSMOTE (borderline-2), 2 = SVMSMOTE,
        3 = KMeansSMOTE, 4 = ADASYN. Ignored when ``smote_required`` is falsy.

    Returns
    -------
    minority_df : pandas.DataFrame
        Rows of ``real_df`` whose ``'class'`` is 0.
    X : pandas.DataFrame
        Every column of ``real_df`` except the trailing ``'class'`` column
        (resampled when oversampling is requested).
    y : pandas.Series
        The ``'class'`` labels matching ``X``.
    oob_df : pandas.DataFrame
        ``oob_all_df`` without its first three columns.
    X_oob, y_oob
        ``oob_df`` split into all-but-last columns and the last column.

    Raises
    ------
    ValueError
        If oversampling is requested with a ``smote`` code outside 1-4.
    """
    # Factories are lazy so only the selected oversampler is ever constructed.
    samplers = {
        1: lambda: BorderlineSMOTE(random_state=2, kind='borderline-2'),
        2: lambda: SVMSMOTE(random_state=2),
        3: lambda: KMeansSMOTE(random_state=2),
        4: lambda: ADASYN(random_state=2),
    }
    # Validate before any side effect: previously an unknown code fell through
    # the if/elif chain and crashed later with an UnboundLocalError on y_res.
    if smote_required and smote not in samplers:
        raise ValueError(
            f"Unsupported smote option {smote!r}; expected one of {sorted(samplers)}"
        )

    real_df['class'] = np.where(real_df['Output (ee)%'] > 70, 1, 0)
    print('Real distribution (>70 is 1): \n', real_df['class'].value_counts())
    print('Real dataset: ', real_df.shape)
    minority_df = real_df[real_df['class'] == 0]

    # 'class' was just written as the last column, so it is the label and
    # everything before it (including 'Output (ee)%') is resampled with it.
    X = real_df.iloc[:, :-1]
    y = real_df.iloc[:, -1]

    if smote_required:
        X, y = samplers[smote]().fit_resample(X, y)
        print('SMOTE distribution (>70 is 1): \n', y.value_counts())
        print('SMOTE dataset: ', X.shape)

    oob_df = oob_all_df.iloc[:, 3:]
    print('OOB dataset: ', oob_df.shape)
    X_oob = oob_df.iloc[:, :-1]
    y_oob = oob_df.iloc[:, -1]
    return minority_df, X, y, oob_df, X_oob, y_oob



In [None]:
def data_split_scaling(X, random_state):
    """Peel the last column off ``X`` as the target and make an 80/20 train/test split.

    Despite the name, no scaling is applied here; the frame is only split.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose last column is the target and whose other columns are features.
    random_state : int
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    target = X.iloc[:, -1]
    features = X.iloc[:, :-1]
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

In [None]:
def nn_model(X_train, X_test, y_train, y_test,
             param_dist, random_state, cv, epochs, early_stop, X_oob, y_oob, oob_all_df):
    """Grid-search a small dense regression network, train it further, and report RMSEs.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Train/test features and targets.
    param_dist : dict
        Grid passed to ``GridSearchCV``; keys must be arguments of the network
        builder (``hidden_units``, ``dropout_rate``, ``learning_rate``).
    random_state : int
        Seed passed to ``tf.random.set_seed``.
    cv : int or CV splitter
        Cross-validation setting for ``GridSearchCV``.
    epochs : int
        Epochs for the additional fit performed after the grid search.
    early_stop : bool
        When truthy, the additional fit stops early once the training loss has
        not improved for 10 epochs and restores the best weights. The training
        loss is monitored because no validation data is passed to ``fit``.
        When falsy, all ``epochs`` are run.
    X_oob, y_oob
        Out-of-bag features and targets.
    oob_all_df
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(best_model, rmse_train, rmse_test, rmse_oob, best_params)`` where
        ``best_model`` is the underlying Keras model of the best estimator.
    """
    tf.random.set_seed(random_state)

    n_features = X_train.shape[1]

    def create_neural_network(hidden_units=32, dropout_rate=0.3, learning_rate=0.001):
        """Build and compile the two-hidden-layer network used by the grid search."""
        model = keras.Sequential([
            keras.layers.Dense(hidden_units, activation='relu', input_shape=(n_features,)),
            keras.layers.Dropout(dropout_rate),
            keras.layers.Dense(hidden_units // 2, activation='relu'),
            keras.layers.Dense(1)  # No activation for regression
        ])
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
        model.compile(optimizer=optimizer, loss='mse')
        return model

    def _rmse(y_true, y_pred):
        """Root-mean-squared error as the square root of sklearn's MSE."""
        # Avoids the `squared=False` keyword, which newer scikit-learn releases
        # deprecate; the value is identical for a single-output target.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    keras_regressor = KerasRegressor(build_fn=create_neural_network, verbose=0)

    grid = GridSearchCV(estimator=keras_regressor, param_grid=param_dist, cv=cv)
    grid_result = grid.fit(X_train, y_train)

    best_params = grid_result.best_params_
    best_model = grid_result.best_estimator_.model

    # Honour `early_stop`, which used to be accepted but silently ignored.
    fit_kwargs = {}
    if early_stop:
        fit_kwargs['callbacks'] = [
            keras.callbacks.EarlyStopping(monitor='loss', patience=10,
                                          restore_best_weights=True)
        ]

    # Continue training the Keras model taken from the best estimator.
    best_model.fit(X_train, y_train, epochs=epochs, batch_size=32, **fit_kwargs)

    prediction_train = best_model.predict(X_train)
    prediction = best_model.predict(X_test)

    # Variable names are kept for continuity, but these values are RMSEs.
    mse_train = _rmse(y_train, prediction_train)
    mse_test = _rmse(y_test, prediction)

    print('Train RMSE: ', mse_train)
    print('Test RMSE: ', mse_test)

    prediction_oob = best_model.predict(X_oob)

    mse_oob = _rmse(y_oob, prediction_oob)
    print('OOB RMSE: ', mse_oob)

    return best_model, mse_train, mse_test, mse_oob, best_params

# Modeling

In [None]:
# Ligands that form the out-of-bag (OOB) set.
oob_ligands = [
    'L13',
    'L14',
    'L15',
]
# Bound to the very same list object, so the OOB ligands are also the ones
# excluded from training.
train_exclude = oob_ligands

In [None]:
# Split the datasheet into the training frame and the untouched OOB frame.
real_df, oob_all_df = data_preprocess(all_df=all_df)

In [None]:
# Seeds for the repeated train/test splits: start, start + step_size, ... (< end).
start, end, step_size = 0, 1000, 10

random_num = np.arange(start, end, step_size)
print(random_num.shape[0])  # number of seeds / repetitions

## NN

### KMeans-SMOTE

In [None]:
"""
1: Borderline 2
2: SVM
3: Kmeans
4: Adasyn

"""
# Oversample the training data with option 3 (KMeans) and slice the OOB set.
(minority_df, X, y,
 oob_df, X_oob, y_oob) = smote_requirement(
    real_df,
    oob_all_df,
    smote_required=True,
    smote=3,
)

In [None]:
%%time
import tqdm.notebook as tq

columns = ['Random number', 'Train RMSE', 'Test RMSE', 'OOB RMSE', 'Hyp parameters']
kmeans_nn_100 = pd.DataFrame(columns=columns)

for i in tq.tqdm(random_num):
    X_train, X_test, y_train, y_test = data_split_scaling(X, random_state=i)
    
    param_dist = {
        'hidden_units': [32, 64],
        'dropout_rate': [0.1, 0.2,0.3],
        'learning_rate': [0.001, 0.01]
    }   

    model,mse_train, mse_test, mse_oob, parameters = nn_model(X_train, X_test, y_train, y_test, param_dist,
                                                                       random_state=i, cv = 5, epochs = 200, early_stop = False , 
                                                                       X_oob = X_oob, y_oob = y_oob, oob_all_df = oob_all_df)
    kmeans_nn_100 = kmeans_nn_100.append({
        'Random number': i,
        'Train RMSE': mse_train,
        'Test RMSE': mse_test,
        'OOB RMSE': mse_oob,  
        'Hyp parameters': parameters
    }, ignore_index=True)
    
    print("\n", i, " Done #################################################################################")

In [None]:
# Persist the per-seed KMeans results; the '100 runs' folder must already exist.
kmeans_nn_100.to_excel(excel_writer=r'100 runs/kmeans_nn_100_set2.xlsx', index=False)

### No SMOTE

In [None]:
"""
1: Borderline 2
2: SVM
3: Kmeans
4: Adasyn

"""
# Baseline without oversampling: `smote` is ignored because smote_required is False.
(minority_df, X, y,
 oob_df, X_oob, y_oob) = smote_requirement(
    real_df,
    oob_all_df,
    smote_required=False,
    smote=3,
)

In [None]:
%%time
import tqdm.notebook as tq

columns = ['Random number', 'Train RMSE', 'Test RMSE', 'OOB RMSE', 'Hyp parameters']
no_smote_nn_100 = pd.DataFrame(columns=columns)

for i in tq.tqdm(random_num):
    X_train, X_test, y_train, y_test = data_split_scaling(X, random_state=i)
    
    param_dist = {
        'hidden_units': [32, 64],
        'dropout_rate': [0.1, 0.2,0.3],
        'learning_rate': [0.001, 0.01]
    }   
  

    model,mse_train, mse_test, mse_oob, parameters = nn_model(X_train, X_test, y_train, y_test, param_dist,
                                                                       random_state=i, cv = 5, epochs = 200, early_stop = False , 
                                                                       X_oob = X_oob, y_oob = y_oob, oob_all_df = oob_all_df)
    no_smote_nn_100 = no_smote_nn_100.append({
        'Random number': i,
        'Train RMSE': mse_train,
        'Test RMSE': mse_test,
        'OOB RMSE': mse_oob,  
        'Hyp parameters': parameters
    }, ignore_index=True)
    
    print("\n", i, " Done #################################################################################")

In [None]:
# Persist the per-seed baseline results; the '100 runs' folder must already exist.
no_smote_nn_100.to_excel(excel_writer=r'100 runs/no_smote_nn_100_set2.xlsx', index=False)