In [25]:
import pandas as pd
import numpy as np
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [38]:
df = pd.ExcelFile('../data/LOX-COX_VS_DPPH-NRF_VS_TEST.xlsx')
df_train_LOX = pd.read_excel(df, 'training_set_LOX')
df_train_ANT = pd.read_excel(df, 'training_set_antioxidant')
df_test = pd.read_excel(df, 'test_set')

#drop title column
df_train_LOX = df_train_LOX.drop(columns = ['Title'], axis = 1)
df_train_ANT = df_train_ANT.drop(columns = ['Title'], axis = 1)
df_test = df_test.drop(columns = ['Title'], axis = 1)

In [27]:
#replace NaN values in each file with mean of column

mean_values_LOX = df_train_LOX.mean()
mean_values_ANT = df_train_ANT.mean()

df_train_LOX = df_train_LOX.fillna(mean_values_LOX)
df_train_ANT = df_train_ANT.fillna(mean_values_ANT)

#merge LOX and ANT arrays
frames = [df_train_LOX, df_train_ANT]
merged_train = pd.concat(frames)
merged_train.shape

train_set = merged_train

In [28]:
# #join LOX and ANT to one matrix. Calculate correlation coefficient and drop columns
# #with minimum threshold
# corr_matrix = merged_train.corr() #correlation coefficient calculation
# np.fill_diagonal(corr_matrix.values, 0) # set diagonal to 0
# threshold = 0.9

# ax = plt.axes()
# sns.heatmap(corr_matrix, xticklabels=False, yticklabels=False)
# ax.set_title('Correlation Coefficient heatmap')

# corr_matrix = abs(corr_matrix) #absolute values of corr coef

# corr_matrix = corr_matrix.replace(np.nan, 0) #replace nan with 0

# corr_matrix.values[np.tril_indices(corr_matrix.shape[0], k=0)] = 0 #set values below diagonal to 0 so i dont drop both columns

# drop_columns = corr_matrix.columns[corr_matrix.max() > threshold].tolist() #columns where corr coef > threshold
# train_set = merged_train.drop(drop_columns, axis = 1) #drop above columns
# print("Shape of train set after column drop",train_set.shape)
# plt.savefig('../results/heatmap', dpi=500)

In [29]:
#form data for tests
x = train_set.loc[:, train_set.columns != 'class']
y = train_set.iloc[:,0]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

In [30]:
#Calculate logistic regression with KFold cross validation

kfold = KFold(n_splits=5, random_state=0, shuffle=True)
lr_model = LogisticRegression(solver='liblinear', max_iter = 500)
results = cross_val_score(lr_model, x, y, cv=kfold)

print(results)

[0.93548387 0.94108152 0.94269572 0.94027441 0.94108152]


In [32]:
#Create a svm Classifier
svm_model = svm.SVC(random_state=1, probability=True)

svm_model.fit(x_train, y_train)

results = cross_val_score(svm_model, x, y, cv=kfold)

print(results)

[0.87983871 0.87409201 0.87328491 0.8708636  0.85714286]


In [33]:
#Decision Tree Classifier

dtc_model = DecisionTreeClassifier(max_depth=10, random_state=0)
dtc_model.fit(x,y)
cross_val_score(dtc_model, x, y, cv=5)

array([0.93064516, 0.90637611, 0.85875706, 0.84100081, 0.68926554])

In [34]:
#Random Forest Classifier

rfc_model = RandomForestClassifier(max_depth=10, random_state=0)
rfc_model.fit(x, y)
cross_val_score(rfc_model, x, y, cv=5)

array([0.9266129 , 0.93139629, 0.86924939, 0.8724778 , 0.94027441])

In [35]:
#LightGBM

lgb_model = lgb.LGBMClassifier(learning_rate=0.09,max_depth=-5,random_state=42)
lgb_model.fit(x_train.to_numpy(),y_train.to_numpy(),eval_set=[(x_test.to_numpy(),y_test.to_numpy()),(x_train.to_numpy(),y_train.to_numpy())],verbose=20,eval_metric='logloss')



[20]	valid_0's binary_logloss: 0.103315	valid_1's binary_logloss: 0.0641211
[40]	valid_0's binary_logloss: 0.0536018	valid_1's binary_logloss: 0.0182648
[60]	valid_0's binary_logloss: 0.0382721	valid_1's binary_logloss: 0.00656769
[80]	valid_0's binary_logloss: 0.0320046	valid_1's binary_logloss: 0.00309038
[100]	valid_0's binary_logloss: 0.0296311	valid_1's binary_logloss: 0.00194925


LGBMClassifier(learning_rate=0.09, max_depth=-5, random_state=42)

In [36]:
print('LightGBM results:')
print(f'Training accuracy {lgb_model.score(x_train,y_train):.4f}')
print(f'Testing accuracy {lgb_model.score(x_test,y_test):.4f}')

LightGBM results:
Training accuracy 0.9993
Testing accuracy 0.9919


In [50]:
lgb_model.predict_proba(df_test)


array([[9.99949992e-01, 5.00075338e-05],
       [9.97681149e-01, 2.31885149e-03],
       [9.99201399e-01, 7.98600817e-04],
       [9.99482432e-01, 5.17568195e-04],
       [9.85365698e-01, 1.46343019e-02],
       [8.01761981e-01, 1.98238019e-01],
       [9.94600280e-01, 5.39971952e-03],
       [9.90632543e-01, 9.36745651e-03],
       [9.95723766e-01, 4.27623387e-03],
       [9.76591998e-01, 2.34080024e-02],
       [9.99880731e-01, 1.19268876e-04],
       [9.97690691e-01, 2.30930888e-03],
       [9.97878337e-01, 2.12166337e-03],
       [9.79338700e-01, 2.06612997e-02],
       [8.18530941e-01, 1.81469059e-01],
       [9.80414882e-01, 1.95851177e-02],
       [9.97745989e-01, 2.25401063e-03],
       [9.94173041e-01, 5.82695941e-03],
       [5.82814979e-01, 4.17185021e-01],
       [8.86931784e-01, 1.13068216e-01],
       [8.86931784e-01, 1.13068216e-01],
       [3.70032542e-01, 6.29967458e-01],
       [4.30577036e-01, 5.69422964e-01],
       [6.74787377e-01, 3.25212623e-01]])