In [25]:
import pandas as pd
import numpy as np
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [26]:
df_train_LOX = pd.read_csv('../data/training_set_LOX.csv')
df_train_ANT = pd.read_csv('../data/training_set_antioxidant.csv')
df_test = pd.read_csv('../data/test_set.csv')

#drop title column
df_train_LOX = df_train_LOX.drop(columns = ['Title'], axis = 1)
df_train_ANT = df_train_ANT.drop(columns = ['Title'], axis = 1)
df_test = df_test.drop(columns = ['Title'], axis = 1)



In [27]:
#replace NaN values in each file with mean of column

mean_values_LOX = df_train_LOX.mean()
mean_values_ANT = df_train_ANT.mean()

df_train_LOX = df_train_LOX.fillna(mean_values_LOX)
df_train_ANT = df_train_ANT.fillna(mean_values_ANT)

#merge LOX and ANT arrays
frames = [df_train_LOX, df_train_ANT]
merged_train = pd.concat(frames)
merged_train.shape

train_set = merged_train

In [28]:
#normalization
scaler = preprocessing.MinMaxScaler()

cols = train_set.columns
d = scaler.fit_transform(train_set)
train_set = pd.DataFrame(d, columns=cols)

cols = df_test.columns
d = scaler.fit_transform(df_test)
df_test = pd.DataFrame(d, columns=cols)

In [29]:
#form data for tests
x = train_set.loc[:, train_set.columns != 'class']
y = train_set.iloc[:,0]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

In [30]:
#Calculate logistic regression with KFold cross validation

kfold = KFold(n_splits=5, random_state=0, shuffle=True)
lr_model = LogisticRegression(solver='liblinear', max_iter = 500)
results = cross_val_score(lr_model, x, y, cv=kfold)

print(results)

[0.93548387 0.93946731 0.94350282 0.93946731 0.94027441]


In [31]:
#Create a svm Classifier
svm_model = svm.SVC(random_state=1, probability=True)

svm_model.fit(x_train, y_train)

results = cross_val_score(svm_model, x, y, cv=kfold)

print(results)

[0.94435484 0.96125908 0.94511703 0.95157385 0.94027441]


In [32]:
#Decision Tree Classifier

dtc_model = DecisionTreeClassifier(max_depth=10, random_state=0)
dtc_model.fit(x,y)
cross_val_score(dtc_model, x, y, cv=5)

array([0.93064516, 0.90637611, 0.85875706, 0.84100081, 0.68926554])

In [33]:
#Random Forest Classifier

rfc_model = RandomForestClassifier(max_depth=10, random_state=0)
rfc_model.fit(x, y)
cross_val_score(rfc_model, x, y, cv=5)

array([0.925     , 0.93058918, 0.87409201, 0.86521388, 0.93866021])

In [34]:
#LightGBM

lgb_model = lgb.LGBMClassifier(learning_rate=0.09,max_depth=-5,random_state=42)
lgb_model.fit(x_train.to_numpy(),y_train.to_numpy(),eval_set=[(x_test.to_numpy(),y_test.to_numpy()),(x_train.to_numpy(),y_train.to_numpy())],verbose=20,eval_metric='logloss')



[20]	valid_0's binary_logloss: 0.102757	valid_1's binary_logloss: 0.0638257
[40]	valid_0's binary_logloss: 0.0544309	valid_1's binary_logloss: 0.0184517
[60]	valid_0's binary_logloss: 0.0391404	valid_1's binary_logloss: 0.00659733
[80]	valid_0's binary_logloss: 0.0324896	valid_1's binary_logloss: 0.00306784
[100]	valid_0's binary_logloss: 0.0303458	valid_1's binary_logloss: 0.00196185


LGBMClassifier(learning_rate=0.09, max_depth=-5, random_state=42)

In [35]:
print('LightGBM results:')
print(f'Training accuracy {lgb_model.score(x_train,y_train):.5f}')
print(f'Testing accuracy {lgb_model.score(x_test,y_test):.5f}')

LightGBM results:
Training accuracy 0.99928
Testing accuracy 0.99194


In [36]:
lgb_model.predict_proba(df_test)

array([[0.65653862, 0.34346138],
       [0.17330484, 0.82669516],
       [0.61474862, 0.38525138],
       [0.36774581, 0.63225419],
       [0.19199214, 0.80800786],
       [0.21718542, 0.78281458],
       [0.98185694, 0.01814306],
       [0.99372403, 0.00627597],
       [0.98061812, 0.01938188],
       [0.96271712, 0.03728288],
       [0.92602146, 0.07397854],
       [0.98000265, 0.01999735],
       [0.99239512, 0.00760488],
       [0.28671649, 0.71328351],
       [0.0477698 , 0.9522302 ],
       [0.03183158, 0.96816842],
       [0.46024267, 0.53975733],
       [0.5267568 , 0.4732432 ],
       [0.11690631, 0.88309369],
       [0.05557329, 0.94442671],
       [0.05557329, 0.94442671],
       [0.81861631, 0.18138369],
       [0.39116084, 0.60883916],
       [0.84211679, 0.15788321]])