In [100]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

from functions import *

from sklearn import svm, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [101]:
df_train_LOX = pd.read_csv('../data/training_set_LOX.csv')
df_train_ANT = pd.read_csv('../data/training_set_antioxidant.csv')
df_test = pd.read_csv('../data/test_set.csv')

#drop title column
df_train_LOX = df_train_LOX.drop(columns = ['Title'], axis = 1)
df_train_ANT = df_train_ANT.drop(columns = ['Title'], axis = 1)
df_test = df_test.drop(columns = ['Title'], axis = 1)



In [102]:
#replace NaN values in each file with mean of column

mean_values_LOX = df_train_LOX.mean()
mean_values_ANT = df_train_ANT.mean()

df_train_LOX = df_train_LOX.fillna(mean_values_LOX)
df_train_ANT = df_train_ANT.fillna(mean_values_ANT)

#merge LOX and ANT arrays
frames = [df_train_LOX, df_train_ANT]
merged_train = pd.concat(frames)
merged_train.shape

train_set = merged_train

In [103]:
#normalization
scaler = preprocessing.MinMaxScaler()

cols = train_set.columns
d = scaler.fit_transform(train_set)
train_set = pd.DataFrame(d, columns=cols)

cols = df_test.columns
d = scaler.fit_transform(df_test)
df_test = pd.DataFrame(d, columns=cols)

In [104]:
#form data for tests
x = train_set.loc[:, train_set.columns != 'class']
y = train_set.iloc[:,0]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [105]:
#Calculate logistic regression with KFold cross validation

kfold = KFold(n_splits=5, random_state=0, shuffle=True)
lr_model = LogisticRegression(solver='liblinear', max_iter = 500)
results = cross_val_score(lr_model, x, y, cv=kfold)

print(results)

[0.93548387 0.93946731 0.94350282 0.93946731 0.94027441]


In [106]:
#Create a svm Classifier
svm_model = svm.SVC(random_state=1, probability=True)

svm_model.fit(x_train, y_train)

results = cross_val_score(svm_model, x, y, cv=kfold)

print(results)

[0.94435484 0.96125908 0.94511703 0.95157385 0.94027441]


In [107]:
#Decision Tree Classifier

dtc_model = DecisionTreeClassifier(max_depth=10, random_state=0)
dtc_model.fit(x,y)
cross_val_score(dtc_model, x, y, cv=5)

array([0.93064516, 0.90637611, 0.85875706, 0.84100081, 0.68926554])

In [108]:
#Random Forest Classifier

rfc_model = RandomForestClassifier(max_depth=10, random_state=0)
rfc_model.fit(x, y)
cross_val_score(rfc_model, x, y, cv=5)

array([0.925     , 0.93058918, 0.87409201, 0.86521388, 0.93866021])

In [109]:
#Find best parameters for LGBMClassifier

study = optuna.create_study(direction="maximize")
func = lambda trial: objective(trial, x, y)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=100)

In [110]:
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 100
Best trial:
  Value: 0.986013986013986
  Params: 
    lambda_l1: 0.0009483150331245361
    lambda_l2: 1.6672637660060458e-05
    num_leaves: 73
    feature_fraction: 0.5507518146845714
    bagging_fraction: 0.9897600927541909
    bagging_freq: 6
    min_child_samples: 64


In [111]:
lgb_model = lgb.LGBMClassifier(**study.best_params)
lgb_model.fit(x_train.to_numpy(), y_train.to_numpy())



LGBMClassifier(bagging_fraction=0.9897600927541909, bagging_freq=6,
               feature_fraction=0.5507518146845714,
               lambda_l1=0.0009483150331245361,
               lambda_l2=1.6672637660060458e-05, min_child_samples=64,
               num_leaves=73)

In [112]:
print('LightGBM results:')
print(f'Training accuracy {lgb_model.score(x_train,y_train):.5f}')
print(f'Testing accuracy {lgb_model.score(x_test,y_test):.5f}')

LightGBM results:
Training accuracy 0.99939
Testing accuracy 0.98548


In [113]:
lgb_model.predict_proba(df_test)

array([[0.12760242, 0.87239758],
       [0.06383526, 0.93616474],
       [0.1222979 , 0.8777021 ],
       [0.05794565, 0.94205435],
       [0.05674434, 0.94325566],
       [0.05579048, 0.94420952],
       [0.95776514, 0.04223486],
       [0.98073445, 0.01926555],
       [0.96397183, 0.03602817],
       [0.93904572, 0.06095428],
       [0.92003641, 0.07996359],
       [0.95406049, 0.04593951],
       [0.85094832, 0.14905168],
       [0.11231279, 0.88768721],
       [0.00244275, 0.99755725],
       [0.13678335, 0.86321665],
       [0.09063971, 0.90936029],
       [0.07210713, 0.92789287],
       [0.00325865, 0.99674135],
       [0.0243449 , 0.9756551 ],
       [0.0243449 , 0.9756551 ],
       [0.18758513, 0.81241487],
       [0.23958361, 0.76041639],
       [0.3534125 , 0.6465875 ]])