In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

from diploma.utils.functions import objective

from sklearn import svm, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [2]:
df_train_LOX = pd.read_csv('../data/training_set_LOX.csv')
df_train_ANT = pd.read_csv('../data/training_set_antioxidant.csv')
df_test = pd.read_csv('../data/test_set.csv')

#drop title column
df_train_LOX = df_train_LOX.drop(columns = ['Title'], axis = 1)
df_train_ANT = df_train_ANT.drop(columns = ['Title'], axis = 1)
df_test = df_test.drop(columns = ['Title'], axis = 1)

In [3]:
#replace NaN values in each file with mean of column

mean_values_LOX = df_train_LOX.mean()
mean_values_ANT = df_train_ANT.mean()

df_train_LOX = df_train_LOX.fillna(mean_values_LOX)
df_train_ANT = df_train_ANT.fillna(mean_values_ANT)

#merge LOX and ANT arrays
frames = [df_train_LOX, df_train_ANT]
merged_train = pd.concat(frames)
merged_train.shape

train_set = merged_train

In [4]:
#normalization
scaler = preprocessing.MinMaxScaler()

cols = train_set.columns
d = scaler.fit_transform(train_set)
train_set = pd.DataFrame(d, columns=cols)

cols = df_test.columns
d = scaler.fit_transform(df_test)
df_test = pd.DataFrame(d, columns=cols)

In [5]:
#form data for tests
x = train_set.loc[:, train_set.columns != 'class']
y = train_set.iloc[:,0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


In [6]:
#Calculate logistic regression with KFold cross validation

kfold = KFold(n_splits=5, random_state=0, shuffle=True)
lr_model = LogisticRegression(solver='liblinear', max_iter = 500)
results = cross_val_score(lr_model, x, y, cv=kfold)

print(results)

[0.93548387 0.93946731 0.94350282 0.93946731 0.94027441]


In [7]:
#Create a svm Classifier
svm_model = svm.SVC(random_state=1, probability=True)

svm_model.fit(x_train, y_train)

results = cross_val_score(svm_model, x, y, cv=kfold)

print(results)

[0.94435484 0.96125908 0.94511703 0.95157385 0.94027441]


In [8]:
#Decision Tree Classifier

dtc_model = DecisionTreeClassifier(max_depth=10, random_state=0)
dtc_model.fit(x,y)
cross_val_score(dtc_model, x, y, cv=5)

array([0.93064516, 0.90637611, 0.85875706, 0.84100081, 0.68926554])

In [9]:
#Random Forest Classifier

rfc_model = RandomForestClassifier(max_depth=10, random_state=0)
rfc_model.fit(x, y)
cross_val_score(rfc_model, x, y, cv=5)

array([0.925     , 0.93058918, 0.87409201, 0.86521388, 0.93866021])

In [10]:
#Find best parameters for LGBMReggresor

study = optuna.create_study(direction="maximize")
func = lambda trial: objective(trial, x.to_numpy(), y.to_numpy())
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(func, n_trials=100)

[32m[I 2023-01-28 18:31:27,551][0m A new study created in memory with name: no-name-2cc26ae1-cc69-4dcd-9485-877c0a394e46[0m


In [11]:
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 100
Best trial:
  Value: 0.9980632666236281
  Params: 
    lambda_l1: 0.045530746769831466
    lambda_l2: 2.868067627657739e-07
    num_leaves: 161
    feature_fraction: 0.526764381967646
    bagging_fraction: 0.8958129478410691
    bagging_freq: 7
    min_child_samples: 69


In [12]:
lgb_model = lgb.LGBMClassifier(**study.best_params)
lgb_model.fit(x_train.to_numpy(), y_train.to_numpy())



LGBMClassifier(bagging_fraction=0.8958129478410691, bagging_freq=7,
               feature_fraction=0.526764381967646,
               lambda_l1=0.045530746769831466, lambda_l2=2.868067627657739e-07,
               min_child_samples=69, num_leaves=161)

In [13]:
print('LightGBM results:')
print(f'Training accuracy {lgb_model.score(x_train,y_train):.5f}')
print(f'Testing accuracy {lgb_model.score(x_test,y_test):.5f}')

LightGBM results:
Training accuracy 0.99939
Testing accuracy 0.98629


In [14]:
lgb_model.predict_proba(df_test)

array([[0.69966104, 0.30033896],
       [0.19202901, 0.80797099],
       [0.37383838, 0.62616162],
       [0.31772846, 0.68227154],
       [0.31466739, 0.68533261],
       [0.26401325, 0.73598675],
       [0.94223502, 0.05776498],
       [0.98331878, 0.01668122],
       [0.94026257, 0.05973743],
       [0.91735828, 0.08264172],
       [0.91974081, 0.08025919],
       [0.95494277, 0.04505723],
       [0.84481606, 0.15518394],
       [0.47526339, 0.52473661],
       [0.00514436, 0.99485564],
       [0.01102637, 0.98897363],
       [0.06959809, 0.93040191],
       [0.1060991 , 0.8939009 ],
       [0.02491722, 0.97508278],
       [0.05293639, 0.94706361],
       [0.05293639, 0.94706361],
       [0.19332162, 0.80667838],
       [0.16547339, 0.83452661],
       [0.41597815, 0.58402185]])

In [15]:
from sklearn.metrics import classification_report

y_test_temp = np.expand_dims(y_test.to_numpy(), axis=-1)

print(classification_report(y_test_temp, lgb_model.predict(x_test), target_names=['class 0', 'class1']))

              precision    recall  f1-score   support

     class 0       0.99      0.99      0.99      1088
      class1       0.95      0.94      0.94       152

    accuracy                           0.99      1240
   macro avg       0.97      0.97      0.97      1240
weighted avg       0.99      0.99      0.99      1240



In [16]:
lgb_model.predict(x_test).shape

(1240,)