In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

from functions import *

from sklearn import svm, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [2]:
df_train_LOX = pd.read_csv('../data/training_set_LOX.csv')
df_train_ANT = pd.read_csv('../data/training_set_antioxidant.csv')
df_test = pd.read_csv('../data/test_set.csv')

#drop title column
df_train_LOX = df_train_LOX.drop(columns = ['Title'], axis = 1)
df_train_ANT = df_train_ANT.drop(columns = ['Title'], axis = 1)
df_test = df_test.drop(columns = ['Title'], axis = 1)

In [3]:
#replace NaN values in each file with mean of column

mean_values_LOX = df_train_LOX.mean()
mean_values_ANT = df_train_ANT.mean()

df_train_LOX = df_train_LOX.fillna(mean_values_LOX)
df_train_ANT = df_train_ANT.fillna(mean_values_ANT)

#merge LOX and ANT arrays
frames = [df_train_LOX, df_train_ANT]
merged_train = pd.concat(frames)
merged_train.shape

train_set = merged_train

In [4]:
#normalization
scaler = preprocessing.MinMaxScaler()

cols = train_set.columns
d = scaler.fit_transform(train_set)
train_set = pd.DataFrame(d, columns=cols)

cols = df_test.columns
d = scaler.fit_transform(df_test)
df_test = pd.DataFrame(d, columns=cols)

In [5]:
#form data for tests
x = train_set.loc[:, train_set.columns != 'class']
y = train_set.iloc[:,0]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [6]:
#Calculate logistic regression with KFold cross validation

kfold = KFold(n_splits=5, random_state=0, shuffle=True)
lr_model = LogisticRegression(solver='liblinear', max_iter = 500)
results = cross_val_score(lr_model, x, y, cv=kfold)

print(results)

[0.93548387 0.93946731 0.94350282 0.93946731 0.94027441]


In [7]:
#Create a svm Classifier
svm_model = svm.SVC(random_state=1, probability=True)

svm_model.fit(x_train, y_train)

results = cross_val_score(svm_model, x, y, cv=kfold)

print(results)

[0.94435484 0.96125908 0.94511703 0.95157385 0.94027441]


In [8]:
#Decision Tree Classifier

dtc_model = DecisionTreeClassifier(max_depth=10, random_state=0)
dtc_model.fit(x,y)
cross_val_score(dtc_model, x, y, cv=5)

array([0.93064516, 0.90637611, 0.85875706, 0.84100081, 0.68926554])

In [9]:
#Random Forest Classifier

rfc_model = RandomForestClassifier(max_depth=10, random_state=0)
rfc_model.fit(x, y)
cross_val_score(rfc_model, x, y, cv=5)

array([0.925     , 0.93058918, 0.87409201, 0.86521388, 0.93866021])

In [10]:
#Find best parameters for LGBMReggresor

study = optuna.create_study(direction="maximize")
func = lambda trial: objective(trial, x.to_numpy(), y.to_numpy())
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(func, n_trials=100)

[32m[I 2023-01-25 14:59:37,579][0m A new study created in memory with name: no-name-ef35cde3-1fb7-4d7b-a3ce-5a941bc4b72a[0m


In [11]:
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 100
Best trial:
  Value: 0.9967721110393802
  Params: 
    lambda_l1: 7.338795302958588e-08
    lambda_l2: 8.428409858049367e-05
    num_leaves: 235
    feature_fraction: 0.9298572477432173
    bagging_fraction: 0.8963109588553573
    bagging_freq: 7
    min_child_samples: 78


In [12]:
lgb_model = lgb.LGBMRegressor(**study.best_params)
lgb_model.fit(x_train.to_numpy(), y_train.to_numpy())



LGBMRegressor(bagging_fraction=0.8963109588553573, bagging_freq=7,
              feature_fraction=0.9298572477432173,
              lambda_l1=7.338795302958588e-08, lambda_l2=8.428409858049367e-05,
              min_child_samples=78, num_leaves=235)

In [13]:
print('LightGBM results:')
print(f'Training accuracy {lgb_model.score(x_train,y_train):.5f}')
print(f'Testing accuracy {lgb_model.score(x_test,y_test):.5f}')

LightGBM results:
Training accuracy 0.96271
Testing accuracy 0.84182


In [15]:
lgb_model.predict(df_test)

array([0.58155325, 0.77926954, 0.69347654, 0.72190994, 0.80973831,
       0.77881118, 0.42438262, 0.48910668, 0.42684031, 0.46955821,
       0.49275965, 0.46296304, 0.56946469, 0.81952144, 0.83113863,
       0.78816333, 0.54138454, 0.55593854, 0.63780771, 0.75789718,
       0.75789718, 0.58444094, 0.67427794, 0.59967183])