In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

from diploma.utils.functions import objective

from sklearn import svm, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [2]:
df_train_LOX = pd.read_csv('../data/training_set_LOX.csv')
df_train_ANT = pd.read_csv('../data/training_set_antioxidant.csv')
df_test = pd.read_csv('../data/test_set.csv')

#drop title column
df_train_LOX = df_train_LOX.drop(columns = ['Title'], axis = 1)
df_train_ANT = df_train_ANT.drop(columns = ['Title'], axis = 1)
df_test = df_test.drop(columns = ['Title'], axis = 1)

In [3]:
#replace NaN values in each file with mean of column

mean_values_LOX = df_train_LOX.mean()
mean_values_ANT = df_train_ANT.mean()

df_train_LOX = df_train_LOX.fillna(mean_values_LOX)
df_train_ANT = df_train_ANT.fillna(mean_values_ANT)

#merge LOX and ANT arrays
frames = [df_train_LOX, df_train_ANT]
merged_train = pd.concat(frames)
merged_train.shape

train_set = merged_train

In [4]:
#normalization
scaler = preprocessing.MinMaxScaler()

cols = train_set.columns
d = scaler.fit_transform(train_set)
train_set = pd.DataFrame(d, columns=cols)

cols = df_test.columns
d = scaler.fit_transform(df_test)
df_test = pd.DataFrame(d, columns=cols)

In [5]:
#form data for tests
x = train_set.loc[:, train_set.columns != 'class']
y = train_set.iloc[:,0]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


In [6]:
#Calculate logistic regression with KFold cross validation

kfold = KFold(n_splits=5, random_state=0, shuffle=True)
lr_model = LogisticRegression(solver='liblinear', max_iter = 500)
results = cross_val_score(lr_model, x, y, cv=kfold)

print(results)

[0.93548387 0.93946731 0.94350282 0.93946731 0.94027441]


In [7]:
#Create a svm Classifier
svm_model = svm.SVC(random_state=1, probability=True)

svm_model.fit(x_train, y_train)

results = cross_val_score(svm_model, x, y, cv=kfold)

print(results)

[0.94435484 0.96125908 0.94511703 0.95157385 0.94027441]


In [8]:
#Decision Tree Classifier

dtc_model = DecisionTreeClassifier(max_depth=10, random_state=0)
dtc_model.fit(x,y)
cross_val_score(dtc_model, x, y, cv=5)

array([0.93064516, 0.90637611, 0.85875706, 0.84100081, 0.68926554])

In [9]:
#Random Forest Classifier

rfc_model = RandomForestClassifier(max_depth=10, random_state=0)
rfc_model.fit(x, y)
cross_val_score(rfc_model, x, y, cv=5)

array([0.925     , 0.93058918, 0.87409201, 0.86521388, 0.93866021])

In [10]:
#Find best parameters for LGBMClassifier

study = optuna.create_study(direction="maximize")
func = lambda trial: objective(trial, x.to_numpy(), y.to_numpy())
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(func, n_trials=100)

[32m[I 2023-05-11 14:42:02,493][0m A new study created in memory with name: no-name-c50d04cf-743e-483e-8dac-8a5f19f1a11d[0m


In [11]:
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 100
Best trial:
  Value: 0.9974176888315042
  Params: 
    lambda_l1: 4.5022521974395696e-05
    lambda_l2: 6.695262212853087e-06
    num_leaves: 173
    feature_fraction: 0.6012173189079907
    bagging_fraction: 0.854547120407732
    bagging_freq: 5
    min_child_samples: 85


In [12]:
lgb_model = lgb.LGBMRegressor(**study.best_params)
lgb_model.fit(x_train.to_numpy(), y_train.to_numpy())



LGBMRegressor(bagging_fraction=0.854547120407732, bagging_freq=5,
              feature_fraction=0.6012173189079907,
              lambda_l1=4.5022521974395696e-05, lambda_l2=6.695262212853087e-06,
              min_child_samples=85, num_leaves=173)

In [13]:
print('LightGBM results:')
print(f'Training accuracy {lgb_model.score(x_train,y_train):.5f}')
print(f'Testing accuracy {lgb_model.score(x_test,y_test):.5f}')

LightGBM results:
Training accuracy 0.95319
Testing accuracy 0.82617


In [15]:
lgb_model.predict(df_test)

array([0.43709231, 0.73223799, 0.45227682, 0.5099205 , 0.72337504,
       0.74555534, 0.34125417, 0.355734  , 0.27522441, 0.36343678,
       0.40305118, 0.36393379, 0.41526117, 0.65381958, 0.94833954,
       0.82038917, 0.60500225, 0.69388021, 0.83230369, 0.91553031,
       0.91553031, 0.55629355, 0.68021774, 0.5177364 ])

In [16]:
from sklearn.metrics import classification_report

y_test_temp = np.expand_dims(y_test.to_numpy(), axis=-1)

print(classification_report(y_test_temp, lgb_model.predict(x_test), target_names=['class 0', 'class1']))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [None]:
lgb_model.predict(x_test).shape