## Introduction

In [1]:
#imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

from sklearn.metrics import accuracy_score, f1_score

from itertools import cycle

np.random.seed(1234)

import sys
sys.path.append('../')
from utils import *

# MODEL CLASSIFICATION

LLegim les dades preprocessades. És important que s'executi abans el notebook de preprocessing perquè es guarden les dades preprocessades en un fitxer .csv.

In [2]:
# read clean data
data = pd.read_csv('../data/clean_data.csv')

### Resampling protocl

In [3]:
X = data.loc[:, data.columns != "Severity"]
y = data["Severity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1234)

Now, let's create a dataframe to store the results for each model.

In [4]:
results = pd.DataFrame(index=[], columns= ['Accuracy', 'Recall', 'Precision', 'F1 Score'])

### DECISION TREE CLASSIFIER

In [5]:
model = DecisionTreeClassifier(random_state=1234)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

param_grid = {
    'max_depth': [3, 10, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(model, param_grid = param_grid, cv = cv, scoring='f1_weighted', n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_model = DecisionTreeClassifier(**best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

In [6]:
results.loc['DecisionTreeClassifier', :] = get_metrics(y_pred, y_test)
results

Unnamed: 0,Accuracy,Recall,Precision,F1 Score
DecisionTreeClassifier,0.810473,0.810473,0.797908,0.803417


## LOGISTIC REGRESSION

In [14]:
logreg = LogisticRegressionCV(Cs=5, cv = 5, scoring = 'f1_weighted', multi_class='multinomial', random_state = 1234)

logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [13]:
optimal_C = logreg.Cs_[logreg.scores_[1].mean(axis=0).argmax()]
print("Optimal value for C:", optimal_C)
logreg = LogisticRegression(C=optimal_C, multi_class='multinomial')

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

AttributeError: 'LogisticRegression' object has no attribute 'Cs_'

In [None]:
results.loc['Logistic Regression', :] = get_metrics(y_pred, y_test)
results

## QDA

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234, )

param_grid = {'reg_param': [0, 0.1, 0.2, 0.5]}
qda_model = QuadraticDiscriminantAnalysis()
grid_search = GridSearchCV(qda_model, param_grid = param_grid, cv = cv, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_

results_df = pd.DataFrame({'Best_Parameters': [best_params], 'Best_F1_Weighted': [best_score]}, index=['QDA'])


best_qda_model = QuadraticDiscriminantAnalysis(**best_params)
best_qda_model.fit(X_train, y_train)

y_pred = best_qda_model.predict(X_test)

results.loc['QDA', :] = get_metrics(y_pred, y_test)
results
#f1 = f1_score(y_test, y_pred, average='weighted')
#results_df = pd.DataFrame({'F1_Weighted': [f1]}, index=['QDA_test'])
# print(results_df)