## Introduction

In [1]:
#imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

import pandas as pd
from numpy.typing import NDArray
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.metrics import accuracy_score, f1_score

np.random.seed(1234)

### Metrics 

We have a classification problem with a strong imbalance on the target class. This time we will asume equally important the missclassification errors for both classes. For this reason we are going to use the next metrics to evaluate our model:
* F1-score for class 1. 
* F1-score for class 0. 
* F1-score macro average. 
* Accuracy. (Just for checking, accuracy is not the best metric with imbalanced data)


In [2]:
def get_metrics(y_pred : NDArray, y_test: pd.core.series.Series) -> pd.DataFrame:
    accuracy = accuracy_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred, average='weighted')
    precision = precision_score(y_test,y_pred, average='weighted')
    f1_score_w = f1_score(y_test,y_pred, average='weighted')
    f1_score_m = f1_score(y_test,y_pred, average='macro')
    return [accuracy, recall, precision, f1_score_w, f1_score_m]
    

results = pd.DataFrame(columns=['Accuracy', 'Recall', 'Precision', 'F1-score weight.', 'F1-score macro.'])

# MODEL CLASSIFICATION

First of all, read the preprocessed data. It is important to be this data and not the original, because they are saved in a .csv file.

In [3]:
# read clean data
X_train = pd.read_csv('../data/X_train.csv')
y_train = pd.read_csv('../data/y_train.csv')['Severity']
X_test = pd.read_csv('../data/X_test.csv')
y_test = pd.read_csv('../data/y_test.csv')['Severity']

### Resampling protocl

It is very important to make a good resampling protocol.

For further information, check out the report.

## DECISION TREE CLASSIFIER

In [5]:
criterion = ['gini', 'entropy']

max_dephts = [None, 5, 10, 15]
min_samples_split = [1, 2, 3, 4, 5]
min_samples_leaf = [1, 2, 3, 4, 5]
max_features = ['auto', 'sqrt', 'log2', None]

model = DecisionTreeClassifier()

trc = GridSearchCV(estimator=model,
                   param_grid={
                          'criterion': criterion,
                          'max_depth': max_dephts,
                          'min_samples_split': min_samples_split,
                          'min_samples_leaf': min_samples_leaf,
                          'max_features': max_features
                   },
                   scoring='f1_weighted',
                   cv=5,
                   n_jobs=-1,
)

model_5CV = trc.fit(X_train, y_train)

800 fits failed out of a total of 4000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
800 fits failed with the following error:
Traceback (most recent call last):
  File "/home/pol/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pol/.local/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/home/pol/.local/lib/python3.10/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/home/pol/.local/lib/python3.10/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/home/pol/.local/lib/p

In [6]:
best_params = model_5CV.best_params_
best_params

{'criterion': 'gini',
 'max_depth': 15,
 'max_features': None,
 'min_samples_leaf': 5,
 'min_samples_split': 3}

In [8]:
dt_model_tuned = DecisionTreeClassifier(**best_params)
dt_model_tuned.fit(X_train, y_train)

y_pred = dt_model_tuned.predict(X_test)

In [11]:
# Append values to the results DataFrame
results.loc[0] = get_metrics(y_pred, y_test)
print(results)

   Accuracy    Recall  Precision  F1-score weight.  F1-score macro.
0  0.822554  0.822554   0.800695          0.807755         0.475068


In [None]:
# from sklearn.metrics import roc_curve, auc
# from sklearn.preprocessing import label_binarize
# import matplotlib.pyplot as plt

# y_test_binarized = label_binarize(y_test, classes=[1, 2, 3, 4])
# y_pred_proba = best_model.predict_proba(X_test)


# fpr = dict()
# tpr = dict()
# roc_auc = dict()
# for i in range(4): 
#     fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_pred_proba[:, i])
#     roc_auc[i] = auc(fpr[i], tpr[i])

# plt.figure()
# colors = ['blue', 'red', 'green', 'orange'] 
# for i, color in zip(range(4), colors):
#     plt.plot(fpr[i], tpr[i], color=color, lw=2,
#              label='ROC curve of class {0} (area = {1:0.2f})'
#              ''.format(i+1, roc_auc[i]))

# plt.plot([0, 1], [0, 1], 'k--', lw=2)
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver Operating Characteristic (ROC) Curve')
# plt.legend(loc="lower right")
# plt.show()


## RANDOM FOREST

In [None]:
# model = RandomForestClassifier()

# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234, )

# param_grid = {
#     'max_depth': [3, 10, 30, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# grid_search = GridSearchCV(model, param_grid = param_grid, cv = cv, scoring='f1_weighted', n_jobs=-1)

# grid_search.fit(X_train, y_train)

# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# results_df = pd.DataFrame({'Best_Parameters': [best_params], 'Best_F1_Weighted': [best_score]}, index=['Random Forest'])

# print(results_df)

# best_model = RandomForestClassifier(**best_params)
# best_model.fit(X_train, y_train)

# y_pred = best_model.predict(X_test)
# f1 = f1_score(y_test, y_pred, average='weighted')

# results_df = pd.DataFrame({'F1_Weighted': [f1]}, index=['Random Forest'])

# print(results_df)

## EXTRA TREES CLASSIFIER

In [None]:
# model = ExtraTreesClassifier()

# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234, )

# param_grid = {
#     'max_depth': [3, 10, 30, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# grid_search = GridSearchCV(model, param_grid = param_grid, cv = cv, scoring='f1_weighted', n_jobs=-1)

# grid_search.fit(X_train, y_train)

# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# results_df = pd.DataFrame({'Best_Parameters': [best_params], 'Best_F1_Weighted': [best_score]}, index=['Extra Trees'])

# print(results_df)

# best_model = ExtraTreesClassifier(**best_params)
# best_model.fit(X_train, y_train)

# y_pred = best_model.predict(X_test)
# f1 = f1_score(y_test, y_pred, average='weighted')

# results_df = pd.DataFrame({'F1_Weighted': [f1]}, index=['Extra Trees'])

# print(results_df)

## Voting Classifier

## Stacking classifier

## LOGISTIC REGRESSION

In [None]:
logreg = LogisticRegressionCV(Cs=5, cv = 5, scoring = 'f1_weighted', multi_class='multinomial', random_state = 1234)

logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
optimal_C = logreg.Cs_[logreg.scores_[1].mean(axis=0).argmax()]
print("Optimal value for C:", optimal_C)

Optimal value for C: 10000.0


In [None]:
logreg = LogisticRegression(C=optimal_C, multi_class='multinomial')
cross_val_results = pd.DataFrame(cross_validate(logreg, X_train, y_train, cv = 5, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

print(cross_val_results)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit

   fit_time  score_time  test_accuracy  test_f1_macro  test_precision_macro  \
0  4.472640    0.042470       0.802048       0.222538              0.200512   
1  4.761011    0.039888       0.802048       0.222538              0.200512   
2  4.389025    0.038607       0.801857       0.223758              0.294337   
3  4.285354    0.042983       0.802048       0.222538              0.200512   
4  4.605562    0.044959       0.802067       0.222541              0.200517   

   test_recall_macro  
0           0.250000  
1           0.250000  
2           0.250436  
3           0.250000  
4           0.250000  


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
logreg = LogisticRegression(C=optimal_C, multi_class='multinomial')
cross_val_results = pd.DataFrame(cross_validate(logreg, X_train, y_train, cv = 5, scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'] ))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit

In [None]:
print(cross_val_results)

   fit_time  score_time  test_accuracy  test_f1_macro  test_precision_macro  \
0  4.023393    0.037168       0.802048       0.222538              0.200512   
1  4.329232    0.035230       0.802048       0.222538              0.200512   
2  4.226688    0.039190       0.801857       0.223758              0.294337   
3  4.219748    0.042141       0.802048       0.222538              0.200512   
4  4.211854    0.041782       0.802067       0.222541              0.200517   

   test_recall_macro  
0           0.250000  
1           0.250000  
2           0.250436  
3           0.250000  
4           0.250000  


## QDA

This model is used to predict the class from an input feature space. The model is based on prior probabilities on the input data. It assumes they are Gaussian distributed, so it was important to normalize the data in the preprocessing.

In [None]:
#We will do a 5-fold cross-validation to find the best hyperparameters for the QDA model
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

param_grid = {'reg_param': [0, 0.1, 0.2, 0.5]}
qda_model = QuadraticDiscriminantAnalysis()
grid_search = GridSearchCV(qda_model, param_grid = param_grid, cv = cv, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params_qda = grid_search.best_params_
best_score_qda = grid_search.best_score_

results_df_qda = pd.DataFrame({'Best_Parameters': [best_params_qda], 'Best_F1_Weighted': [best_score_qda]}, index=['QDA'])

print(results_df_qda)

best_qda_model = QuadraticDiscriminantAnalysis(**best_params_qda)
best_qda_model.fit(X_train, y_train)

y_pred_qda = best_qda_model.predict(X_test)
f1 = f1_score(y_test, y_pred_qda, average='weighted')

results_df_qda = pd.DataFrame({'F1_Weighted': [f1]}, index=['QDA_test'])

print(results_df_qda)



        Best_Parameters  Best_F1_Weighted
QDA  {'reg_param': 0.1}          0.773041




          F1_Weighted
QDA_test     0.772005


## LDA

This model is very similar to the previous one, here we will suppose that every class in Severity has the same covariance matrix.

In [None]:
#We will do a 5-fold cross-validation to find the best hyperparameters for the LDA model
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234, )

param_grid = {'solver': ['lsqr', 'eigen'], 'shrinkage': [0, 0.1, 0.2, 0.5]}
lda_model = LinearDiscriminantAnalysis()
grid_search = GridSearchCV(lda_model, param_grid = param_grid, cv = cv, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params_lda = grid_search.best_params_
best_score_lda = grid_search.best_score_

results_df_lda = pd.DataFrame({'Best_Parameters': [best_params_lda], 'Best_F1_Weighted': [best_score_lda]}, index=['LDA'])

print(results_df_lda)

best_lda_model = LinearDiscriminantAnalysis(**best_params_lda)
best_lda_model.fit(X_train, y_train)

y_pred_lda = best_lda_model.predict(X_test)
f1 = f1_score(y_test, y_pred_lda, average='weighted')

results_df_lda = pd.DataFrame({'F1_Weighted': [f1]}, index=['LDA_test'])

print(results_df_qda)

                        Best_Parameters  Best_F1_Weighted
QDA  {'shrinkage': 0, 'solver': 'lsqr'}          0.760441
          F1_Weighted
QDA_test     0.772005


## GAUSSIAN NAIVE BAYE

Gaussian Naive Bayes (GNB) is a simple probabilistic classifier based on Bayes' theorem with the assumption of independence between features. In the preprocessing we saw that there is not a lot of correlation between variables, so we can apply this method.

In [None]:
#We will do a 5-fold cross-validation to find the best hyperparameters for the GNB model
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234, )

#For the priors we will use the default values of None and the proportion of every class in training the dataset
class_counts = y_train.value_counts()
class_counts_sorted = class_counts.sort_index()
class_proportions = class_counts_sorted / len(y_train)
param_grid = {'priors': [None, class_proportions.values]}
gnb_model = GaussianNB()
grid_search = GridSearchCV(gnb_model, param_grid = param_grid, cv = cv, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params_gnb = grid_search.best_params_
best_score_gnb = grid_search.best_score_

results_df_gnb = pd.DataFrame({'Best_Parameters': [best_params_gnb], 'Best_F1_Weighted': [best_score_gnb]}, index=['GNB'])

print(results_df_gnb)

best_gnb_model = LinearDiscriminantAnalysis(**best_params_gnb)
best_gnb_model.fit(X_train, y_train)

y_pred_gnb = best_gnb_model.predict(X_test)
f1 = f1_score(y_test, y_pred_gnb, average='weighted')

results_df_gnb = pd.DataFrame({'F1_Weighted': [f1]}, index=['GNB_test'])

print(results_df_gnb)

      Best_Parameters  Best_F1_Weighted
GNB  {'priors': None}          0.339296
          F1_Weighted
GNB_test     0.761569


As we see this models does not fits well the training data, however it performs well on the test data. This could be due to we supposed variables were uncorrelated, but in fact they are. Thus there exists dependece between variables.

## $k$-NN

This methods predicts the actual sample by using similar properties from the k nearest neighbours. Each new sample will go to the class with the most similar values from the train set. 

In [None]:
#We will do a 5-fold cross-validation to find the best hyperparameters for the LDA model
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234, )

#For the priors we will use the default values of None and the proportion of every class in training the dataset
param_grid = {'n_neighbors': [1, 3, 5, 7, 10, 15, 20], 'metric': ['euclidean', 'minkowski', 'manhattan']}
knn_model = KNeighborsClassifier()
grid_search = GridSearchCV(knn_model, param_grid = param_grid, cv = cv, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params_knn = grid_search.best_params_
best_score_knn = grid_search.best_score_

results_df_knn = pd.DataFrame({'Best_Parameters': [best_params_knn], 'Best_F1_Weighted': [best_score_knn]}, index=['K-NN'])

print(results_df_knn)

best_knn_model = LinearDiscriminantAnalysis(**best_params_knn)
best_knn_model.fit(X_train, y_train)

y_pred_knn = best_knn_model.predict(X_test)
f1 = f1_score(y_test, y_pred_knn, average='weighted')

results_df_knn = pd.DataFrame({'F1_Weighted': [f1]}, index=['K-NN_test'])

print(results_df_knn)