In [65]:
import os
import sys
import joblib
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

sys.path.append('../')
from functionality import funs

from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier,  GradientBoostingClassifier

from sklearn.gaussian_process.kernels import RBF

from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, LabelEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import recall_score, precision_score, accuracy_score, roc_auc_score, make_scorer, classification_report

# Seed the random generators for reproducibility.
# PYTHONHASHSEED is only read when an interpreter starts, so assigning it here
# cannot change hashing in the running kernel; it is kept because processes
# started later inherit the environment variable.
os.environ['PYTHONHASHSEED']= str(2124)

# Seed NumPy's global generator as well: scikit-learn estimators that are
# created without an explicit `random_state` draw from it.
np.random.seed(2124)

# Compact six-colour palette.
colors = [
    'red', 'darksalmon',
    'olive', 'darkseagreen',
    'dodgerblue', 'navy',
]

# Full fifteen-colour palette used by the plots in this notebook.
color = [
    'maroon', 'red', 'tomato', 'darksalmon', 'firebrick',
    'darkseagreen', 'seagreen', 'lightseagreen', 'olive', 'green',
    'dodgerblue', 'deepskyblue', 'navy', 'blue', 'royalblue',
]

# Register the full palette as the seaborn default.
my_palette = sns.color_palette(color)
sns.set_palette(my_palette)

# Directory the notebook runs from, and its parent folder.
working = os.getcwd()
dirname = os.path.split(working)[0]

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Higher Education Students Performance Evaluation

- The data was collected in 2019 from Turkish students at two faculties: the Faculty of Engineering and the Faculty of Educational Sciences. <br>

- The goal is to create an ML model that can predict student performance given the data taken from a survey.

- The grades are categorical –AA, BA, BB, CB, CC, DC, DD, and Fail– hence the problem should be modelled as a multi-class classification.

<br>

**Data Set Information**

The data contains the results of a survey: columns 1-10 relate to personal questions, columns 11-16 are family related, and the remaining columns cover education habits.

In [66]:
# Read the survey data. 'Course ID' is forced to object dtype so that it is
# picked up by the object-dtype column selector used for one-hot encoding.
data = pd.read_csv(
    '../data/data.csv',
    dtype={'Course ID': object},
)

## 1. Exploratory Data Analysis

In [67]:
# Category orderings passed to the EDA plots below.
# Values read from the data are sorted; the hand-written lists fix a custom order.
ages = data.Age.sort_values().unique()
grades = data.Grade.sort_values().unique()
hours = ['none','<5 hours','6-10 hours','11-20 hours','more than 20 hours']
scholarship = data.Scholarship.sort_values().unique()
notes = ['always', 'sometimes', 'never']
listening = ['always', 'sometimes', 'never']
attendance = ['always', 'sometimes']
exams1 = data['Preparation to Midterm Exams 1'].sort_values().unique()
exams2 = data['Preparation to Midterm Exams 2'].sort_values().unique()
fathers = ['Ph.D.','MSc.','university','high school','primary school','secondary school']
mothers = ['Ph.D.','MSc.','university','high school','primary school','secondary school']
# `courses` was previously assigned twice with the same expression; once is enough.
courses = data['Course ID'].sort_values().unique()
# Unlike the others, the transport categories keep their order of appearance.
transport = data['Transportation to University'].unique()

In [68]:
# Number of students per grade, as a two-column frame (Grade, Count).
props = (
    data.groupby('Grade')['Grade']
    .count()
    .to_frame('Count')
    .reset_index()
)

# A 1x1 subplot grid whose only cell is of type 'domain' (needed for a pie trace).
fig = make_subplots(
    rows=1,
    cols=1,
    specs=[[{'type':'domain'}]])

fig.add_trace(
    go.Pie(
        labels=props['Grade'],
        values=props['Count'],
        # Exactly one colour per grade; the previous slice took len(grades)+1.
        marker={'colors':color[:len(grades)]},
        # Keep the grade order of `props` instead of sorting slices by size.
        sort=False),
    1, 1)

fig.update_traces(textposition='inside', textinfo='percent+label')

# Update layout settings for the figure. This stays the last expression of the
# cell: it returns the figure, which is what the notebook displays.
fig.update_layout(
    title={'text':'Grades % Distribution','font_size':20},
    showlegend=False,
    height=650,
    width=1650,
    template='plotly_white')

The outcome data –the grades– shows an imbalanced distribution. Whilst DD accounts for 25% of the data, BA and CB each account for less than 10%, and Fail represents only 5.5% of the whole data –only eight points–. This will eventually present a problem: the model will have few data points from which to learn the Fail grade, but many more from which to learn the DD grade.

In [69]:
# Long-format table with one row per (Course ID, Grade) pair and the number
# of students in it; missing combinations are filled with 0.
course_grade_counts = (
    data
    .pivot_table(
        index='Grade',
        columns='Course ID',
        values='Student ID',
        fill_value=0,
        aggfunc='count')
    .unstack()
    .to_frame('Count')
    .reset_index()
)

# Map each grade to a colour, taking the palette in reverse order.
grade_colours = dict(zip(grades, np.flip(color)[0:len(grades)]))

fig = px.bar(
    course_grade_counts,
    x='Course ID',
    y='Count',
    color='Grade',
    category_orders={'Grade': grades},
    labels={'Count':'# Students'},
    color_discrete_map=grade_colours,
)

fig.update_layout(
    title='Grades by Course ID',
    template='plotly_white',
    showlegend=True,
    height=500,
    width=1650,
    yaxis_range = [0,70])

fig.show()

In [70]:
# EDA plot for 'Age' via the project helper, passing the sorted unique ages
# and facet_col='Sex'. (Helper behaviour lives in functionality.funs.)
funs.eda_plotter(data, 'Age', ages, facet_col='Sex')

In [71]:
# EDA plot for 'Weekly Study Hours', passing the hand-ordered `hours` list.
funs.eda_plotter(data, 'Weekly Study Hours', hours)

In [72]:
# EDA plot for 'Scholarship', passing the sorted unique scholarship values.
funs.eda_plotter(data, 'Scholarship', scholarship)

In [73]:
# EDA plot for the three in-class habit columns; the second list gives the
# category ordering for each column, in the same position.
funs.eda_plotter(data, ['Attendance to Classes','Listening in Classes','Taking Notes in Classes'], [attendance,listening,notes])

In [74]:
# EDA plot for the two midterm-preparation columns, each with its own sorted
# list of unique values.
funs.eda_plotter(data, ['Preparation to Midterm Exams 1','Preparation to Midterm Exams 2'], [exams1, exams2])

In [75]:
# EDA plot for 'Fathers Education', passing the hand-ordered `fathers` list.
funs.eda_plotter(data, 'Fathers Education', fathers)

In [76]:
# EDA plot for 'Mothers Education'. Use the `mothers` ordering defined for this
# column; the call previously passed `fathers`, which only worked because the
# two lists currently hold the same values.
funs.eda_plotter(data, 'Mothers Education', mothers)

## 2. Data Preparation

In [77]:
# Drop the rows whose category value (per the original note) appears only once.
# NOTE(review): recent pandas versions parse the literal text 'None' as NaN in
# read_csv by default, in which case the 'Scholarship' test below matches
# nothing - confirm against the loaded data.
rare_rows = (
    (data['Scholarship'] == 'None')
    | (data['Transportation to University'] == 'bicycle')
    | (data['Accommodation Type'] == 'other')
    | (data['Fathers Education'] == 'Ph.D.')
)
data = data.loc[~rare_rows]

In [78]:
# Target: the grade. Features: every column except the student identifier
# and the grade itself.
y = data['Grade']
X = data.drop(columns=['Student ID', 'Grade'])

In [79]:
# Sorted array of the distinct grades present in y (after the filtering above).
# It is passed as `labels` to the AUC scorer and to the reporting helpers.
classes = y.sort_values().unique()

Error, Data Transformation, K-Fold and Metrics

In [80]:
# One-hot encode every object-dtype column (dense output); all remaining
# columns are passed through unchanged.
Transformer = make_column_transformer(
    (
        OneHotEncoder(sparse_output=False),
        make_column_selector(dtype_include=object),
    ),
    remainder="passthrough",
)

# Three stratified shuffle splits, each holding out 10% of the rows.
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=6064)

# Scorers used to assess the models. Precision, recall and AUC are
# macro-averaged over the grades; AUC is one-vs-rest on predicted probabilities.
# NOTE(review): `needs_proba` is deprecated in newer scikit-learn releases in
# favour of `response_method` - confirm against the installed version.
metrics = {
    'accuracy': make_scorer(
        accuracy_score, greater_is_better=True),
    'precision_macro': make_scorer(
        precision_score, greater_is_better=True, average='macro', zero_division=0),
    'recall_macro': make_scorer(
        recall_score, greater_is_better=True, average='macro', zero_division=0),
    'auc': make_scorer(
        roc_auc_score, greater_is_better=True, average='macro',
        needs_proba=True, multi_class='ovr', labels=classes),
}

Precision and recall provide insights into the model's performance for each class individually, while accuracy gives an overall view of the model's correctness. Since this is a multi-class classification problem, precision and recall are calculated individually for each class and then averaged.


Precision: measures the proportion of correctly predicted grades out of all grades predicted as a specific grade. In this case, when predicting an AA grade, what proportion of all predicted AA grades were truly AA grades. The procedure is repeated for each individual grade. High precision indicates that the model is good at correctly identifying a specific grade without confusing it with the other grades. However, it doesn't consider the case when a real grade was not predicted as such.


Recall: measures the proportion of correctly predicted grades out of all actual grades in the set. In this case, when predicting an AA grade, what proportion of all actual AA grades were predicted as AA grades. The procedure is repeated for each individual grade. High recall indicates that the model is good at assigning most of the grades in each category to their real category.


Accuracy: measures the overall correctness of the model's predictions across all grades. It calculates the proportion of correctly predicted grades out of the total number of grades. It provides an overall assessment of the model's performance across every grade category. However, it may not be the most informative metric when dealing with imbalanced datasets, where the number of instances in each class varies significantly.


In [81]:
# Label binarizer fitted on the full target.
binarizer = LabelBinarizer()
binarizer.fit(y)

# Label encoder fitted on the full target.
encoder = LabelEncoder()
encoder.fit(y)

**Train and Test Subsets**

Since the data is imbalanced, the imbalance has to be taken into account when splitting into the train and test sets. A stratified split is needed so that the model can be trained on all possible outcomes – with a distribution comparable to the one expected in unseen data.

In [82]:
# Hold out 20% of the rows as a test set, shuffling and stratifying on the
# grade so both subsets keep a comparable grade distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=1234,
    stratify=data[['Grade']],
)

In [83]:
# Compare the grade distribution of the train and test targets with the project
# helper. Its return value is later offered as a `class_weight` candidate in the
# tree-based grid searches.
# NOTE(review): presumably a {grade: weight} mapping - confirm in functionality.funs.
weights = funs.grades_distribution([y_train, y_test], ['Train Set', 'Test Set'])

## 3. Classification Study

The following list shows the estimators –and their parameters– that are studied to identify the best possible model:

```python
log = LogisticRegression(penalty=None, random_state=6064, solver='saga', max_iter=7500, multi_class='multinomial', n_jobs=-1)
l1 = LogisticRegression(penalty='l1', random_state=6064, solver='saga', max_iter=7500, multi_class='multinomial', n_jobs=-1)
l2 = LogisticRegression(penalty='l2', random_state=6064, solver='sag', max_iter=10500, multi_class='multinomial', n_jobs=-1)
net = LogisticRegression(penalty='elasticnet', random_state=6064, solver='saga', max_iter=10500, multi_class='multinomial', n_jobs=-1, l1_ratio=0.5)
sgd = SGDClassifier(loss='modified_huber', penalty=None, max_iter=7500, n_jobs=-1, random_state=6064)
mlp = MLPClassifier(solver='adam', max_iter=4500, random_state=6064)
dtc = DecisionTreeClassifier(random_state=6064)
rfc = RandomForestClassifier(random_state=6064, n_jobs=1)
etc = ExtraTreeClassifier(random_state=6064)
ets = ExtraTreesClassifier(random_state=6064, n_jobs=1)
abc = AdaBoostClassifier(random_state=6064)
gpc = GaussianProcessClassifier(kernel=RBF(0.05), random_state=6064, n_jobs=1)
gbc = GradientBoostingClassifier(loss='log_loss', random_state=6064)
svc = SVC(kernel=RBF(), probability=True)
```

In [84]:
# Candidate estimators. Kept in sync with the listing in the markdown cell above.

# Logistic regressions: no penalty, l1, l2 and elastic-net.
log = LogisticRegression(penalty=None, random_state=6064, solver='saga', max_iter=7500, multi_class='multinomial', n_jobs=-1)
l1 = LogisticRegression(penalty='l1', random_state=6064, solver='saga', max_iter=7500, multi_class='multinomial', n_jobs=-1)
l2 = LogisticRegression(penalty='l2', random_state=6064, solver='sag', max_iter=10500, multi_class='multinomial', n_jobs=-1)
net = LogisticRegression(penalty='elasticnet', random_state=6064, solver='saga', max_iter=10500, multi_class='multinomial', n_jobs=-1, l1_ratio=0.5)
# Linear model trained with SGD, and a multi-layer perceptron.
sgd = SGDClassifier(loss='modified_huber', penalty=None, max_iter=7500, n_jobs=-1, random_state=6064)
mlp = MLPClassifier(solver='adam', max_iter=4500, random_state=6064)
# Single trees and tree ensembles. Note `etc` is one extra-tree, `ets` the ensemble.
dtc = DecisionTreeClassifier(random_state=6064)
rfc = RandomForestClassifier(random_state=6064, n_jobs=1)
etc = ExtraTreeClassifier(random_state=6064)
ets = ExtraTreesClassifier(random_state=6064, n_jobs=1)
abc = AdaBoostClassifier(random_state=6064)
# Gaussian process with an RBF kernel, and gradient boosting.
gpc = GaussianProcessClassifier(kernel=RBF(0.05), random_state=6064, n_jobs=1)
gbc = GradientBoostingClassifier(loss='log_loss', random_state=6064)
# NOTE(review): this is the only estimator without a `random_state`; with
# probability=True its probability calibration is therefore not seeded here.
svc = SVC(kernel=RBF(), probability=True)

In [85]:
# Collects one validation-score table per experiment (sections 3.1-3.4),
# in the order the experiments are run.
validation = []

In [86]:
# Estimators evaluated in sections 3.1-3.3.
# NOTE(review): `abc` comes before `ets` here, whereas the `best_estimators`
# list built after the grid searches has `ets` before `abc` - confirm the
# helpers do not rely on position.
estimators = [log, l1, l2, net, sgd, mlp, dtc, rfc, etc, abc, ets, gpc, gbc, svc]

### 3.1 Standard Estimators

In [87]:
# Score the default estimators on the training split with the project helper,
# using the `metrics` scorers and the `sss` splitter; returns a train table and
# a validation table.
train, validate = funs.cv_models_performance(estimators, Transformer, X_train, y_train, metrics, sss)

In [88]:
# Plot train against validation scores for the default estimators.
funs.performance_plotter(train, validate, 'Validation', color)

In [89]:
# Store the validation table as entry 1 of 4 for section 3.5.
# NOTE: not idempotent - re-running this cell appends a duplicate entry.
validation.append(validate)

In [90]:
# Display the validation table with the index hidden.
validate.style.hide(axis='index')

Model,Accuracy,Recall weighted,Precision weighted,AUC
logisticregression,0.25,0.2292,0.1889,0.6257
logisticregression_l1,0.3333,0.2917,0.2236,0.7147
logisticregression_l2,0.25,0.1944,0.1565,0.6443
logisticregression_elasticnet,0.3056,0.2639,0.2108,0.6838
sgd,0.2778,0.25,0.1935,0.5518
mlp,0.3611,0.3125,0.2413,0.6359
decisiontree,0.1667,0.1319,0.1035,0.5035
randomforest,0.4167,0.3472,0.2917,0.6957
extratree,0.1389,0.1528,0.1215,0.5126
adaboost,0.25,0.1597,0.0846,0.6077


### 3.2 Standard Estimators & Feature Selection with Variance Threshold of 0.10

In [91]:
# Same evaluation as section 3.1, additionally passing variance_threshold=0.10
# to the helper.
train, validate = funs.cv_models_performance(estimators, Transformer, X_train, y_train, metrics, sss, variance_threshold=0.10)

In [92]:
# Plot train against validation scores for the 0.10 variance-threshold run.
funs.performance_plotter(train, validate, 'Validation', color)

In [93]:
# Store the validation table as entry 2 of 4 for section 3.5.
# NOTE: not idempotent - re-running this cell appends a duplicate entry.
validation.append(validate)

In [94]:
# Display the validation table with the index hidden.
validate.style.hide(axis='index')

Model,Accuracy,Recall weighted,Precision weighted,AUC
logisticregression,0.2778,0.2431,0.2146,0.6324
logisticregression_l1,0.3333,0.2847,0.2299,0.7169
logisticregression_l2,0.2778,0.2222,0.1993,0.6379
logisticregression_elasticnet,0.2778,0.2014,0.1653,0.6736
sgd,0.25,0.1806,0.1944,0.5724
mlp,0.25,0.25,0.1562,0.6003
decisiontree,0.25,0.2847,0.2118,0.5862
randomforest,0.25,0.2222,0.1601,0.6602
extratree,0.1944,0.1389,0.116,0.5101
adaboost,0.1944,0.2014,0.1685,0.605


### 3.3 Standard Estimators & Feature Selection with Variance Threshold of 0.20

In [95]:
# Same evaluation as section 3.1, additionally passing variance_threshold=0.2
# to the helper.
train, validate = funs.cv_models_performance(estimators, Transformer, X_train, y_train, metrics, sss, variance_threshold=0.2)

In [96]:
# Plot train against validation scores for the 0.20 variance-threshold run.
funs.performance_plotter(train, validate, 'Validation', color)

In [97]:
# Store the validation table as entry 3 of 4 for section 3.5.
# NOTE: not idempotent - re-running this cell appends a duplicate entry.
validation.append(validate)

In [98]:
# Display the validation table with the index hidden.
validate.style.hide(axis='index')

Model,Accuracy,Recall weighted,Precision weighted,AUC
logisticregression,0.4167,0.3542,0.3542,0.7004
logisticregression_l1,0.3056,0.25,0.2192,0.6772
logisticregression_l2,0.2778,0.2778,0.2132,0.6757
logisticregression_elasticnet,0.25,0.2153,0.1736,0.7034
sgd,0.2222,0.1944,0.1354,0.582
mlp,0.3333,0.2986,0.1833,0.6515
decisiontree,0.25,0.1806,0.1528,0.534
randomforest,0.2778,0.25,0.1632,0.6749
extratree,0.3056,0.2847,0.1819,0.5918
adaboost,0.2222,0.1944,0.1403,0.7377


### 3.4. Grid Search for Best Estimators

In [99]:
# Create a pipeline with data transformation and variance threshold.
# When this pipeline is nested inside another make_pipeline(...) below it is
# addressed as 'pipeline', hence the grid key
# 'pipeline__variancethreshold__threshold'.
preprocessor = make_pipeline(Transformer, VarianceThreshold())

In [100]:
# Grid search for the unpenalised logistic regression. Skipped when
# 'logisticregression.joblib' is already present in the best-estimators folder.
if 'logisticregression.joblib' not in os.listdir('../working/best_estimators'):

    log_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, log),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            'logisticregression__multi_class':['multinomial','ovr']},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        return_train_score=True)

    _ = log_cv.fit(X_train, y_train)

In [101]:
# Grid search for the l1-penalised logistic regression. Skipped when
# 'logisticregression_l1.joblib' is already present in the best-estimators folder.
if 'logisticregression_l1.joblib' not in os.listdir('../working/best_estimators'):

    l1_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, l1),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            # Inverse regularisation strength.
            'logisticregression__C':[0.1,0.5,1,5,10,50,100],
            'logisticregression__multi_class':['multinomial','ovr']},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        return_train_score=True)

    _ = l1_cv.fit(X_train, y_train)

In [102]:
# Grid search for the l2-penalised logistic regression. Skipped when
# 'logisticregression_l2.joblib' is already present in the best-estimators folder.
if 'logisticregression_l2.joblib' not in os.listdir('../working/best_estimators'):

    l2_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, l2),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            # Inverse regularisation strength.
            'logisticregression__C':[0.1,0.5,1,5,10,50,100],
            'logisticregression__multi_class':['multinomial','ovr']},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        return_train_score=True)

    _ = l2_cv.fit(X_train, y_train)

In [103]:
# Grid search for the elastic-net logistic regression. Skipped when
# 'logisticregression_elasticnet.joblib' is already present in the
# best-estimators folder.
if 'logisticregression_elasticnet.joblib' not in os.listdir('../working/best_estimators'):

    net_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, net),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            # Inverse regularisation strength.
            'logisticregression__C':[0.1,0.5,1,5,10,50,100],
            # Ten mixing ratios from 0.1 to 1.0. linspace pins the endpoint to
            # exactly 1.0, whereas a float-stepped arange can drift past the
            # upper bound that l1_ratio must respect.
            'logisticregression__l1_ratio':np.linspace(0.1, 1.0, 10),
            'logisticregression__multi_class':['multinomial','ovr']},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        return_train_score=True)

    _ = net_cv.fit(X_train, y_train)

In [104]:
# Grid search for the SGD classifier. Skipped when 'sgd_l1.joblib' is already
# present in the best-estimators folder.
# NOTE(review): the guard looks for the '_l1' file name specifically while the
# grid explores several penalties - confirm this matches the name written by
# funs.save_best_estimators.
if 'sgd_l1.joblib' not in os.listdir('../working/best_estimators'):

    sgd_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, sgd),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            'sgdclassifier__loss':['log_loss','modified_huber'],
            'sgdclassifier__penalty':['l2', 'l1', 'elasticnet', None],
            # Regularisation strengths starting at 0.0001 in steps of 0.025.
            'sgdclassifier__alpha':np.arange(0.0001,0.11,0.025)},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        return_train_score=True)

    _ = sgd_cv.fit(X_train, y_train)

In [105]:
# Grid search for the multi-layer perceptron. Skipped when 'mlp.joblib' is
# already present in the best-estimators folder.
if 'mlp.joblib' not in os.listdir('../working/best_estimators'):

    mlp_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, mlp),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            # A single hidden layer of varying width.
            'mlpclassifier__hidden_layer_sizes':[(50,), (100,), (150,), (200,)],
            'mlpclassifier__activation':['identity', 'logistic', 'tanh', 'relu'],
            'mlpclassifier__alpha':[0.1, 0.05, 0.01, 0.001]},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        return_train_score=True)

    _ = mlp_cv.fit(X_train, y_train)

In [106]:
# Grid search for the decision tree. Skipped when 'decisiontree.joblib' is
# already present in the best-estimators folder.
if 'decisiontree.joblib' not in os.listdir('../working/best_estimators'):

    dtc_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, dtc),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.1, 0.225,0.025),
            'decisiontreeclassifier__criterion':['gini','entropy','log_loss'],
            'decisiontreeclassifier__max_depth':np.arange(5, 11),
            'decisiontreeclassifier__min_samples_split':np.arange(2, 5),
            'decisiontreeclassifier__min_samples_leaf':np.arange(1, 5),
            # Either unweighted classes or the weights from funs.grades_distribution.
            'decisiontreeclassifier__class_weight':[None, weights],
            'decisiontreeclassifier__ccp_alpha':np.arange(0.005, 0.035, 0.005)},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        return_train_score=True)

    _ = dtc_cv.fit(X_train, y_train)

In [107]:
# Grid search for the random forest. Skipped when 'randomforest.joblib' is
# already present in the best-estimators folder.
if 'randomforest.joblib' not in os.listdir('../working/best_estimators'):

    rfc_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, rfc),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.1, 0.225,0.025),
            'randomforestclassifier__criterion':['gini','entropy','log_loss'],
            'randomforestclassifier__max_depth':np.arange(5, 11),
            'randomforestclassifier__min_samples_split':np.arange(2, 5),
            'randomforestclassifier__min_samples_leaf':np.arange(1, 5),
            # Either unweighted classes or the weights from funs.grades_distribution.
            'randomforestclassifier__class_weight':[None, weights],
            'randomforestclassifier__ccp_alpha':np.arange(0.005, 0.035, 0.005)},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        return_train_score=True)

    _ = rfc_cv.fit(X_train, y_train)

In [108]:
# Grid search for the single extra-tree classifier. Skipped when
# 'extratree.joblib' is already present in the best-estimators folder.
if 'extratree.joblib' not in os.listdir('../working/best_estimators'):

    etc_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, etc),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.1, 0.225,0.025),
            'extratreeclassifier__criterion':['gini','entropy','log_loss'],
            'extratreeclassifier__max_depth':np.arange(5, 11),
            'extratreeclassifier__min_samples_split':np.arange(2, 5),
            'extratreeclassifier__min_samples_leaf':np.arange(1, 5),
            # Either unweighted classes or the weights from funs.grades_distribution.
            'extratreeclassifier__class_weight':[None, weights],
            'extratreeclassifier__ccp_alpha':np.arange(0.005, 0.035, 0.005)},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        # Train scores are not collected for this search.
        return_train_score=False)

    _ = etc_cv.fit(X_train, y_train)

In [109]:
# Grid search for the extra-trees ensemble. Skipped when 'extratrees.joblib' is
# already present in the best-estimators folder.
if 'extratrees.joblib' not in os.listdir('../working/best_estimators'):

    ets_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, ets),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.1, 0.225,0.025),
            'extratreesclassifier__criterion':['gini','entropy','log_loss'],
            'extratreesclassifier__n_estimators':np.arange(5, 11),
            'extratreesclassifier__max_depth':np.arange(5, 11),
            'extratreesclassifier__min_samples_split':np.arange(2, 5),
            'extratreesclassifier__min_samples_leaf':np.arange(1, 5),
            # Either unweighted classes or the weights from funs.grades_distribution.
            'extratreesclassifier__class_weight':[None, weights],
            'extratreesclassifier__ccp_alpha':np.arange(0.005, 0.035, 0.005)},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        # Train scores are not collected for this search.
        return_train_score=False)

    _ = ets_cv.fit(X_train, y_train)

In [110]:
# Grid search for AdaBoost. Skipped when 'adaboost.joblib' is already present
# in the best-estimators folder.
# NOTE(review): the base-learner candidates are read from dtc_cv, rfc_cv, etc_cv
# and ets_cv, which only exist when their own search cells ran in this kernel
# session. If those searches were skipped because their files are cached while
# 'adaboost.joblib' is missing, this cell raises NameError.
if 'adaboost.joblib' not in os.listdir('../working/best_estimators'):

    abc_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, abc),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.1, 0.225,0.025),
            # steps[1][1] is the classifier step of each tuned
            # (preprocessor, classifier) pipeline.
            'adaboostclassifier__estimator':[dtc_cv.best_estimator_.steps[1][1],
                                             rfc_cv.best_estimator_.steps[1][1],
                                             etc_cv.best_estimator_.steps[1][1],
                                             ets_cv.best_estimator_.steps[1][1]],
            'adaboostclassifier__n_estimators':np.arange(10, 110, 10),
            'adaboostclassifier__learning_rate':np.arange(0.1,1.1,0.1)},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        return_train_score=True)

    _ = abc_cv.fit(X_train, y_train)

In [111]:
# Grid search for the Gaussian process classifier. Skipped when
# 'gaussianprocess.joblib' is already present in the best-estimators folder.
if 'gaussianprocess.joblib' not in os.listdir('../working/best_estimators'):

    gpc_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, gpc),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            # RBF kernels with different initial length scales.
            'gaussianprocessclassifier__kernel':[RBF(0.001), RBF(0.005), RBF(0.01), RBF(0.05)]},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        # Train scores are not collected for this search.
        return_train_score=False)

    _ = gpc_cv.fit(X_train, y_train)

In [112]:
# Randomised search for gradient boosting (30 sampled candidates rather than
# the full grid). Skipped when 'gradientboosting.joblib' is already present in
# the best-estimators folder.
if 'gradientboosting.joblib' not in os.listdir('../working/best_estimators'):

    gbc_cv = RandomizedSearchCV(
        estimator=make_pipeline(preprocessor, gbc),
        param_distributions={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25, 0.025),
            'gradientboostingclassifier__learning_rate':[0.01, 0.05, 0.1, 0.5, 1, 5],
            'gradientboostingclassifier__n_estimators':[8,9,10,11,12,13,14,15],
            'gradientboostingclassifier__min_samples_split':np.arange(2,6),
            'gradientboostingclassifier__min_samples_leaf':np.arange(2,6),
            'gradientboostingclassifier__ccp_alpha':np.arange(0.005, 0.035, 0.005)},
        n_iter=30,
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        # Fixes which candidates are sampled.
        random_state=9597)

    _ = gbc_cv.fit(X_train, y_train)

In [113]:
# Grid search for the support vector classifier. Skipped when 'svc.joblib' is
# already present in the best-estimators folder.
if 'svc.joblib' not in os.listdir('../working/best_estimators'):

    svc_cv = GridSearchCV(
        estimator=make_pipeline(preprocessor, svc),
        param_grid={
            # Cut-off of the VarianceThreshold step nested in `preprocessor`.
            'pipeline__variancethreshold__threshold':np.arange(0.05, 0.25,0.025),
            'svc__C':np.arange(1,11,1),
            # Replaces the RBF() kernel object the base `svc` was created with.
            'svc__kernel':['rbf','sigmoid']},
        scoring=metrics,
        n_jobs=-1,
        # With several scorers, the best candidate is chosen and refitted on accuracy.
        refit='accuracy',
        cv=sss,
        # Train scores are not collected for this search.
        return_train_score=False)

    _ = svc_cv.fit(X_train, y_train)

In [114]:
# When the best_estimators folder holds no files, collect the tuned pipelines
# from the searches above and persist them; otherwise load whatever is stored.
# NOTE(review): this guard tests for an empty folder, while each search cell
# tests for its own file. With a partially filled folder the searches that ran
# are ignored and the stored files are loaded instead - confirm this is intended.
if not os.listdir(os.path.join(dirname, 'working/best_estimators')):

    # Best estimator of every search, in a fixed order.
    best_estimators = [
        search.best_estimator_
        for search in (
            log_cv, l1_cv, l2_cv, net_cv,
            sgd_cv, mlp_cv,
            dtc_cv, rfc_cv, etc_cv, ets_cv, abc_cv,
            gpc_cv, gbc_cv, svc_cv,
        )
    ]

    # Save the best estimators to folder.
    funs.save_best_estimators(best_estimators)

else:
    # Load the best estimators from folder.
    best_estimators = funs.load_best_estimators()

In [115]:
# Score the tuned estimators with the same scorers and splitter as before;
# best=True is passed to the helper for this already-tuned set.
train, validate = funs.cv_models_performance(best_estimators, Transformer, X_train, y_train, metrics, sss, best=True)

In [116]:
# Plot train against validation scores for the tuned estimators.
funs.performance_plotter(train, validate, 'Validation', color)

In [117]:
# Store the validation table as entry 4 of 4 for section 3.5.
# NOTE: not idempotent - re-running this cell appends a duplicate entry.
validation.append(validate)

In [118]:
# Display the validation table with the index hidden.
validate.style.hide(axis='index')

Model,Accuracy,Recall weighted,Precision weighted,AUC
logisticregression,0.4167,0.3542,0.3542,0.7004
logisticregression_l1,0.4444,0.375,0.3611,0.6854
logisticregression_l2,0.3889,0.3194,0.2674,0.6997
logisticregression_elasticnet,0.4444,0.375,0.3611,0.6914
sgd_l1,0.4444,0.3403,0.2297,0.7607
mlp,0.4167,0.3958,0.2764,0.6782
decisiontree,0.4444,0.4097,0.3153,0.6713
randomforest,0.4722,0.3958,0.3139,0.7108
extratree,0.4444,0.4097,0.2917,0.7233
adaboost,0.4722,0.4444,0.3083,0.7772


### 3.5 Comparison Between Procedures

In [119]:
# Labels for the four validation tables, in the order they were appended to
# `validation` (sections 3.1, 3.2, 3.3 and 3.4).
names = ['Standard Estimator',
         'Variance T (0.1)',
         'Variance T (0.2)',
         'Best Estimator']

In [120]:
# Compare the 'Accuracy' column across the four procedures.
funs.comparison_plotter('Accuracy', validation, names, color)

In [121]:
# Compare the 'Precision weighted' column across the four procedures.
# NOTE(review): the scorers in `metrics` use average='macro' while this column
# is labelled 'weighted' - confirm the label in functionality.funs.
funs.comparison_plotter('Precision weighted', validation, names, color)

In [122]:
# Compare the 'Recall weighted' column across the four procedures.
# NOTE(review): the scorers in `metrics` use average='macro' while this column
# is labelled 'weighted' - confirm the label in functionality.funs.
funs.comparison_plotter('Recall weighted', validation, names, color)

In [123]:
# Compare the 'AUC' column across the four procedures.
funs.comparison_plotter('AUC', validation, names, color)

## 4. Test Set Performance

In [124]:
# Evaluate the tuned estimators on the train split and on the held-out test
# split with the project helper; returns one score table per split.
train, test = funs.models_performance_train_test(best_estimators, Transformer, X_train, y_train, X_test, y_test, classes, best=True)

In [125]:
# Plot train against test scores for the tuned estimators.
funs.performance_plotter(train, test, 'Test', color)

In [126]:
# Display the test-set table with the index hidden.
test.style.hide(axis='index')

Model,Accuracy,Recall weighted,Precision weighted,AUC
logisticregression,0.3103,0.3103,0.3163,0.7217
logisticregression_l1,0.2759,0.2759,0.4224,0.7344
logisticregression_l2,0.3448,0.3448,0.4054,0.7207
logisticregression_elasticnet,0.3448,0.3448,0.4054,0.7378
sgd_l1,0.2069,0.2069,0.115,0.6287
mlp,0.1379,0.1379,0.1336,0.6278
decisiontree,0.2069,0.2069,0.181,0.4861
randomforest,0.3103,0.3103,0.25,0.7159
extratree,0.1724,0.1724,0.2322,0.5661
adaboost,0.1724,0.1724,0.1379,0.5381


## 5 Best Classifiers for Grades Prediction

### 5.1.a Best Logistic Regression –l2– Classifier

In [127]:
# Option 1: Load the model from best_estimators using joblib.
# NOTE(review): the variable is named `L1`, but the file loaded is the
# l2-penalised logistic regression - confirm which model is intended.
L1 = joblib.load('../working/best_estimators/logisticregression_l2.joblib')

<br>

Overall Test Performance Report

In [129]:
# Train/test summary from the project helper. This is funs.classification_report,
# not the sklearn.metrics function of the same name imported at the top.
funs.classification_report(L1, y_train, X_train, y_test, X_test, classes, roc_plot=False)


		 TRAIN 	 TEST

Accuracy: 	 0.929 	 0.345
Recall: 	 0.929 	 0.345
Precision: 	 0.931 	 0.405

AUC: 		 0.995 	 0.721


<br>
Test Set Classification Report

In [130]:
# Per-grade precision / recall / F1 on the test set (scikit-learn report).
# zero_division=0 reports 0 for grades that are never predicted.
print(classification_report(y_test, L1.predict(X_test), zero_division=0))

              precision    recall  f1-score   support

          AA       1.00      0.33      0.50         3
          BA       1.00      0.67      0.80         3
          BB       0.67      0.67      0.67         3
          CB       0.00      0.00      0.00         2
          CC       0.27      0.75      0.40         4
          DC       0.33      0.20      0.25         5
          DD       0.14      0.14      0.14         7
        Fail       0.00      0.00      0.00         2

    accuracy                           0.34        29
   macro avg       0.43      0.34      0.34        29
weighted avg       0.41      0.34      0.34        29



In [131]:
# Confusion matrices for the logistic model, given the (train, test) pairs.
funs.confusion_matrix_plot(L1, (X_train, X_test), (y_train, y_test))

In [132]:
# ROC/AUC plot for the logistic model; the fitted label binarizer is passed to
# the helper along with the (train, test) pairs.
funs.roc_auc_plot(L1, (X_train, X_test), (y_train, y_test), binarizer)

In [164]:
# Create logodds plot by attribute and grade.
# 'logisticregression' is presumably the name of the classifier step inside the
# loaded pipeline - confirm in functionality.funs.
logodds = funs.linear_coefficients(L1, 'logisticregression')

In [169]:
# Create probabilities plot by attribute and grade.
# Same helper as for the log-odds, called with proba=True.
probabilities = funs.linear_coefficients(L1, 'logisticregression', proba=True)

In [186]:
# Prepare the DataFrame for plotting: reshape to long format with one row per
# (Variable, Grades) pair.
# NOTE(review): `df` is not referenced by any later cell in this notebook.
df = probabilities.melt(id_vars='Variable', var_name='Grades', value_name='Coefficient')

In [228]:
# Plot the probabilities per grade; every column except the last is passed as
# the set of grade columns.
funs.probabilities_by_grade(probabilities, probabilities.columns[0:-1])

### 5.1.b Best Random Forest Classifier

In [None]:
# Option 1: Load the model from best_estimators using joblib.
RFC = joblib.load('../working/best_estimators/randomforest.joblib')

In [None]:
# Refit the loaded random-forest model on the training split.
_ = RFC.fit(X_train, y_train)

<br>

Overall Test Performance Report

In [None]:
# Train/test summary from the project helper. This is funs.classification_report,
# not the sklearn.metrics function of the same name imported at the top.
funs.classification_report(RFC, y_train, X_train, y_test, X_test, classes, roc_plot=False)


		 TRAIN 	 TEST

Accuracy: 	 1.000 	 0.310
Recall: 	 1.000 	 0.310
Precision: 	 1.000 	 0.250

AUC: 		 1.000 	 0.716


<br> 

Test Set Classification Report

In [None]:
# Per-grade precision / recall / F1 on the test set (scikit-learn report).
# zero_division=0 reports 0 for grades that are never predicted.
print(classification_report(y_test, RFC.predict(X_test), zero_division=0))

              precision    recall  f1-score   support

          AA       1.00      0.67      0.80         3
          BA       0.00      0.00      0.00         3
          BB       0.00      0.00      0.00         3
          CB       0.00      0.00      0.00         2
          CC       0.17      0.25      0.20         4
          DC       0.25      0.20      0.22         5
          DD       0.33      0.71      0.45         7
        Fail       0.00      0.00      0.00         2

    accuracy                           0.31        29
   macro avg       0.22      0.23      0.21        29
weighted avg       0.25      0.31      0.26        29



In [None]:
# Confusion matrices for the random forest, given the (train, test) pairs.
funs.confusion_matrix_plot(RFC, (X_train, X_test), (y_train, y_test))

In [None]:
# ROC/AUC plot for the random forest; the fitted label binarizer is passed to
# the helper along with the (train, test) pairs.
funs.roc_auc_plot(RFC, (X_train, X_test), (y_train, y_test), binarizer)

In [None]:
# Create features importance plot by attribute.
# 'randomforestclassifier' is presumably the name of the classifier step inside
# the loaded pipeline - confirm in functionality.funs.
features = funs.tree_importance(RFC, 'randomforestclassifier')

## CONCLUSION

In conclusion, the evaluation of various models reveals their performance on the classification task. The results demonstrate the impact of feature selection and hyperparameter optimization on model performance. The best-performing model on the test set, the Logistic Regression with l2 penalization, shows promising results in terms of accuracy, recall, precision, and AUC in comparison to the other classifiers.

Nonetheless, the performance of this model is still poor, given that the tuning process is carried out for just one model that is applied to every grade. If one model per grade were developed and fine-tuned, better classification performance could be achieved.
