In [4]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

data = pd.read_csv('../data/dataset.csv')
data.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [6]:
with open('../data/X_train.pkl', 'rb') as f1, open('../data/X_test.pkl', 'rb') as f2, open('../data/y_train.pkl', 'rb') as f3, open('../data/y_test.pkl', 'rb') as f4:
    X_train, X_test, y_train, y_test = pd.read_pickle(f1), pd.read_pickle(f2), pd.read_pickle(f3), pd.read_pickle(f4)

Logistic Regression Baseline Model

In [16]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear', random_state=14)
logreg.fit(X_train, y_train)

y_pred_test = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_test)

print(f"Accuracy: {accuracy}")

Accuracy: 0.7502824858757062


Hyperparameter Tuning with GridSearchCV

In [24]:
from sklearn.model_selection import GridSearchCV


parameters = [{'penalty':['l1','l2']}, 
              {'C':[1, 10, 100, 1000]}]



grid_search = GridSearchCV(estimator = logreg,  
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5,
                           verbose=0)


grid_search.fit(X_train, y_train)

final_predictions = grid_search.predict(X_test)

print(f'GridSearch CV best score : {grid_search.best_score_}\n')

print('Parameters that give the best results :','\n', (grid_search.best_params_))

print('\nEstimator that was chosen by the search :','\n', (grid_search.best_estimator_))

GridSearch CV best score : 0.7677326812584406

Parameters that give the best results : 
 {'penalty': 'l1'}

Estimator that was chosen by the search : 
 LogisticRegression(penalty='l1', random_state=14, solver='liblinear')


In [23]:
print(f'GridSearch CV accuracy: {grid_search.score(X_test, y_test)}')

GridSearch CV accuracy: 0.751412429378531


In [25]:
with open('label_encoder.pkl', 'rb') as file:
    label_encoder = pd.read_pickle(file)

print("\nClassification Report:\n", classification_report(y_test, final_predictions, target_names=label_encoder.classes_))


Classification Report:
               precision    recall  f1-score   support

     Dropout       0.80      0.78      0.79       316
    Enrolled       0.49      0.20      0.28       151
    Graduate       0.75      0.93      0.83       418

    accuracy                           0.75       885
   macro avg       0.68      0.64      0.64       885
weighted avg       0.73      0.75      0.72       885

