[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/saschaschworm/big-data-and-data-science/blob/master/notebooks/demos/exam-performance-logistic-regression.ipynb)

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the exam-performance demo dataset and split it into the two predictor
# columns and the `passed` target column.
data = pd.read_csv('https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/demos/exam-performance.csv')
X, y = data[['hours_studied', 'hours_slept']], data['passed']

# Logistic regression without regularisation, trained by stochastic gradient descent.
# - 'log_loss' replaces the former 'log' alias, which was deprecated in
#   scikit-learn 1.1 and removed in 1.3.
# - penalty=None replaces the string 'none', which current scikit-learn
#   releases reject during parameter validation.
# NOTE(review): eta0 looks like it is only consulted by the non-default
# learning_rate schedules -- confirm against the SGDClassifier docs.
hyperparams = {'loss': 'log_loss', 'penalty': None, 'alpha': 0.0001, 'max_iter': 1000,
               'tol': 1e-3, 'random_state': 1909, 'eta0': 0.0001}

model = SGDClassifier(**hyperparams)

In [3]:
# Columns that are rescaled before they reach the classifier.
numeric_features = ['hours_studied', 'hours_slept']

# Single-step sub-pipeline: min-max scaling of the numeric columns.
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

# Route the numeric columns through the scaling sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[('numeric_transformer', numeric_transformer, numeric_features)],
)

# Chain preprocessing and the SGD model, then fit on the full dataset.
# `fit` returns the pipeline itself, so `pipeline` ends up bound to the
# fitted estimator and the cell produces no output.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ],
).fit(X, y)

In [4]:
# Metrics evaluated on every fold; each one yields a `train_<metric>` and a
# `test_<metric>` entry in the result because train scores are requested.
scoring = ['accuracy', 'recall', 'precision', 'f1']

# 10-fold cross-validation of the complete pipeline.
res_cv = cross_validate(
    pipeline,
    X,
    y,
    scoring=scoring,
    cv=10,
    return_train_score=True,
)

In [5]:
# Mean accuracy across the cross-validation folds, expressed in percent.
res_acc_tr = np.mean(res_cv['train_accuracy']) * 100
res_acc_te = np.mean(res_cv['test_accuracy']) * 100
# Fixed the misspelling "Accurarcy" in the displayed summary.
f'Average Accuracy on Training and Test Set: {res_acc_tr:.2f}%/{res_acc_te:.2f}%'

'Average Accuracy on Training and Test Set: 85.57%/85.45%'

In [6]:
# Mean recall across the cross-validation folds, expressed in percent.
res_rec_tr = 100 * res_cv['train_recall'].mean()
res_rec_te = 100 * res_cv['test_recall'].mean()
f'Average Recall on Training and Test Set: {res_rec_tr:.2f}%/{res_rec_te:.2f}%'

'Average Recall on Training and Test Set: 92.93%/90.33%'

In [7]:
# Mean precision across the cross-validation folds, expressed in percent.
res_prec_tr = 100 * res_cv['train_precision'].mean()
res_prec_te = 100 * res_cv['test_precision'].mean()
f'Average Precision on Training and Test Set: {res_prec_tr:.2f}%/{res_prec_te:.2f}%'

'Average Precision on Training and Test Set: 84.16%/87.62%'

In [8]:
# Mean F1 score across the cross-validation folds, expressed in percent.
res_f1_tr = 100 * res_cv['train_f1'].mean()
res_f1_te = 100 * res_cv['test_f1'].mean()
f'Average F1 on Training and Test Set: {res_f1_tr:.2f}%/{res_f1_te:.2f}%'

'Average F1 on Training and Test Set: 87.67%/86.59%'