<a href="https://colab.research.google.com/github/rgumi/dataScience/blob/master/exam_performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [0]:
# Load the exam-performance demo dataset and separate the two feature columns
# from the `passed` target column.
data = pd.read_csv('https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/master/datasets/demos/exam-performance.csv')
X, y = data[['hours_studied', 'hours_slept']], data['passed']

# Logistic regression fitted with SGD and no regularisation penalty.
# 'log_loss' and None replace the legacy 'log' / 'none' spellings: the 'log'
# alias was removed from scikit-learn in 1.3, and current releases expect
# penalty=None rather than the string 'none'.
# NOTE(review): learning_rate is left at its default; per the SGDClassifier
# docs eta0 is not used by that default schedule - confirm whether a
# 'constant'/'adaptive' schedule was intended.
hyperparams = {'loss': 'log_loss', 'penalty': None, 'alpha': 0.0001, 'max_iter': 1000,
               'tol': 1e-3, 'random_state': 1909, 'eta0': 0.0001}

model = SGDClassifier(**hyperparams)

In [0]:
# Columns that are rescaled with min-max scaling before reaching the model.
numeric_features = ['hours_studied', 'hours_slept']
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

# Apply the numeric branch to the columns named above.
preprocessor = ColumnTransformer(
    transformers=[('n_transformer', numeric_transformer, numeric_features)],
)

# Bundle preprocessing and classifier into one estimator and fit it on the
# full dataset; fit() returns the fitted pipeline, which is what gets bound.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ],
).fit(X, y)

In [61]:
# Score one hypothetical student. The label is built from the same values that
# go into the frame, so the text cannot drift from the data being scored
# (the previous label had "studied" and "slept" swapped relative to the frame).
hours_studied, hours_slept = 4, 10
predict_set = pd.DataFrame({'hours_studied': [hours_studied], 'hours_slept': [hours_slept]})
# predict_proba yields one row per sample with one probability per class.
prediction = pipeline.predict_proba(predict_set)
f'Prediction for a {hours_studied} hours studied and {hours_slept} hours slept: {prediction}'

'Prediction for a 4 hours studied and 10 hours slept: [[0.09896779 0.90103221]]'

In [0]:
# Metrics collected during cross-validation; with return_train_score=True the
# result holds a `train_<metric>` and a `test_<metric>` array for each of them.
scoring = ['accuracy', 'recall', 'precision', 'f1']
res_cv = cross_validate(
    pipeline,
    X,
    y,
    scoring=scoring,
    cv=10,
    return_train_score=True,
)

In [63]:
def report_cv_metric(label, metric, cv_results):
    """Print and return the mean train/test score of one metric, in percent.

    Args:
        label: Human-readable metric name used in the printed line.
        metric: Scorer name; `train_<metric>` and `test_<metric>` are looked
            up in `cv_results`.
        cv_results: Mapping of per-fold score arrays.

    Returns:
        Tuple `(train_pct, test_pct)` of the fold means multiplied by 100.
    """
    train_pct = np.mean(cv_results[f'train_{metric}']) * 100
    test_pct = np.mean(cv_results[f'test_{metric}']) * 100
    print(f'Average {label} on Training and Test Set: {train_pct:.2f}%/{test_pct:.2f}%')
    return train_pct, test_pct


# One summary line per metric; the res_* names are kept so anything that reads
# them afterwards still works. The first label previously read "Accurarcy".
res_acc_tr, res_acc_te = report_cv_metric('Accuracy', 'accuracy', res_cv)
res_rec_tr, res_rec_te = report_cv_metric('Recall', 'recall', res_cv)
res_prec_tr, res_prec_te = report_cv_metric('Precision', 'precision', res_cv)
res_f1_tr, res_f1_te = report_cv_metric('F1', 'f1', res_cv)

Average Accurarcy on Training and Test Set: 85.57%/85.45%
Average Recall on Training and Test Set: 92.93%/90.33%
Average Precision on Training and Test Set: 84.16%/87.62%
Average F1 on Training and Test Set: 87.67%/86.59%
