[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/saschaschworm/big-data-and-data-science/blob/master/notebooks/development-exercises/exam-performance-logistic-regression.ipynb)

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the exam-performance dataset directly from the course repository.
DATA_URL = ('https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/'
            'master/datasets/exercises/exam-performance.csv')
data = pd.read_csv(DATA_URL)

# Features are the two hour columns; the target is the `passed` column.
X, y = data[['hours_studied', 'hours_slept']], data['passed']

# Logistic regression fitted with stochastic gradient descent.
# - 'log_loss' is the current name of the logistic loss; the old alias 'log'
#   was deprecated in scikit-learn 1.1 and removed in 1.3.
# - penalty=None disables regularisation; the string 'none' is rejected by
#   parameter validation in recent scikit-learn releases.
# NOTE(review): learning_rate is left at its default ('optimal'), for which
# eta0 is documented as unused -- confirm whether a 'constant' or 'adaptive'
# schedule was intended.
hyperparams = {
    'loss': 'log_loss',
    'penalty': None,
    'alpha': 0.0001,
    'max_iter': 1000,
    'tol': 1e-3,
    'random_state': 1909,
    'eta0': 0.0001,
}

model = SGDClassifier(**hyperparams)

In [3]:
# Columns that are rescaled before they reach the classifier.
numeric_features = ['hours_studied', 'hours_slept']

# Sub-pipeline applied to the numeric columns: min-max scaling only.
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

# Route the numeric columns through the scaling sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[('numeric_transformer', numeric_transformer, numeric_features)],
)

# Full workflow: preprocessing followed by the classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# fit() returns the fitted pipeline itself, so rebinding keeps the same name
# and avoids echoing the estimator repr as cell output.
pipeline = pipeline.fit(X, y)

In [4]:
# A single observation to score: 4 hours studied, 10 hours slept.
prediction_set = pd.DataFrame([{'hours_studied': 4, 'hours_slept': 10}])

# Predicted class label and the per-class probability estimates for that row.
prediction = pipeline.predict(prediction_set)
prediction_proba = pipeline.predict_proba(prediction_set)

In [5]:
# Show the predicted label for the first (only) row together with the value in
# column 1 of predict_proba, scaled to a percentage with two decimals.
# NOTE(review): column 1 is presumably the probability of class 1 ("passed");
# confirm against pipeline.classes_.
f'Prediction: {prediction[0]}, Probability Estimate: {prediction_proba[0][1] * 100:.2f}%'

'Prediction: 1, Probability Estimate: 90.10%'