# Machine Learning in Python

by [Piotr Migdał](http://p.migdal.pl/) & Dominik Krzemiński

for El Passion, 2017

## 6. Logistic Regression

We use the same dataset as in the previous parts: https://archive.ics.uci.edu/ml/datasets/Student+Performance

* [Logistic regression](https://en.wikipedia.org/wiki/Logistic_regression)
* [sklearn.linear_model.LogisticRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)


In [None]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [None]:
# Logistic (sigmoid) curve: 1 / (1 + e^(-x)) evaluated on 100 evenly spaced
# points of the interval [-5, 5].
logit = np.linspace(start=-5, stop=5, num=100)
logistic = 1 / (np.exp(-logit) + 1)
plt.plot(logit, logistic)

In [None]:
# Load the students table from a CSV file; the path is relative to the working directory.
students = pd.read_csv("data/students_cleaner.csv")

In [None]:
# Binary target: a student has a "good grade" when the total of the three
# grade columns (G1 + G2 + G3) is above the mean of that total.
# skipna=False makes a missing grade propagate into the total, as plain `+` does.
students["G"] = students[["G1", "G2", "G3"]].sum(axis="columns", skipna=False)
students["good_G"] = students["G"].gt(students["G"].mean())

In [None]:
# Features: every column except the target and the grade columns it is built from
# (keeping G, G1, G2 or G3 would leak the answer into the inputs).
X = students.drop(columns=['G', 'good_G', 'G1', 'G2', 'G3'])
Y = students['good_G']

# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Class balance check: Y is a boolean column, so its mean is the fraction of
# students labelled True. It should be more or less 50-50.
Y.mean()

In [None]:
# C is the inverse regularization strength (smaller C = stronger regularization).
lr_clf = LogisticRegression(C=1.0)
# Kept as a separate statement so the notebook displays the fitted estimator.
lr_clf.fit(X=X_train, y=Y_train)

In [None]:
# Mean accuracy on the data the model was fitted on (training set).
lr_clf.score(X=X_train, y=Y_train)

In [None]:
# Mean accuracy on the held-out data (test set).
lr_clf.score(X=X_test, y=Y_test)

In [None]:
# Confusion matrix on the test set: rows are the true labels, columns the predicted ones.
confusion_matrix(y_true=Y_test, y_pred=lr_clf.predict(X_test))

In [None]:
# Confused? Here is an explanation: the same confusion matrix drawn as an
# annotated heatmap, with integer counts in the cells and labelled axes.
sns.heatmap(
    data=confusion_matrix(y_true=Y_test, y_pred=lr_clf.predict(X_test)),
    annot=True,
    fmt='d',
)
plt.xlabel("prediction")
plt.ylabel("ground_truth")

In [None]:
# Raw predictions: the predicted class label for every test sample.
lr_clf.predict(X=X_test)

In [None]:
# Predicted class probabilities for the first 10 test samples:
# one row per sample, one column per class (ordered as in lr_clf.classes_).
lr_clf.predict_proba(X=X_test)[:10]

In [None]:
# Danger - unweighted: the features are not rescaled anywhere in this notebook,
# so raw coefficient magnitudes depend on each feature's scale and are not
# directly comparable as "importance".
# Coefficients come from the fitted classifier `lr_clf`, one per feature column,
# sorted ascending. The plot kind is passed by keyword because Series.plot()
# rejects positional arguments in current pandas.
pd.Series(lr_clf.coef_[0], index=X.columns).sort_values().plot(kind='barh', figsize=(6, 8))