# Logistic Regression Walkthrough

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Column labels applied to the diabetes data when it is loaded.
col_names = [
    'pregnant',
    'glucose',
    'bp',
    'skin',
    'insulin',
    'bmi',
    'pedigree',
    'age',
    'label',
]

In [4]:
# Load the diabetes data, labelling the columns with col_names.
# NOTE(review): header=None treats the first line of the file as data. If the
# real CSV ships with its own header row this needs header=0 instead -- confirm
# once the actual file is available.
pima = pd.read_csv("diabetes.csv", header=None, names=col_names)

# Fail fast when the file is a web page rather than data: a blocked download
# can save an HTML document under the .csv name, and read_csv parses it without
# complaint, leaving the markup in the first column.
_leading_cells = pima[col_names[0]].head().astype(str).str.lstrip()
if _leading_cells.str.startswith("<").any():
    raise ValueError(
        "diabetes.csv looks like an HTML page, not the dataset; "
        "download the real CSV before running the remaining cells."
    )

In [5]:
# Display the first rows of the loaded frame as the cell output.
pima.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,<!DOCTYPE html>,,,,,,,,
1,"<html lang=""en"">",,,,,,,,
2,<head>,,,,,,,,
3,<title>Kaggle: Your Home for Data Science</title>,,,,,,,,
4,"<meta charset=""utf-8"" />",,,,,,,,


In [6]:
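# NOTE: the "diabetes.csv" loaded above is Kaggle's HTML page, not the dataset.
# Downloading the real file needs a Kaggle account, and the confirmation email
# has not arrived yet. The remaining cells are written out for reference but
# have not been run against actual data.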

In [None]:
# Separate the predictor columns (X) from the outcome column (y).
feature_cols = [
    'pregnant', 'insulin', 'bmi', 'age',
    'glucose', 'bp', 'pedigree',
]
y = pima['label']
X = pima[feature_cols]

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Instantiate the logistic regression classifier with the library's default settings.
logreg = LogisticRegression()

In [None]:
# Fit the classifier on the training portion of the split.
logreg.fit(X_train,y_train)

In [None]:
# Predict a class label for every row of the held-out test features.
y_pred = logreg.predict(X_test)

In [None]:
# Tabulate actual versus predicted classes for the test set, then echo the
# matrix as the cell output.
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
# Draw the confusion matrix as an annotated heatmap.
class_names = [0, 1]
fig, ax = plt.subplots()

# Put the class names on the frame itself: seaborn derives its tick labels from
# the frame's index/columns and resets any ticks placed on the axes beforehand,
# so configuring ticks before the heatmap call has no effect.
cm_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)
sns.heatmap(cm_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)

# Predicted-label caption goes above the plot, with the title lifted clear of it.
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

# Fit the layout last so it accounts for the title and axis labels.
fig.tight_layout()

In [None]:
# Report accuracy, precision and recall of the test-set predictions.
for caption, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(caption, scorer(y_test, y_pred))

In [None]:
# Plot the ROC curve using column 1 of predict_proba (the second class's
# probability) as the score, and show the area under the curve in the legend.
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

roc_label = "data 1, auc=" + str(auc)
plt.plot(fpr, tpr, label=roc_label)
plt.legend(loc="lower right")  # same position as numeric location code 4
plt.show()