**Run the following two cells before you begin.**

In [None]:
%autosave 10

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_curve, roc_auc_score

______________________________________________________________________
**First, import your data set and define the sigmoid function.**
<details>
    <summary>Hint:</summary>
    The definition of the sigmoid is $f(x) = \frac{1}{1 + e^{-X}}$.
</details>

In [3]:
# Import the data set
data = pd.read_csv("cleaned_data.csv")

In [4]:
data.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,EDUCATION_CAT,graduate school,high school,others,university
0,798fc410-45c1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,1,university,0,0,0,1
1,8a8c8f3b-8eb4,120000,2,2,2,26,-1,2,0,0,...,1000,1000,0,2000,1,university,0,0,0,1
2,85698822-43f5,90000,2,2,2,34,0,0,0,0,...,1000,1000,1000,5000,0,university,0,0,0,1
3,0737c11b-be42,50000,2,2,1,37,0,0,0,0,...,1200,1100,1069,1000,0,university,0,0,0,1
4,3b7f77cc-dbc0,50000,1,2,1,57,-1,0,-1,0,...,10000,9000,689,679,0,university,0,0,0,1


In [5]:
# Define the sigmoid function
def sigmoid(X):
    Y = 1/ (1+ np.exp(-X))
    return X

**Now, create a train/test split (80/20) with `PAY_1` and `LIMIT_BAL` as features and `default payment next month` as values. Use a random state of 24.**

In [6]:
# Create a train/test split
X = data[["PAY_1","LIMIT_BAL"]].values
y = data["default payment next month"].values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=24)

In [8]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(21331, 2)
(5333, 2)
(21331,)
(5333,)


______________________________________________________________________
**Next, import LogisticRegression, with the default options, but set the solver to `'liblinear'`.**

In [9]:
regressor = LogisticRegression(solver='liblinear')

______________________________________________________________________
**Now, train on the training data and obtain predicted classes, as well as class probabilities, using the testing data.**

In [10]:
# Fit the logistic regression model on training data
regressor.fit(X_train,y_train)

LogisticRegression(solver='liblinear')

In [11]:
# Make predictions using `.predict()`
y_pred = regressor.predict(X_test)
y_pred[:5]

array([0, 0, 0, 0, 0])

In [12]:
# Find class probabilities using `.predict_proba()`
y_pred_prob = regressor.predict_proba(X_test)
y_pred_prob

array([[0.74826924, 0.25173076],
       [0.584297  , 0.415703  ],
       [0.79604453, 0.20395547],
       ...,
       [0.584297  , 0.415703  ],
       [0.82721498, 0.17278502],
       [0.66393435, 0.33606565]])

______________________________________________________________________
**Then, pull out the coefficients and intercept from the trained model and manually calculate predicted probabilities. You'll need to add a column of 1s to your features, to multiply by the intercept.**

In [13]:
# Get coefficients and intercepts from trained model
coef = regressor.coef_
intercept = regressor.intercept_

In [14]:
# Manually calculate predicted probabilities
y_prob_manual = sigmoid((coef* X_test) + intercept)
y_prob_manual

array([[ 9.97254917e-11, -1.08940276e+00],
       [ 1.69803730e-11, -3.40438363e-01],
       [-1.48509864e-10, -1.36175345e+00],
       ...,
       [-1.48509864e-10, -3.40438363e-01],
       [ 1.69803730e-11, -1.56601647e+00],
       [ 9.97254917e-11, -6.80876727e-01]])

______________________________________________________________________
**Next, using a threshold of `0.5`, manually calculate predicted classes. Compare this to the class predictions output by scikit-learn.**

In [15]:
# Manually calculate predicted classes
threshold = 0.5
y_pred_manual = []
 
for i in y_prob_manual[:, 1]:
    if i >= threshold:
        y_pred_manual.append(1)
    else:
        y_pred_manual.append(0)
        
y_pred_manual = np.asarray(y_pred_manual)

y_pred_manual

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
# Compare to scikit-learn's predicted classes
positive = y_pred==1
negative = y_pred==0
pos_manual = y_pred_manual==1
neg_manual = y_pred_manual==0
print("Positives:",sum(positive)," Negatives:",sum(negative)," are predicted classes through scikit-learn.")
print("Positives:",sum(pos_manual), " Negatives:",sum(neg_manual)," are predicted classes manually.")

Positives: 0  Negatives: 5333  are predicted classes through scikit-learn.
Positives: 0  Negatives: 5333  are predicted classes manually.


______________________________________________________________________
**Finally, calculate ROC AUC using both scikit-learn's predicted probabilities, and your manually predicted probabilities, and compare.**

In [22]:
# Use scikit-learn's predicted probabilities to calculate ROC AUC
from sklearn.metrics import roc_auc_score
print("ROC AUC Score for sklearn's predicted probabilities:", roc_auc_score(y_test, y_pred_prob[:,1]))

ROC AUC Score for sklearn's predicted probabilities: 0.627207450280691


In [23]:
# Use manually calculated predicted probabilities to calculate ROC AUC
print("ROC AUC Score for manually predicted probabilities:", roc_auc_score(y_test, y_prob_manual[:,1]))

ROC AUC Score for manually predicted probabilities: 0.6201990844642832


Author: Purvit Vashishtha