**Run the following two cells before you begin.**

In [1]:
%autosave 100

Autosaving every 100 seconds


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc,roc_auc_score
from sklearn.linear_model import LogisticRegression

______________________________________________________________________
**First, import your data set and define the sigmoid function.**
<details>
    <summary>Hint:</summary>
    The definition of the sigmoid is $f(x) = \frac{1}{1 + e^{-X}}$.
</details>

In [3]:
# Import the data set
credit=pd.read_csv('cleaned_data.csv',index_col=[0])
df=credit.copy()

In [4]:
# Define the sigmoid function
def sigmoid(a):
    x1=a[0]
    x2=a[1]
    b=a[2]
    z=(8.27451187e-11*x1)+(-6.80876727e-06*x2)+(-6.57647457e-11*b) # f(x)=wx+b w and b are intersepts and cofficent 

    sig= 1/(1+np.exp(z)) #sig =1/(1+e**(f(x)))
    return sig
#sigmoid is define as the racipocal the function of x is reciprocal of 1 plus exponent to the power minus x

**Now, create a train/test split (80/20) with `PAY_1` and `LIMIT_BAL` as features and `default payment next month` as values. Use a random state of 24.**

In [5]:
X=df[['PAY_1','LIMIT_BAL']]
y=df['default payment next month']
# Create a train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=24)

______________________________________________________________________
**Next, import LogisticRegression, with the default options, but set the solver to `'liblinear'`.**

In [6]:
lr=LogisticRegression(solver='liblinear')

______________________________________________________________________
**Now, train on the training data and obtain predicted classes, as well as class probabilities, using the testing data.**

In [7]:
# Fit the logistic regression model on training data
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
# Make predictions using `.predict()`
classes=lr.predict(X_test)

In [9]:
# Find class probabilities using `.predict_proba()`
probab=lr.predict_proba(X_test)[:,1]

In [10]:
probab

array([0.25173076, 0.415703  , 0.20395547, ..., 0.415703  , 0.17278502,
       0.33606565])

______________________________________________________________________
**Then, pull out the coefficients and intercept from the trained model and manually calculate predicted probabilities. You'll need to add a column of 1s to your features, to multiply by the intercept.**

In [11]:
# Add column of 1s to features
X_test['ones']=1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
X_test

Unnamed: 0_level_0,PAY_1,LIMIT_BAL,ones
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4cf593fd-c797,2,160000,1
0c82be6d-1795,1,50000,1
e72f6986-2ede,-1,200000,1
2d7e0251-c293,3,200000,1
1626f524-1b1d,1,50000,1
...,...,...,...
b091ba31-bc55,0,140000,1
539c887b-a82e,-1,50000,1
947debaf-64c7,-1,50000,1
92603ac0-9572,1,230000,1


In [13]:
# Get coefficients and intercepts from trained model
print('coefficient',lr.coef_)

coefficient [[ 8.27451187e-11 -6.80876727e-06]]


In [14]:
# Manually calculate predicted probabilities
print('intercept',lr.intercept_)

intercept [-6.57647457e-11]


In [15]:
X_test.head(1)

Unnamed: 0_level_0,PAY_1,LIMIT_BAL,ones
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4cf593fd-c797,2,160000,1


In [16]:
# Manually calculate predicted probabilities
manual_probs=1-X_test[['PAY_1','LIMIT_BAL','ones']].apply(sigmoid,axis=1) #using the def function created at top

In [17]:
manual_probs

ID
4cf593fd-c797    0.251731
0c82be6d-1795    0.415703
e72f6986-2ede    0.203955
2d7e0251-c293    0.203955
1626f524-1b1d    0.415703
                   ...   
b091ba31-bc55    0.278236
539c887b-a82e    0.415703
947debaf-64c7    0.415703
92603ac0-9572    0.172785
adbe0177-9471    0.336066
Length: 5333, dtype: float64

______________________________________________________________________
**Next, using a threshold of `0.5`, manually calculate predicted classes. Compare this to the class predictions output by scikit-learn.**

In [18]:
# Manually calculate predicted classes
manual_classes=[]
for i in manual_probs:
    if i >= 0.5:
        manual_classes.append(1)
    elif i<0.5:
        manual_classes.append(0)

In [19]:
manual_probs

ID
4cf593fd-c797    0.251731
0c82be6d-1795    0.415703
e72f6986-2ede    0.203955
2d7e0251-c293    0.203955
1626f524-1b1d    0.415703
                   ...   
b091ba31-bc55    0.278236
539c887b-a82e    0.415703
947debaf-64c7    0.415703
92603ac0-9572    0.172785
adbe0177-9471    0.336066
Length: 5333, dtype: float64

In [20]:
# Compare to scikit-learn's predicted classes
np.unique(classes==manual_classes,return_counts=True)

(array([ True]), array([5333], dtype=int64))

______________________________________________________________________
**Finally, calculate ROC AUC using both scikit-learn's predicted probabilities, and your manually predicted probabilities, and compare.**

In [22]:
# Use scikit-learn's predicted probabilities to calculate ROC AUC
roc_auc_score(y_test,probab)

0.627207450280691

In [24]:
# Use manually calculated predicted probabilities to calculate ROC AUC
roc_auc_score(y_test,manual_probs)

0.627207450280691