#Logistic Regression for Credit Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, ShuffleSplit
import sklearn as sk

In [2]:
df = pd.read_csv('data/cs-train_clean.csv')
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
X = df.drop('SeriousDlqin2yrs', axis=1)
y = df.SeriousDlqin2yrs

##Logistic Regression without cross validation

In [3]:
train, test = sk.cross_validation.train_test_split(df, train_size=0.75)
y = train['SeriousDlqin2yrs']
X = train.drop('SeriousDlqin2yrs', axis=1)
LR = LogisticRegression(penalty = 'l1', tol = 0.01)
LR.fit(X, y)
sk.metrics.roc_auc_score(LR.predict(X), y)
y_test = test['SeriousDlqin2yrs']
X_test = test.drop('SeriousDlqin2yrs', 1)
X_test = X_test.fillna(X_test.mean())
test_pred = LR.predict(X_test)
roc_auc = sk.metrics.roc_auc_score(test_pred, y_test)
roc_auc

0.73137442785609286

that is bad. But why? Let's examine the model:

In [4]:
train_pred = LR.predict(X)
sk.metrics.roc_auc_score(train_pred, y)

0.74166283771733732

ok, that is some serious __underfitting__. So let us try again with actual cross validation.

##Logistic Regression with cross validation

In [5]:
lr = LogisticRegression(penalty='l2', tol=0.01)
cv_lr = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.25, random_state=1)

In [6]:
test_scores_lr = cross_val_score(lr, X, y, cv=cv_lr)
test_scores_lr

array([ 0.93293879,  0.93268922,  0.93301009,  0.93390139,  0.933224  ,
        0.93194053,  0.93358052,  0.93372313,  0.93375878,  0.93425791])

In [9]:
from scipy.stats import sem

def mean_score(scores):
    return ('Mean score: {0:.8f} (+/-{1:.5f})').format(np.mean(scores), sem(scores))

In [10]:
print mean_score(test_scores_lr)

Mean score: 0.93330244 (+/-0.00022)
