#Logistic Regression for Credit Data

In [21]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, ShuffleSplit
import sklearn as sk

In [22]:
df = pd.read_csv('data/cs-train_clean.csv')
X = df.drop('SeriousDlqin2yrs', axis=1)
y = df.SeriousDlqin2yrs

##Logistic Regression without cross validation

In [23]:
train, test = sk.cross_validation.train_test_split(df, train_size=0.75)
y = train['SeriousDlqin2yrs']
X = train.drop('SeriousDlqin2yrs', axis=1)
LR = LogisticRegression(penalty = 'l1', tol = 0.01)
LR.fit(X, y)
sk.metrics.roc_auc_score(LR.predict(X), y)
y_test = test['SeriousDlqin2yrs']
X_test = test.drop('SeriousDlqin2yrs', 1)
X_test = X_test.fillna(X_test.mean())
test_pred = LR.predict(X_test)
roc_auc = sk.metrics.roc_auc_score(test_pred, y_test)
roc_auc

0.75345824469535805

that is bad. But why? Let's examine the model:

In [24]:
train_pred = LR.predict(X)
sk.metrics.roc_auc_score(train_pred, y)

0.70953602398470339

ok, that is some serious __underfitting__. So let us try again with actual cross validation.

##Logistic Regression with cross validation

In [39]:
lr = LogisticRegression(penalty='l2', tol=0.01)
cv_lr = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.25, random_state=1)

In [40]:
test_scores_lr = cross_val_score(lr, X, y, cv=cv_lr)
test_scores_lr

array([ 0.93265357,  0.93372313,  0.93247531,  0.93290313,  0.93397269,
        0.93251096,  0.93318835,  0.93258227,  0.93204749,  0.93133445])

In [43]:
from scipy.stats import sem

def mean_score(scores):
    return ('Mean score: {0:.3f} (+/-{1:.5f})').format(np.mean(scores), sem(scores))

In [44]:
print mean_score(test_scores_lr)

Mean score: 0.933 (+/-0.00024)
