#Logistic Regression for Credit Data

In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cross_validation import cross_val_score, ShuffleSplit
import sklearn as sk

In [2]:
df = pd.read_csv('data/cs-train_clean.csv')
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
X = df.drop('SeriousDlqin2yrs', axis=1)
y = df.SeriousDlqin2yrs

##Logistic Regression without cross validation

In [3]:
train, test = sk.cross_validation.train_test_split(df, train_size=0.75)
y = train['SeriousDlqin2yrs']
X = train.drop('SeriousDlqin2yrs', axis=1)
LR = LogisticRegression(penalty = 'l1', tol = 0.01)
LR.fit(X, y)
sk.metrics.roc_auc_score(LR.predict(X), y)
y_test = test['SeriousDlqin2yrs']
X_test = test.drop('SeriousDlqin2yrs', 1)
X_test = X_test.fillna(X_test.mean())
test_pred = LR.predict(X_test)
roc_auc = sk.metrics.roc_auc_score(test_pred, y_test)
roc_auc

0.70000357088656184

that is bad. But why? Let's examine the model:

In [4]:
train_pred = LR.predict(X)
sk.metrics.roc_auc_score(train_pred, y)

0.7518637565715397

ok, that is some serious __underfitting__. So let us try again with actual cross validation.

##Logistic Regression with cross validation

In [16]:
lr = LogisticRegression(penalty='l2', tol=0.01)
cv_lr = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.25, random_state=1)

In [17]:
test_scores_lr = cross_val_score(lr, X, y, cv=cv_lr)
test_scores_lr

array([ 0.93554137,  0.93511355,  0.93265357,  0.93525616,  0.93325965,
        0.93297444,  0.93251096,  0.93211879,  0.93358052,  0.93586224])

In [18]:
from scipy.stats import sem

def mean_score(scores):
    return ('Mean score: {0:.8f} (+/-{1:.5f})').format(np.mean(scores), sem(scores))

In [19]:
print mean_score(test_scores_lr)

Mean score: 0.93388713 (+/-0.00045)


In [20]:
lr_cv = LogisticRegressionCV( tol=0.01, cv=cv_lr)

In [21]:
lr_cv.fit(X,y)

LogisticRegressionCV(Cs=10, class_weight=None,
           cv=ShuffleSplit(112196, n_iter=10, test_size=0.25, random_state=1),
           dual=False, fit_intercept=True, intercept_scaling=1.0,
           max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2',
           refit=True, scoring=None, solver='lbfgs', tol=0.01, verbose=0)

In [22]:
lr_cv.score(X_test, y_test)

0.93259178052889113