#Creating models Based on the Credit Data

In [80]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score, ShuffleSplit
import sklearn as sk

In [81]:
df = pd.read_csv('data/cs-train_clean.csv')
X = df.drop('SeriousDlqin2yrs', axis=1)
y = df.SeriousDlqin2yrs

##Logistic Regression without cross validation

In [82]:
train, test = sk.cross_validation.train_test_split(df, train_size=0.75)
y = train['SeriousDlqin2yrs']
X = train.drop('SeriousDlqin2yrs', axis=1)
LR = LogisticRegression(penalty = 'l1', tol = 0.01)
LR.fit(X, y)
sk.metrics.roc_auc_score(LR.predict(X), y)
y_test = test['SeriousDlqin2yrs']
X_test = test.drop('SeriousDlqin2yrs', 1)
X_test = X_test.fillna(X_test.mean())
test_pred = LR.predict(X_test)
roc_auc = sk.metrics.roc_auc_score(test_pred, y_test)
roc_auc

0.75488706617159773

that is bad. But why? Let's examine the model:

In [83]:
train_pred = LR.predict(X)
sk.metrics.roc_auc_score(train_pred, y)

0.70943783687577822

ok, that is some serious __underfitting__. So let us try again with actual cross validation.

##Logistic Regression with cross validation

In [84]:
LR = LogisticRegression(penalty='l1', tol=0.01)
cv = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.25, random_state=1)

In [85]:
test_scores = cross_val_score(LR, X, y, cv=cv)
test_scores

array([ 0.93607615,  0.93547007,  0.93147706,  0.93254661,  0.93333096,
        0.93354487,  0.93308139,  0.93154836,  0.93407965,  0.93151271])

In [86]:
from scipy.stats import sem

def mean_score(scores):
    return ('Mean score: {0:.3f} (+/-{1:.3f})').format(np.mean(scores), sem(scores))

In [87]:
print mean_score(test_scores)

Mean score: 0.933 (+/-0.001)
