# Logistic Regression for Credit Data

In [187]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cross_validation import cross_val_score, ShuffleSplit
import sklearn as sk
import pyroc

In [188]:
from scipy.stats import sem

def mean_score(scores):
    """Summarise a collection of scores as a one-line string.

    The string has the form ``'Mean score: <mean> (+/-<sem>)'``, where the
    mean (``np.mean``) is printed with eight decimals and the standard error
    of the mean (``scipy.stats.sem``) with five.
    """
    average = np.mean(scores)
    std_error = sem(scores)
    return 'Mean score: {0:.8f} (+/-{1:.5f})'.format(average, std_error)

## Cleaned Data

In [189]:
def test_log_reg(df):
    """Fit a default LogisticRegression on a random split of ``df`` and print its scores.

    ``df`` must contain the target column ``'SeriousDlqin2yrs'``; every other
    column is used as a feature. 75% of the rows are used for fitting
    (``train_size=0.75``) and the rest for evaluation. The split is not
    seeded, so the printed numbers vary between calls.

    Prints the ROC AUC (computed from the second ``predict_proba`` column) and
    the accuracy on the held-out rows. Returns nothing.
    """
    target = 'SeriousDlqin2yrs'
    fit_rows, eval_rows = sk.cross_validation.train_test_split(df, train_size=0.75)

    model = LogisticRegression()
    model.fit(fit_rows.drop(target, axis=1), fit_rows[target])

    eval_labels = eval_rows[target]
    eval_features = eval_rows.drop(target, axis=1)
    second_class_proba = model.predict_proba(eval_features)[:, 1]

    # Single parenthesised argument: prints the same text under Python 2 and 3.
    print('roc auc score: ' + str(sk.metrics.roc_auc_score(eval_labels, second_class_proba)))
    print('Accuracy: ' + str(model.score(eval_features, eval_labels)))

In [190]:
def test_log_reg_cv(df):
    """Fit a LogisticRegressionCV on a random split of ``df`` and print its scores.

    ``df`` must contain the target column ``'SeriousDlqin2yrs'``; every other
    column is used as a feature. 75% of the rows are used for fitting
    (``train_size=0.75``, unseeded) and the rest for evaluation. Inside the
    fitting rows, ``LogisticRegressionCV`` is given a ``ShuffleSplit`` of 10
    iterations with a 25% validation share and ``random_state=1``.

    Prints the ROC AUC (computed from the second ``predict_proba`` column) and
    the accuracy on the held-out rows. Returns nothing.
    """
    train, test = sk.cross_validation.train_test_split(df, train_size=0.75)
    y = train['SeriousDlqin2yrs']
    X = train.drop('SeriousDlqin2yrs', axis=1)
    y_test = test['SeriousDlqin2yrs']
    X_test = test.drop('SeriousDlqin2yrs', axis=1)

    cv_lr = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.25, random_state=1)
    lr_cv = LogisticRegressionCV(cv=cv_lr)
    lr_cv.fit(X, y)

    # Score the model fitted in this function. The previous code called
    # predict_proba on `lr`, a name this function never defines, so it either
    # raised NameError or silently used an unrelated global model.
    probas = lr_cv.predict_proba(X_test)

    # Single parenthesised argument: prints the same text under Python 2 and 3.
    print('roc auc score: ' + str(sk.metrics.roc_auc_score(y_test, probas.T[1])))
    print('Accuracy: ' + str(lr_cv.score(X_test, y_test)))

In [None]:
def test_both(df):
    """Run test_log_reg and then test_log_reg_cv on ``df``, printing a label before each."""
    labelled_runs = (
        ('Without CV:', test_log_reg),
        ('\nWith CV:', test_log_reg_cv),
    )
    for banner, evaluate in labelled_runs:
        print(banner)
        evaluate(df)

### Logistic Regression without cross validation

In [191]:
# Load the cleaned training set and drop the 'Unnamed: 0' column
# (presumably a saved row index -- verify against the file that wrote the CSV).
df = pd.read_csv('data/cs-train_clean.csv').drop(['Unnamed: 0'], axis=1)
# Plain LogisticRegression on a random 75/25 split of the cleaned data.
test_log_reg(df)

roc auc score: 0.695075499257
Accuracy: 0.933393941014


### Logistic Regression with cross validation

In [192]:
# Score the frame currently bound to `df` with the LogisticRegressionCV variant.
test_log_reg_cv(df)

roc auc score: 0.690077439267
Accuracy: 0.93195005214


## Cropped Data

In [195]:
# Load the cropped training set and drop the 'Unnamed: 0' column
# (presumably a saved row index -- verify against the file that wrote the CSV).
df_cropped = pd.read_csv('data/cs-train_cropped.csv')
df_cropped = df_cropped.drop(['Unnamed: 0'], axis=1)
# test_both prints 'Without CV:' / '\nWith CV:' and runs test_log_reg and
# test_log_reg_cv in that order.
test_both(df_cropped)

Without CV:
roc auc score: 0.685463912666
Accuracy: 0.931156046295

With CV:
roc auc score: 0.680781463925
Accuracy: 0.930357855527


## Oversampled Data

In [155]:
# Load the oversampled training set and drop the 'Unnamed: 0' column
# (presumably a saved row index -- verify against the file that wrote the CSV).
df_oversampled = pd.read_csv('data/cs-train_oversampled.csv').drop(['Unnamed: 0'], axis=1)
# `test_lr` is not defined anywhere in this notebook and raised NameError.
# test_both reports both the plain and the CV variant, as the cropped section does.
test_both(df_oversampled)

In [149]:
# Split the oversampled frame. The previous code split `df`, which in notebook
# order is the cleaned data set, so this section did not evaluate the data it
# is titled for. The split is unseeded, so scores vary between runs.
train, test = sk.cross_validation.train_test_split(df_oversampled, train_size=0.75)
y = train['SeriousDlqin2yrs']
X = train.drop('SeriousDlqin2yrs', axis=1)

# L1-penalised logistic regression with a loosened stopping tolerance.
LR = LogisticRegression(penalty='l1', tol=0.01)
LR.fit(X, y)

y_test = test['SeriousDlqin2yrs']
X_test = test.drop('SeriousDlqin2yrs', axis=1)
# Replace any missing held-out values with that column's held-out mean.
# The training features are used as-is.
X_test = X_test.fillna(X_test.mean())

probas = LR.predict_proba(X_test)
# Single parenthesised argument: prints the same text under Python 2 and 3.
print('roc auc score: ' + str(sk.metrics.roc_auc_score(y_test, probas.T[1])))
print('Accuracy: ' + str(LR.score(X_test, y_test)))

roc auc score: 0.800933318379
Accuracy: 0.733509856907


In [150]:
# Ten seeded shuffle splits over the rows of X, each holding out 25%.
cv_lr = ShuffleSplit(
    X.shape[0],
    n_iter=10,
    test_size=0.25,
    random_state=1,
)
# Unfitted L1-penalised model to be scored on those splits.
lr = LogisticRegression(penalty='l1', tol=0.01)

In [151]:
# Score `lr` on every split in `cv_lr` (one value per split, using the
# estimator's default score).
test_scores_lr = cross_val_score(lr, X, y, cv=cv_lr)
# Bare expression so the notebook displays the score array.
test_scores_lr

array([ 0.72972385,  0.7312231 ,  0.72699134,  0.73144073,  0.72355758,
        0.72786188,  0.7296513 ,  0.7275717 ,  0.73315761,  0.73269817])

In [152]:
# Mean and standard error of the per-split scores.
summary = mean_score(test_scores_lr)
print(summary)

Mean score: 0.72938773 (+/-0.00093)


In [153]:
# LogisticRegressionCV fitted on the current X / y, using the shuffle splits in cv_lr.
lr_cv = LogisticRegressionCV(tol=0.1, cv=cv_lr)
lr_cv.fit(X, y)
# Evaluate on the held-out X_test / y_test; the second predict_proba column feeds the ROC AUC.
print('roc auc score: ' + str(sk.metrics.roc_auc_score(y_test, lr_cv.predict_proba(X_test)[:, 1])))
print('Accuracy: ' + str(lr_cv.score(X_test, y_test)))

roc auc score: 0.799122237028
Accuracy: 0.726346143383


## Undersampled Data

In [163]:
# Rebind `df` to the undersampled training set, dropping the 'Unnamed: 0' column
# (presumably a saved row index -- verify against the file that wrote the CSV).
df = pd.read_csv('data/cs-train_undersampled.csv').drop(['Unnamed: 0'], axis=1)
# Target and features for the full undersampled frame.
y = df['SeriousDlqin2yrs']
X = df.drop('SeriousDlqin2yrs', axis=1)

In [164]:
# Random 75/25 split of the frame currently bound to `df` (unseeded, so scores vary between runs).
train, test = sk.cross_validation.train_test_split(df, train_size=0.75)

# Training target / features.
X = train.drop('SeriousDlqin2yrs', axis=1)
y = train['SeriousDlqin2yrs']

# Held-out target / features; any missing held-out values are replaced with
# that column's held-out mean. The training features are used as-is.
y_test = test['SeriousDlqin2yrs']
X_test = test.drop('SeriousDlqin2yrs', axis=1)
X_test = X_test.fillna(X_test.mean())

# L1-penalised logistic regression with a loosened stopping tolerance.
LR = LogisticRegression(penalty='l1', tol=0.01)
LR.fit(X, y)

probas = LR.predict_proba(X_test)
print('roc auc score: ' + str(sk.metrics.roc_auc_score(y_test, probas[:, 1])))
print('Accuracy: ' + str(LR.score(X_test, y_test)))

roc auc score: 0.801649496626
Accuracy: 0.737497008854


In [165]:
# Ten seeded shuffle splits over the rows of X, each holding out 25%.
cv_lr = ShuffleSplit(
    X.shape[0],
    n_iter=10,
    test_size=0.25,
    random_state=1,
)
# Unfitted L1-penalised model to be scored on those splits.
lr = LogisticRegression(penalty='l1', tol=0.01)

In [166]:
# Score `lr` on every split in `cv_lr` (one value per split, using the
# estimator's default score).
test_scores_lr = cross_val_score(lr, X, y, cv=cv_lr)
# Bare expression so the notebook displays the score array.
test_scores_lr

array([ 0.74218251,  0.73005743,  0.73388641,  0.71952776,  0.72590938,
        0.74313976,  0.73133376,  0.73771538,  0.7373963 ,  0.72080408])

In [167]:
# Mean and standard error of the per-split scores.
summary = mean_score(test_scores_lr)
print(summary)

Mean score: 0.73219528 (+/-0.00261)


In [168]:
# LogisticRegressionCV fitted on the current X / y, using the shuffle splits in cv_lr.
lr_cv = LogisticRegressionCV(tol=0.1, cv=cv_lr)
lr_cv.fit(X, y)
# Evaluate on the held-out X_test / y_test; the second predict_proba column feeds the ROC AUC.
print('roc auc score: ' + str(sk.metrics.roc_auc_score(y_test, lr_cv.predict_proba(X_test)[:, 1])))
print('Accuracy: ' + str(lr_cv.score(X_test, y_test)))

roc auc score: 0.800377604996
Accuracy: 0.736300550371
