#Logistic Regression for Credit Data

In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.preprocessing import StandardScaler
import sklearn as sk
import pyroc

In [11]:
from scipy.stats import sem

def mean_score(text, scores):
    return (str(text) + ' {0:.8f} (+/-{1:.5f})').format(np.mean(scores), sem(scores))

##Cleaned Data

In [12]:
def test_log_reg(df, n_iter):
    auc_scores = []
    accs = []
    for i in range(0, n_iter):
        lr = LogisticRegression()
        train, test = sk.cross_validation.train_test_split(df, train_size=0.75, random_state=1)
        y = train['SeriousDlqin2yrs']
        X = train.drop('SeriousDlqin2yrs', axis=1)
        y_test = test['SeriousDlqin2yrs']
        X_test = test.drop('SeriousDlqin2yrs', 1)
        lr.fit(X, y)
        probas = lr.predict_proba(X_test)
        auc_scores.append(sk.metrics.roc_auc_score(y_test, probas.T[1]))
        accs.append(lr.score(X_test, y_test))
    print(mean_score('roc auc score: ', auc_scores))
    print(mean_score('Accuracy: ', accs))

In [13]:
def test_log_reg_cv(df, n_iter):
    auc_scores = []
    accs = []
    for i in range(0, n_iter):
        train, test = sk.cross_validation.train_test_split(df, train_size=0.75, random_state=1)
        y = train['SeriousDlqin2yrs']
        X = train.drop('SeriousDlqin2yrs', axis=1)
        y_test = test['SeriousDlqin2yrs']
        X_test = test.drop('SeriousDlqin2yrs', 1)
        cv_lr = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.25, random_state=1)
        lr_cv = LogisticRegressionCV(cv=cv_lr)
        lr_cv.fit(X,y)
        probas = lr_cv.predict_proba(X_test)
        auc_scores.append(sk.metrics.roc_auc_score(y_test, lr_cv.predict_proba(X_test).T[1]))
        accs.append(lr_cv.score(X_test, y_test))
    print(mean_score('roc auc score: ', auc_scores))
    print(mean_score('Accuracy: ', accs))

In [14]:
def test_both(df, n_iter):
    print 'Without CV:'
    test_log_reg(df, n_iter)
    print '\nWith CV:'
    test_log_reg_cv(df, n_iter)

##Cleaned Data

In [15]:
df_clean = pd.read_csv('data/cs-train_clean.csv').drop(['Unnamed: 0'], axis=1)
test_both(df_clean, 10)

Without CV:
roc auc score:  0.68780475 (+/-0.00005)
Accuracy:  0.93297949 (+/-0.00002)

With CV:
roc auc score:  0.68679767 (+/-0.00008)
Accuracy:  0.93275221 (+/-0.00000)


##Cropped Data

In [16]:
df_cropped = pd.read_csv('data/cs-train_cropped.csv').drop(['Unnamed: 0'], axis=1)
test_both(df_cropped, 10)

Without CV:
roc auc score:  0.67537069 (+/-0.00168)
Accuracy:  0.93153538 (+/-0.00002)

With CV:
roc auc score:  0.67886055 (+/-0.00006)
Accuracy:  0.93156876 (+/-0.00002)


##Oversampled Data

In [17]:
df_oversampled = pd.read_csv('data/cs-train_oversampled.csv').drop(['Unnamed: 0'], axis=1)
test_both(df_oversampled, 10)

Without CV:
roc auc score:  0.78649721 (+/-0.00069)
Accuracy:  0.71259445 (+/-0.00115)

With CV:
roc auc score:  0.79916801 (+/-0.00012)
Accuracy:  0.72265932 (+/-0.00001)


##Undersampled Data

In [18]:
df_undersampled = pd.read_csv('data/cs-train_undersampled.csv').drop(['Unnamed: 0'], axis=1)
test_both(df_undersampled, 10)

Without CV:
roc auc score:  0.74508098 (+/-0.00264)
Accuracy:  0.67565078 (+/-0.00225)

With CV:
roc auc score:  0.75795545 (+/-0.00199)
Accuracy:  0.68924710 (+/-0.00074)
