# Logistic Regression for Credit Data

In [35]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.preprocessing import StandardScaler
import sklearn as sk
import pyroc

In [50]:
from scipy.stats import sem

def mean_score(text, scores):
    """Return ``text`` followed by the mean and standard error of ``scores``.

    Args:
        text: Label placed in front of the numbers; converted with ``str``.
        scores: Sequence of numeric scores.

    Returns:
        A string of the form ``'<text> <mean> (+/-<sem>)'`` with the mean
        printed to 8 decimal places and the standard error (``scipy.stats.sem``)
        to 5 decimal places.
    """
    # Format only the numeric template: the label is appended afterwards so
    # that braces inside ``text`` are never interpreted as format fields.
    summary = ' {0:.8f} (+/-{1:.5f})'.format(np.mean(scores), sem(scores))
    return str(text) + summary

## Evaluation Helpers

In [59]:
def test_log_reg(df, n_iter):
    """Evaluate a default ``LogisticRegression`` over repeated random splits.

    For each of ``n_iter`` rounds the frame is split 75% / 25% into train and
    test parts, a fresh model is fitted on the train part, and the ROC AUC and
    accuracy on the test part are recorded. The mean and standard error of
    both metrics are printed; nothing is returned.

    Args:
        df: DataFrame containing the feature columns plus the
            ``'SeriousDlqin2yrs'`` target column.
        n_iter: Number of random train/test splits to evaluate.
    """
    target = 'SeriousDlqin2yrs'
    auc_scores = []
    accs = []
    for _ in range(n_iter):
        train, test = sk.cross_validation.train_test_split(df, train_size=0.75)
        # Always pass the axis by keyword: newer pandas releases reject a
        # positional axis argument to ``drop``.
        X, y = train.drop(target, axis=1), train[target]
        X_test, y_test = test.drop(target, axis=1), test[target]

        lr = LogisticRegression()
        lr.fit(X, y)

        # Second column of predict_proba is used as the score for the AUC
        # (column order follows ``lr.classes_``).
        probas = lr.predict_proba(X_test)
        auc_scores.append(sk.metrics.roc_auc_score(y_test, probas[:, 1]))
        accs.append(lr.score(X_test, y_test))
    print(mean_score('roc auc score: ', auc_scores))
    print(mean_score('Accuracy: ', accs))

In [60]:
def test_log_reg_cv(df, n_iter):
    """Evaluate ``LogisticRegressionCV`` over repeated random splits.

    For each of ``n_iter`` rounds the frame is split 75% / 25% into train and
    test parts. A ``LogisticRegressionCV`` model is fitted on the train part,
    using a 10-iteration ``ShuffleSplit`` (25% held out, ``random_state=1``)
    as its internal cross-validation scheme, and the ROC AUC and accuracy on
    the test part are recorded. The mean and standard error of both metrics
    are printed; nothing is returned.

    Args:
        df: DataFrame containing the feature columns plus the
            ``'SeriousDlqin2yrs'`` target column.
        n_iter: Number of random train/test splits to evaluate.
    """
    target = 'SeriousDlqin2yrs'
    auc_scores = []
    accs = []
    for _ in range(n_iter):
        train, test = sk.cross_validation.train_test_split(df, train_size=0.75)
        # Always pass the axis by keyword: newer pandas releases reject a
        # positional axis argument to ``drop``.
        X, y = train.drop(target, axis=1), train[target]
        X_test, y_test = test.drop(target, axis=1), test[target]

        cv_lr = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.25, random_state=1)
        lr_cv = LogisticRegressionCV(cv=cv_lr)
        lr_cv.fit(X, y)

        # Predict once and reuse the result; the second column is used as the
        # score for the AUC (column order follows ``lr_cv.classes_``).
        probas = lr_cv.predict_proba(X_test)
        auc_scores.append(sk.metrics.roc_auc_score(y_test, probas[:, 1]))
        accs.append(lr_cv.score(X_test, y_test))
    print(mean_score('roc auc score: ', auc_scores))
    print(mean_score('Accuracy: ', accs))

In [61]:
def test_both(df, n_iter):
    """Run both logistic-regression evaluations on ``df`` and print results.

    Prints a ``Without CV:`` section produced by ``test_log_reg`` followed by
    a ``With CV:`` section produced by ``test_log_reg_cv``.

    Args:
        df: DataFrame passed unchanged to both evaluation functions.
        n_iter: Number of random train/test splits each evaluation performs.
    """
    # Parenthesised single-argument print matches the sibling functions and
    # emits the same text under both the print statement and print function.
    print('Without CV:')
    test_log_reg(df, n_iter)
    print('\nWith CV:')
    test_log_reg_cv(df, n_iter)

##Cleaned Data

In [62]:
# Load the cleaned training set, discard the 'Unnamed: 0' column
# (presumably the row index saved with the CSV -- confirm), and evaluate
# both models over 10 random splits.
df_clean = pd.read_csv('data/cs-train_clean.csv')
df_clean = df_clean.drop(['Unnamed: 0'], axis=1)
test_both(df_clean, 10)

Without CV:
roc auc score:  0.69582843 (+/-0.00290)
Accuracy:  0.93345009 (+/-0.00045)

With CV:
roc auc score:  0.69631654 (+/-0.00178)
Accuracy:  0.93310249 (+/-0.00036)


##Cropped Data

In [64]:
# Load the cropped training set, discard the 'Unnamed: 0' column
# (presumably the row index saved with the CSV -- confirm), and evaluate
# both models over 10 random splits.
df_cropped = pd.read_csv('data/cs-train_cropped.csv')
df_cropped = df_cropped.drop(['Unnamed: 0'], axis=1)
test_both(df_cropped, 10)

Without CV:
roc auc score:  0.68944698 (+/-0.00207)
Accuracy:  0.93118158 (+/-0.00032)

With CV:
roc auc score:  0.68822658 (+/-0.00157)
Accuracy:  0.93024700 (+/-0.00039)


##Oversampled Data

In [65]:
# Load the oversampled training set, discard the 'Unnamed: 0' column
# (presumably the row index saved with the CSV -- confirm), and evaluate
# both models over 10 random splits.
df_oversampled = pd.read_csv('data/cs-train_oversampled.csv')
df_oversampled = df_oversampled.drop(['Unnamed: 0'], axis=1)
test_both(df_oversampled, 10)

Without CV:
roc auc score:  0.78795195 (+/-0.00388)
Accuracy:  0.70956608 (+/-0.00710)

With CV:
roc auc score:  0.80381864 (+/-0.00053)
Accuracy:  0.72424625 (+/-0.00071)


##Undersampled Data

In [66]:
# Load the undersampled training set, discard the 'Unnamed: 0' column
# (presumably the row index saved with the CSV -- confirm), and evaluate
# both models over 10 random splits.
df_undersampled = pd.read_csv('data/cs-train_undersampled.csv')
df_undersampled = df_undersampled.drop(['Unnamed: 0'], axis=1)
test_both(df_undersampled, 10)

Without CV:
roc auc score:  0.76162387 (+/-0.00377)
Accuracy:  0.68908690 (+/-0.00364)

With CV:
roc auc score:  0.79303525 (+/-0.00286)
Accuracy:  0.71425711 (+/-0.00286)
