# Logistic Regression for Credit Data

In [18]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.preprocessing import StandardScaler
import sklearn as sk
import pyroc

In [19]:
from scipy.stats import sem

def mean_score(scores):
    """Return a one-line summary of *scores*: the mean and its standard error.

    The mean is formatted with 8 decimals and the standard error of the
    mean (``scipy.stats.sem``) with 5 decimals.
    """
    average = np.mean(scores)
    std_err = sem(scores)
    return 'Mean score: {0:.8f} (+/-{1:.5f})'.format(average, std_err)

## Evaluation Helpers

In [20]:
def test_log_reg(df):
    """Fit a default LogisticRegression on a 75/25 split of *df* and print metrics.

    *df* must contain the target column ``'SeriousDlqin2yrs'``; every other
    column is used as a feature. The split is random (no seed is set), so
    the printed numbers vary between runs.

    Prints the ROC AUC (computed from column 1 of the predicted
    probabilities) and the accuracy on the held-out 25%. Returns None.
    """
    target = 'SeriousDlqin2yrs'
    train, test = sk.cross_validation.train_test_split(df, train_size=0.75)

    y = train[target]
    X = train.drop(target, axis=1)
    y_test = test[target]
    X_test = test.drop(target, axis=1)

    # Learn the scaling statistics from the training split only and apply
    # the *same* transform to the test split. Re-fitting the scaler on the
    # test data would scale it with different mean/variance than the model
    # was trained on and leak test-set statistics into the evaluation.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    X_test = scaler.transform(X_test)

    lr = LogisticRegression()
    lr.fit(X, y)

    probas = lr.predict_proba(X_test)
    # Single-argument print(...) behaves the same as the print statement.
    print('roc auc score: ' + str(sk.metrics.roc_auc_score(y_test, probas.T[1])))
    print('Accuracy: ' + str(lr.score(X_test, y_test)))

In [21]:
def test_log_reg_cv(df):
    """Fit a LogisticRegressionCV on a 75/25 split of *df* and print metrics.

    *df* must contain the target column ``'SeriousDlqin2yrs'``; every other
    column is used as a feature. The outer train/test split is random (no
    seed is set); the inner cross-validation uses a ShuffleSplit with 10
    iterations, 25% validation size and ``random_state=1``.

    Prints the ROC AUC (computed from column 1 of the predicted
    probabilities) and the accuracy on the held-out 25%. Returns None.
    """
    target = 'SeriousDlqin2yrs'
    train, test = sk.cross_validation.train_test_split(df, train_size=0.75)

    y = train[target]
    X = train.drop(target, axis=1)
    y_test = test[target]
    X_test = test.drop(target, axis=1)

    # Learn the scaling statistics from the training split only and apply
    # the *same* transform to the test split. Re-fitting the scaler on the
    # test data would scale it with different mean/variance than the model
    # was trained on and leak test-set statistics into the evaluation.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    X_test = scaler.transform(X_test)

    cv_lr = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.25, random_state=1)
    lr_cv = LogisticRegressionCV(cv=cv_lr)
    lr_cv.fit(X, y)

    # Compute the probabilities once and reuse them for the AUC.
    probas = lr_cv.predict_proba(X_test)
    # Single-argument print(...) behaves the same as the print statement.
    print('roc auc score: ' + str(sk.metrics.roc_auc_score(y_test, probas.T[1])))
    print('Accuracy: ' + str(lr_cv.score(X_test, y_test)))

In [22]:
def test_both(df):
    """Evaluate *df* with the plain and the cross-validated model in turn.

    Each run is preceded by a heading line so the two sets of printed
    metrics can be told apart.
    """
    runs = (
        ('Without CV:', test_log_reg),
        ('\nWith CV:', test_log_reg_cv),
    )
    for heading, evaluate in runs:
        print(heading)
        evaluate(df)

## Cleaned Data

In [23]:
# Load the cleaned training set, discard the 'Unnamed: 0' column, and
# evaluate both models on it.
df_clean = pd.read_csv('data/cs-train_clean.csv')
df_clean = df_clean.drop(['Unnamed: 0'], axis=1)
test_both(df_clean)

Without CV:
roc auc score: 0.691823983618
Accuracy: 0.932297655018

With CV:
roc auc score: 0.698367311676
Accuracy: 0.93307307682


## Cropped Data

In [198]:
# Load the cropped training set, discard the 'Unnamed: 0' column, and
# evaluate both models on it.
df_cropped = pd.read_csv('data/cs-train_cropped.csv')
df_cropped = df_cropped.drop(['Unnamed: 0'], axis=1)
test_both(df_cropped)

Without CV:
roc auc score: 0.677029551951
Accuracy: 0.929526406811

With CV:
roc auc score: 0.69058125882
Accuracy: 0.932253558601


## Oversampled Data

In [199]:
# Load the oversampled training set, discard the 'Unnamed: 0' column, and
# evaluate both models on it.
df_oversampled = pd.read_csv('data/cs-train_oversampled.csv')
df_oversampled = df_oversampled.drop(['Unnamed: 0'], axis=1)
test_both(df_oversampled)

Without CV:
roc auc score: 0.796312845586
Accuracy: 0.728105333793

With CV:
roc auc score: 0.796431627544
Accuracy: 0.721104844121


## Undersampled Data

In [201]:
# Load the undersampled training set, discard the 'Unnamed: 0' column, and
# evaluate both models on it.
df_undersampled = pd.read_csv('data/cs-train_undersampled.csv')
df_undersampled = df_undersampled.drop(['Unnamed: 0'], axis=1)
test_both(df_undersampled)

Without CV:
roc auc score: 0.797534120528
Accuracy: 0.72098588179

With CV:
roc auc score: 0.803146268055
Accuracy: 0.72098588179
