In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# Pandas display options: wider text rendering and up to 100 columns shown.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
# Keep the HTML table representation of DataFrames enabled in the notebook.
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Seaborn appearance: "whitegrid" style with the "poster" plotting context.
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
# First three colors of the current seaborn palette, kept as plotting shortcuts.
c0, c1, c2 = sns.color_palette()[:3]

#### Get Data from pickle object

In [3]:
# Load the prepared loan-level DataFrame from a pickle file (path is relative to the notebook).
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
dfWithG=pd.read_pickle('data/dataframe.pkl')

In [4]:
# Binary flag: 0 when the delinquency status equals 0, 1 for every other value.
# NOTE(review): the column name reads as the opposite polarity (1 marks rows whose
# status is not 0) - confirm the naming before interpreting downstream results.
delinquency_status = dfWithG['CURRENT LOAN DELINQUENCY STATUS']
dfWithG['PERFORMING'] = np.where(delinquency_status == 0, 0, 1)

In [5]:
# Preview the first rows, including the derived PERFORMING column.
dfWithG.head()

Unnamed: 0,LOAN SEQUENCE NUMBER,MONTHLY REPORTING PERIOD,CURRENT ACTUAL UPB,CURRENT LOAN DELINQUENCY STATUS,REMAINING MONTHS TO LEGAL MATURITY,REPURCHASE FLAG,MODIFICATION FLAG,ZERO BALANCE CODE,CURRENT INTEREST RATE,CURRENT DEFERRED UPB,year,quarter,CREDIT SCORE,FIRST TIME HOMEBUYER FLAG,MORTGAGE INSURANCE PERCENTAGE,CLTV,DTI Ratio,change-current,change-chained,PERFORMING
0,F199Q1000002,2009-01-01,99584.03125,0,247,F,N,0,6.3,0.0,2009,1,733,N,0,51.0,0.0,-4.5,-5.4,0
1,F199Q1000002,2009-02-01,99386.359375,0,246,F,N,0,6.3,0.0,2009,1,733,N,0,51.0,0.0,-4.5,-5.4,0
2,F199Q1000002,2009-03-01,99187.648438,0,245,F,N,0,6.3,0.0,2009,1,733,N,0,51.0,0.0,-4.5,-5.4,0
3,F199Q1000016,2009-01-01,192371.5625,0,242,F,N,0,6.0,0.0,2009,1,738,N,0,73.0,44.0,-4.5,-5.4,0
4,F199Q1000016,2009-02-01,191958.671875,0,241,F,N,0,6.0,0.0,2009,1,738,N,0,73.0,44.0,-4.5,-5.4,0


In [6]:
# Total number of rows, followed by the number of rows with PERFORMING == 1.
flagged_rows = dfWithG["PERFORMING"] == 1
dtest = dfWithG.loc[flagged_rows]
print(len(dfWithG))
print(len(dtest))

1539707
110554


#### Get data for a quarter

In [7]:
# Keep the 2013 Q4 rows, then collapse to one row per loan by taking the
# column-wise maximum within each LOAN SEQUENCE NUMBER group.
in_2013_q4 = (dfWithG['year'] == 2013) & (dfWithG['quarter'] == 4)
df1 = dfWithG[in_2013_q4].groupby('LOAN SEQUENCE NUMBER').max()

In [8]:
# Number of loans in the quarter, followed by the number with PERFORMING == 1.
flagged_loans = df1["PERFORMING"] == 1
dtest = df1.loc[flagged_loans]
print(len(df1))
print(len(dtest))

15722
1795


In [9]:
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score

def cv_score(clf, x, y, score_func=accuracy_score):
    """Return the mean held-out score of `clf` over 5 cross-validation folds.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refit on
        every fold.
    x : array of samples, indexable by an array of row indices.
    y : array of targets; must expose ``.size`` and accept the same indexing
        as ``x``.
    score_func : callable invoked as ``score_func(y_true, y_pred)``.

    Returns
    -------
    The arithmetic mean of the per-fold scores.
    """
    nfold = 5
    total = 0
    # The legacy KFold(n, n_folds) object is iterated directly and yields
    # (train_indices, test_indices) pairs.
    for train, test in KFold(y.size, nfold):
        clf.fit(x[train], y[train])
        # Ground truth first, predictions second: scikit-learn metrics are
        # defined as f(y_true, y_pred), and asymmetric ones (precision,
        # recall, ...) give wrong values when the two are swapped.
        total += score_func(y[test], clf.predict(x[test]))
    return total / nfold

#### Split the data into a training and test (hold-out) set

In [10]:
from sklearn.cross_validation import train_test_split
# Two-feature design matrix and boolean target (PERFORMING == 1), split with a
# fixed seed so the partition is repeatable.
features_lr = df1[['CREDIT SCORE', 'CURRENT INTEREST RATE']].values
target_lr = (df1['PERFORMING'] == 1).values
Xlr, Xtestlr, ylr, ytestlr = train_test_split(features_lr, target_lr, random_state=5)

In [11]:
# Shapes of the train/test features followed by the train/test targets.
for split_part in (Xlr, Xtestlr, ylr, ytestlr):
    print(split_part.shape)

(11791, 2)
(3931, 2)
(11791,)
(3931,)


In [12]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression with default settings, scored on the held-out split.
clf = LogisticRegression()
clf.fit(Xlr, ylr)
test_predictions = clf.predict(Xtestlr)
print(accuracy_score(ytestlr, test_predictions))

0.892139404732


In [13]:
# Cross-validated accuracy of a fresh default model, computed on the training split only.
clf = LogisticRegression()
score = cv_score(clf, Xlr, ylr)
print (score)

0.883554833581


#### Using GridSearchCV tool

In [14]:
from sklearn.grid_search import GridSearchCV
# Grid-search the C parameter of a logistic regression with 5-fold CV, scored by accuracy.
lrbf2=LogisticRegression()
# NOTE(review): the grid jumps from 0.001 to 0.1 (no 0.01) - confirm this is intentional.
cparam={"C": [0.0001, 0.001, 0.1, 1, 10, 100]}
fitModel = GridSearchCV(lrbf2, param_grid=cparam, cv=5, scoring="accuracy")
fitModel.fit(Xlr, ylr)
# Display the selected estimator, its parameters, its CV score and the per-candidate scores.
fitModel.best_estimator_, fitModel.best_params_, fitModel.best_score_, fitModel.grid_scores_

(LogisticRegression(C=0.0001, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
 {'C': 0.0001},
 0.88372487490458829,
 [mean: 0.88372, std: 0.00015, params: {'C': 0.0001},
  mean: 0.88372, std: 0.00015, params: {'C': 0.001},
  mean: 0.88372, std: 0.00015, params: {'C': 0.1},
  mean: 0.88372, std: 0.00031, params: {'C': 1},
  mean: 0.88364, std: 0.00020, params: {'C': 10},
  mean: 0.88372, std: 0.00031, params: {'C': 100}])

In [15]:
# Build a logistic regression using the C value selected by the grid search (fitModel).
lrbf3=LogisticRegression(C=fitModel.best_params_['C'])
# Fit the model using the training data.
lrbf3.fit(Xlr, ylr)
# Predict the target for the held-out test features.
ypredictNew=lrbf3.predict(Xtestlr)
# Compare the predicted labels (here: the PERFORMING == 1 flag) with the actual test labels.
accuracy_score(ypredictNew,ytestlr)

0.89213940473162046

In [16]:
def cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5):
    """Grid-search `parameters` for `clf` and return the best estimator found.

    The search runs GridSearchCV with `cv=n_folds` on (Xtrain, ytrain), prints
    the winning parameter combination, and hands back `best_estimator_`.
    """
    search = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    search.fit(Xtrain, ytrain)
    print ("BEST PARAMS", search.best_params_)
    return search.best_estimator_

In [17]:
from sklearn.cross_validation import train_test_split
def do_classify(clf, parameters, indf, featurenames, targetname, target1val,
                standardize=False, train_size=0.8, random_state=None):
    """Tune `clf` by grid search on a train split and report train/test accuracy.

    Parameters
    ----------
    clf : estimator handed to `cv_optimize`.
    parameters : parameter grid handed to `cv_optimize`.
    indf : DataFrame holding both the feature columns and the target column.
    featurenames : list of column names used as features.
    targetname : name of the target column.
    target1val : value of `targetname` encoded as class 1; every other value
        becomes class 0.
    standardize : if True, z-score every feature using the mean and standard
        deviation of the training split only, so no test-set statistics leak
        into the model.
    train_size : passed through to `train_test_split`.
    random_state : passed through to `train_test_split`. The default (None)
        keeps the previous unseeded behaviour; pass an integer to make the
        split repeatable.

    Returns
    -------
    tuple
        ``(clf, Xtrain, ytrain, Xtest, ytest)`` where `clf` is the tuned,
        fitted estimator.
    """
    X = indf[featurenames].values
    y = (indf[targetname].values == target1val) * 1
    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X, y, train_size=train_size, random_state=random_state)
    if standardize:
        # Scaling statistics come from the training rows alone and are then
        # applied unchanged to the test rows. pandas is used so that the
        # defaults of the original code (sample std, NaN skipping) are kept.
        train_frame = pd.DataFrame(Xtrain)
        center = train_frame.mean()
        scale = train_frame.std()
        Xtrain = ((train_frame - center) / scale).values
        Xtest = ((pd.DataFrame(Xtest) - center) / scale).values
    clf = cv_optimize(clf, parameters, Xtrain, ytrain)
    # Fit the selected estimator on the whole training split.
    clf = clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print ("Accuracy on training data: %0.5f" % (training_accuracy))
    print ("Accuracy on test data:     %0.5f" % (test_accuracy))
    return clf, Xtrain, ytrain, Xtest, ytest

In [45]:
# Tune C for a logistic regression on four loan features (target: PERFORMING == 1)
# and report train/test accuracy.
logit_c_grid = {"C": [0.0001, 0.001, 0.1, 1, 10, 100]}
logit_features = ['CREDIT SCORE', 'CURRENT INTEREST RATE', 'CLTV', 'DTI Ratio']
clf_l, Xtrain_l, ytrain_l, Xtest_l, ytest_l = do_classify(
    LogisticRegression(), logit_c_grid, df1, logit_features, 'PERFORMING', 1)

BEST PARAMS {'C': 0.0001}
Accuracy on training data: 0.88590
Accuracy on test data:     0.88553


In [46]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
# Predict on the held-out split, then print shapes followed by the values and
# value range of the predictions and of the true labels.
ypred_l = clf_l.predict(Xtest_l)
print(ypred_l.shape)
print(ytest_l.shape)
for label_array in (ypred_l, ytest_l):
    print(label_array)
    print(label_array.min())
    print(label_array.max())

(3145,)
(3145,)
[0 0 0 ..., 0 0 0]
0
0
[0 0 0 ..., 0 0 0]
0
1


In [47]:
# Precision, recall and F-score computed with label 0 as the positive class.
# NOTE(review): the predictions printed in the previous cell contain only 0s
# (min == max == 0), so recall for label 0 is trivially 1.0 and label 1 is never
# predicted - consider also reporting the scores with pos_label=1.
precision_recall_fscore_support(ytest_l, ypred_l,pos_label=0,average='binary')

(0.88553259141494434, 1.0, 0.93929173693085999, None)

In [48]:
# F1 score with label 0 treated as the positive class.
# NOTE(review): ypred_l was shown to contain only 0s, so this value describes an
# "always predict 0" baseline; it says nothing about how well label 1 is detected.
f1=f1_score(ytest_l, ypred_l,pos_label=0,average='binary')
print (f1)

0.939291736931
