In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
# Keep the first three colours of the active seaborn palette under short names.
c0, c1, c2 = sns.color_palette()[:3]

#### Get Data from pickle object

In [3]:
# Load the DataFrame stored in the local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
dfWithG=pd.read_pickle('data/dataframe.pkl')

In [4]:
# Flag a row as non-performing ('Y') when its delinquency status is above
# zero, otherwise 'N'. np.where evaluates the comparison for the whole column
# at once instead of looping over every row in Python.
dfWithG['NON-PERFORMING'] = np.where(
    dfWithG['CURRENT LOAN DELINQUENCY STATUS'] > 0, 'Y', 'N'
)

In [5]:
# Preview the first rows, including the newly added NON-PERFORMING column.
dfWithG.head()

Unnamed: 0,LOAN SEQUENCE NUMBER,MONTHLY REPORTING PERIOD,CURRENT ACTUAL UPB,CURRENT LOAN DELINQUENCY STATUS,REMAINING MONTHS TO LEGAL MATURITY,REPURCHASE FLAG,MODIFICATION FLAG,ZERO BALANCE CODE,CURRENT INTEREST RATE,CURRENT DEFERRED UPB,year,quarter,CREDIT SCORE,FIRST TIME HOMEBUYER FLAG,MORTGAGE INSURANCE PERCENTAGE,CLTV,DTI Ratio,change-current,change-chained,NON-PERFORMING
0,F199Q1000002,2009-01-01,99584.03125,0,247,F,N,0,6.3,0.0,2009,1,733,N,0,51.0,0.0,-4.5,-5.4,N
1,F199Q1000002,2009-02-01,99386.359375,0,246,F,N,0,6.3,0.0,2009,1,733,N,0,51.0,0.0,-4.5,-5.4,N
2,F199Q1000002,2009-03-01,99187.648438,0,245,F,N,0,6.3,0.0,2009,1,733,N,0,51.0,0.0,-4.5,-5.4,N
3,F199Q1000016,2009-01-01,192371.5625,0,242,F,N,0,6.0,0.0,2009,1,738,N,0,73.0,44.0,-4.5,-5.4,N
4,F199Q1000016,2009-02-01,191958.671875,0,241,F,N,0,6.0,0.0,2009,1,738,N,0,73.0,44.0,-4.5,-5.4,N


In [6]:
# Row count of the full frame, then of the rows flagged as non-performing.
print(len(dfWithG))
dtest = dfWithG.loc[dfWithG["NON-PERFORMING"] == 'Y']
print(len(dtest))

1538440
109287


#### Get data for a single quarter (2013 Q4)

In [7]:
# Keep the 2013 Q4 rows and collapse them to one row per loan.
# The maximum is taken for each column independently, so the values in one
# output row can come from different monthly records of the same loan.
df1 = (
    dfWithG
    .loc[(dfWithG['year'] == 2013) & (dfWithG['quarter'] == 4)]
    .groupby('LOAN SEQUENCE NUMBER')
    .max()
)

In [8]:
# Turn the groupby key (LOAN SEQUENCE NUMBER) from the index back into a column.
# NOTE(review): df1 is rebound to its own result, so re-running only this cell
# would insert an extra 'index' column.
df1=df1.reset_index()

In [9]:
# Preview the per-loan quarterly frame.
df1.head()

Unnamed: 0,LOAN SEQUENCE NUMBER,MONTHLY REPORTING PERIOD,CURRENT ACTUAL UPB,CURRENT LOAN DELINQUENCY STATUS,REMAINING MONTHS TO LEGAL MATURITY,REPURCHASE FLAG,MODIFICATION FLAG,CURRENT INTEREST RATE,CURRENT DEFERRED UPB,year,quarter,CREDIT SCORE,FIRST TIME HOMEBUYER FLAG,MORTGAGE INSURANCE PERCENTAGE,CLTV,DTI Ratio,change-current,change-chained,NON-PERFORMING
0,F199Q1000024,2013-12-01,86823.820312,0,186,F,N,6.0,0.0,2013,4,635,N,0,76.0,21.0,4.2,2.6,N
1,F199Q1000030,2013-12-01,29080.390625,0,186,F,N,5.875,0.0,2013,4,810,N,0,65.0,41.0,4.2,2.6,N
2,F199Q1000074,2013-12-01,133166.203125,0,185,F,N,8.0,0.0,2013,4,769,N,30,95.0,50.0,4.2,2.6,N
3,F199Q1000077,2013-12-01,162899.0625,33,480,F,Y,5.25,0.0,2013,4,625,N,0,80.0,29.0,4.2,2.6,Y
4,F199Q1000081,2013-12-01,63322.820312,0,185,F,N,7.0,0.0,2013,4,568,N,0,53.0,56.0,4.2,2.6,N


In [10]:
# Number of loans in the quarter, then how many of them are non-performing.
print(len(df1))
dtest = df1.loc[df1["NON-PERFORMING"] == 'Y']
print(len(dtest))

15684
1757


#### Build two equal-sized samples (flag Y and flag N) and merge them

In [11]:
# Separate the quarterly loans into non-performing (Y) and performing (N).
dfNP = df1[df1["NON-PERFORMING"].eq('Y')]
dfP = df1[df1["NON-PERFORMING"].eq('N')]

In [12]:
# Down-sample each class to 1700 rows so both classes have the same size.
# A fixed random_state makes the draw repeatable after a kernel restart;
# without it every run trains and evaluates on a different sample.
dfNP = dfNP.sample(n=1700, random_state=42)
dfP = dfP.sample(n=1700, random_state=42)

In [13]:
# Confirm that both samples have the same number of rows.
print(len(dfNP), len(dfP), sep="\n")

1700
1700


In [14]:
# Stack the two balanced samples into one frame.
# pd.concat replaces DataFrame.append, which was deprecated and later removed
# from pandas; the resulting rows and index are the same.
df1 = pd.concat([dfNP, dfP])

In [15]:
# Row count of the merged, class-balanced frame.
print(len(df1))

3400


#### `cv_score`: splits the data into n folds with KFold and averages the score across the folds

In [16]:
# sklearn.cross_validation and sklearn.grid_search were removed from
# scikit-learn; the same tools live in sklearn.model_selection.
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
def cv_score(clf, x, y, score_func=accuracy_score, nfold=5):
    """Return the mean held-out score of ``clf`` over ``nfold`` K-fold splits.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict``.
    x : array of features, indexable by an integer index array.
    y : array of targets with the same number of rows as ``x``.
    score_func : callable taking ``(y_true, y_pred)``; defaults to
        ``accuracy_score``.
    nfold : number of consecutive, unshuffled folds (default 5).

    Returns
    -------
    The sum of the per-fold scores divided by ``nfold``.
    """
    total = 0
    for train, test in KFold(n_splits=nfold).split(x, y):
        clf.fit(x[train], y[train])
        # Metrics expect (y_true, y_pred); the order matters for asymmetric
        # scores such as precision or recall.
        total += score_func(y[test], clf.predict(x[test]))
    return total / nfold

### Using do_classify method

In [17]:
def cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=5):
    """Grid-search ``parameters`` for ``clf`` and return the best estimator.

    The search runs ``n_folds``-fold cross-validation on ``Xtrain``/``ytrain``,
    prints the winning parameter combination and returns the search's
    ``best_estimator_``.
    """
    search = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    search.fit(Xtrain, ytrain)
    print ("BEST PARAMS", search.best_params_)
    return search.best_estimator_

In [18]:
# sklearn.cross_validation was removed from scikit-learn; train_test_split
# now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
def do_classify(clf, parameters, indf, featurenames, targetname, target1val, standardize=False, train_size=0.8, random_state=None):
    """Tune, fit and evaluate a classifier on columns of ``indf``.

    Parameters
    ----------
    clf : estimator to tune.
    parameters : parameter grid passed on to ``cv_optimize``.
    indf : DataFrame holding the feature and target columns.
    featurenames : list of column names used as features.
    targetname : name of the target column.
    target1val : value of the target column that is encoded as class 1;
        every other value becomes class 0.
    standardize : when True, centre and scale each feature using the mean and
        sample standard deviation of the training rows only.
    train_size : fraction of rows used for training.
    random_state : seed passed to ``train_test_split``; the default ``None``
        keeps the split random on every call.

    Returns
    -------
    (clf, Xtrain, ytrain, Xtest, ytest) with ``clf`` fitted on the training
    split. Training and test accuracy are printed.
    """
    X = indf[featurenames].values
    y = (indf[targetname].values == target1val).astype(int)
    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )
    if standardize:
        # Scaling statistics come from the training rows only, so nothing
        # about the held-out rows leaks into the fitted model.
        center = Xtrain.mean(axis=0)
        scale = Xtrain.std(axis=0, ddof=1)  # ddof=1 matches DataFrame.std()
        Xtrain = (Xtrain - center) / scale
        Xtest = (Xtest - center) / scale
    clf = cv_optimize(clf, parameters, Xtrain, ytrain)
    clf = clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print ("Accuracy on training data: %0.5f" % (training_accuracy))
    print ("Accuracy on test data:     %0.5f" % (test_accuracy))
    return clf, Xtrain, ytrain, Xtest, ytest

####  1st Attempt with all four features

In [19]:
# Logistic regression on all four features; C is chosen by grid search.
# NOTE(review): the grid has no C=0.01 between 0.001 and 0.1 - confirm this is intended.
clf_l, Xtrain_l, ytrain_l, Xtest_l, ytest_l = do_classify(
    clf=LogisticRegression(),
    parameters={"C": [0.0001, 0.001, 0.1, 1, 10, 100]},
    indf=df1,
    featurenames=['CREDIT SCORE', 'CURRENT INTEREST RATE', 'CLTV', 'DTI Ratio'],
    targetname='NON-PERFORMING',
    target1val='Y',
)

BEST PARAMS {'C': 10}
Accuracy on training data: 0.67022
Accuracy on test data:     0.64265


#### y values are 0 and 1 (0 = False, 1 = True); y is 1 when NON-PERFORMING == 'Y'

In [20]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
# Predict on the held-out rows of the four-feature model.
ypred_l = clf_l.predict(Xtest_l)

# Sanity check: shapes of predictions and truth, then the label range of each.
print(ypred_l.shape, ytest_l.shape, sep="\n")
print(ypred_l.min(), ypred_l.max(), sep="\n")
print(ytest_l.min(), ytest_l.max(), sep="\n")


(680,)
(680,)
0
1
0
1


In [21]:
#yscore_l=clf_l.predict_proba(Xtest_l)

In [22]:
#yscore_l.

In [23]:
#from sklearn.metrics import roc_curve
#print (ytest_l.shape)
#print (yscore_l.shape)
#fpr, tpr, thresholds = roc_curve(ytest_l, yscore_l,pos_label=1)

In [24]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the four-feature model: true labels are passed first,
# so rows are the actual class and columns the predicted class.
confusion_matrix(ytest_l, ypred_l)

array([[208, 136],
       [107, 229]])

#### precision = tp / (tp + fp), recall = tp / (tp + fn)

In [25]:
# Precision, recall, F-score and support for the positive class (label 1);
# support comes back as None because an average mode is requested.
precision_recall_fscore_support(ytest_l, ypred_l,pos_label=1,average='binary')

(0.62739726027397258, 0.68154761904761907, 0.65335235378031387, None)

In [26]:
# F1 score of the positive class (label 1) for the four-feature model.
f1 = f1_score(ytest_l, ypred_l, average='binary', pos_label=1)
print(f1)

0.65335235378


#### 2nd Attempt using three features 'CREDIT SCORE','CURRENT INTEREST RATE','CLTV'

In [27]:
# Same model and C grid as the first attempt, without the 'DTI Ratio' feature.
clf_2, Xtrain_2, ytrain_2, Xtest_2, ytest_2 = do_classify(
    clf=LogisticRegression(),
    parameters={"C": [0.0001, 0.001, 0.1, 1, 10, 100]},
    indf=df1,
    featurenames=['CREDIT SCORE', 'CURRENT INTEREST RATE', 'CLTV'],
    targetname='NON-PERFORMING',
    target1val='Y',
)

BEST PARAMS {'C': 1}
Accuracy on training data: 0.66176
Accuracy on test data:     0.66618


In [28]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
# Predict on the held-out rows of the three-feature model.
ypred_2=clf_2.predict(Xtest_2)



In [29]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the three-feature model: rows are the actual class,
# columns the predicted class.
confusion_matrix(ytest_2, ypred_2)

array([[218, 123],
       [104, 235]])

In [30]:
# Precision, recall, F-score and support for the positive class (label 1)
# of the three-feature model.
precision_recall_fscore_support(ytest_2, ypred_2,pos_label=1,average='binary')

(0.65642458100558654, 0.69321533923303835, 0.67431850789096126, None)

In [31]:
# F1 score of the positive class (label 1) for the three-feature model.
# This rebinds the f1 variable set for the four-feature model.
f1 = f1_score(ytest_2, ypred_2, average='binary', pos_label=1)
print(f1)

0.674318507891
