In [None]:
# Package imports
%matplotlib inline 
from IPython.display import Image
import matplotlib as mlp
import matplotlib.pyplot as plt
import numpy as np
import os
import glob
import pandas as pd
import sklearn
import warnings
from sklearn.metrics import roc_curve
from sklearn.utils import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn import cross_validation
from sklearn import tree
from sklearn import svm
from sklearn import ensemble
from sklearn import neighbors
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
plt.style.use('fivethirtyeight') # Good looking plots
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import logit, probit, poisson, ols
from sklearn import datasets
from pybrain.utilities import percentError
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.structure.modules import SoftmaxLayer
from pybrain.datasets.classification import ClassificationDataSet
from pybrain.tools.validation import Validator

In [None]:
#Creating function for data processing & data cleaning

def featureSelectionRFE():
    """Rank features with recursive feature elimination on a logistic model.

    Reads the module-level ``train_data`` array, using column 0 as the
    target and every remaining column as a candidate feature. The selector
    keeps 10 features and prints the selection mask, the feature ranking
    and the number of selected features.

    NOTE(review): ``train_data`` is not defined in the visible part of this
    notebook; it must exist as a 2-D array before this function is called.

    Returns:
        The value of ``rfe.score`` on the same training data (the original
        computed this score and discarded it).
    """
    # RFE is not part of the notebook's import cell, so it is imported here.
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression

    features = train_data[0:, 1:]
    target = train_data[0:, 0]

    # Pass the feature count by keyword rather than by position.
    rfe = RFE(LogisticRegression(), n_features_to_select=10)
    rfe = rfe.fit(features, target)

    # Summarize the selection of the attributes.
    print(rfe.support_)
    print(rfe.ranking_)
    print(rfe.n_features_)

    # Hand the score back to the caller instead of dropping it.
    return rfe.score(features, target)

def transformDF(df):
    """Add the binary ``delinquent`` target and drop the columns it replaces.

    ``delinquent`` is 1 where ``delq_sts`` is greater than 0, else 0. The new
    column is written onto the frame that was passed in; the returned frame
    is that frame without ``cd_zero_bal`` and ``delq_sts``.
    """
    is_delinquent = df.delq_sts.gt(0)
    df['delinquent'] = is_delinquent.astype(int)
    return df.drop(['cd_zero_bal', 'delq_sts'], axis=1)

def createDummies(df):
    """Append indicator columns for ``repch_flag`` and ``cd_zero_bal``.

    Each distinct value of a source column becomes a new column named
    ``<source column><value>`` (for example ``repch_flagN``). The source
    columns themselves are kept. Returns the widened frame.
    """
    for source_col in ('repch_flag', 'cd_zero_bal'):
        indicators = pd.get_dummies(df[source_col]).rename(
            columns=lambda level, prefix=source_col: prefix + str(level)
        )
        df = pd.concat([df, indicators], axis=1)
    return df
    

def fillNA(df):
    """Replace missing values column by column with fixed defaults.

    The frame is modified in place and also returned. Flag columns get a
    letter default, the two date columns get the placeholder '189901', and
    every other listed column gets 0.
    """
    # (column, replacement) pairs, applied in this order.
    defaults = (
        ('delq_sts', 0),
        ('repch_flag', 'X'),
        ('flag_mod', 'N'),
        ('cd_zero_bal', 0),
        ('dt_zero_bal', '189901'),
        ('non_int_brng_upb', 0),
        ('dt_lst_pi', '189901'),
        ('mi_recoveries', 0),
        ('net_sale_proceeds', 0),
        ('non_mi_recoveries', 0),
        ('expenses', 0),
        ('legal_costs', 0),
        ('maint_pres_costs', 0),
        ('taxes_ins_costs', 0),
        ('misc_costs', 0),
        ('actual_loss', 0),
        ('modcost', 0),
    )
    for column, replacement in defaults:
        df[column] = df[column].fillna(replacement)
    return df

def changedtype(df):
    """Cast the listed columns to float64, int64 and str respectively.

    The frame is modified in place and also returned. Columns that are not
    listed here keep whatever dtype they already have.
    """
    float_columns = ['non_int_brng_upb', 'actual_loss', 'modcost', 'misc_costs',
                     'taxes_ins_costs', 'maint_pres_costs', 'legal_costs',
                     'expenses', 'current_int_rt', 'current_upb']
    int_columns = ['loan_age', 'mths_remng', 'cd_zero_bal', 'delq_sts', 'flag_mod_n']
    text_columns = ['svcg_cycle', 'dt_zero_bal', 'dt_lst_pi']

    conversions = (
        (float_columns, 'float64'),
        (int_columns, 'int64'),
        (text_columns, 'str'),
    )
    for columns, target_dtype in conversions:
        df[columns] = df[columns].astype(target_dtype)
    return df


def createDataFrame(str):
    """Load one pipe-delimited performance file into a cleaned DataFrame.

    Args:
        str: Path of the text file to read. (The name shadows the builtin;
            it is kept so existing callers are unaffected.)

    Returns:
        A DataFrame with the 23 named raw columns plus the derived
        ``flag_mod_n`` and ``Year`` columns, after ``fillNA`` and
        ``changedtype`` have been applied.

    Every column is first read as text. Sentinel codes are then replaced:
    ``delq_sts`` 'R' -> 999 and 'XX' -> 0; ``net_sale_proceeds`` 'U' -> 0 and
    'C' -> the ``current_upb`` value of the same row.
    """
    column_names = ['id_loan', 'svcg_cycle', 'current_upb', 'delq_sts', 'loan_age',
                    'mths_remng', 'repch_flag', 'flag_mod', 'cd_zero_bal',
                    'dt_zero_bal', 'current_int_rt', 'non_int_brng_upb', 'dt_lst_pi',
                    'mi_recoveries', 'net_sale_proceeds', 'non_mi_recoveries',
                    'expenses', 'legal_costs', 'maint_pres_costs', 'taxes_ins_costs',
                    'misc_costs', 'actual_loss', 'modcost']
    perf_df = pd.read_csv(str, sep="|", names=column_names, skipinitialspace=True,
                          error_bad_lines=False, index_col=False, dtype='unicode')

    # Replace the non-numeric delinquency codes with numeric stand-ins.
    delq = perf_df['delq_sts']
    delq = delq.mask(delq == 'R', 999)
    perf_df['delq_sts'] = delq.mask(delq == 'XX', 0)

    # A missing flag_mod is treated as 'N' (the same default fillNA applies).
    # The original compared the raw value with 'N', so every missing flag was
    # encoded as 1.
    perf_df['flag_mod_n'] = perf_df['flag_mod'].fillna('N').ne('N').astype(int)

    # 'U' becomes 0; 'C' takes the current_upb of the SAME row. The original
    # placed the entire current_upb Series into each matching cell.
    proceeds = perf_df['net_sale_proceeds']
    proceeds = proceeds.mask(proceeds == 'U', 0)
    perf_df['net_sale_proceeds'] = proceeds.mask(proceeds == 'C', perf_df['current_upb'])

    # Characters 2-3 of the loan id hold a two-digit year; only '99' is mapped
    # to the 1900s, everything else to the 2000s.
    loan_year = perf_df['id_loan'].str[2:4]
    perf_df['Year'] = ['19' + yy if yy == '99' else '20' + yy for yy in loan_year]

    perf_df = fillNA(perf_df)
    perf_df = changedtype(perf_df)
    return perf_df

#Ensures all required feature columns exist in the dataframe
def checkAllReqColumns(df):
    """Add every column of the module-level ``cols_to_keep`` that ``df`` lacks.

    Missing columns are created in place and filled with 0.0; the same frame
    is returned.
    """
    absent = [name for name in cols_to_keep if name not in df.columns]
    for name in absent:
        df[name] = 0.0
    return df

In [None]:
# The first row of input.csv names the quarter to train on and the quarter to test on.
import csv  # csv is used below but is missing from the notebook's import cell

inputpath = os.path.join(os.getcwd(), 'input.csv')

# Read through a context manager so the file handle is always closed.
with open(inputpath, newline='') as input_file:
    data = list(csv.reader(input_file, delimiter=','))

train = data[0][0]
test = data[0][1]

#Creating DataFrame
# NOTE(review): `year` is not defined anywhere in the visible notebook; it must
# be assigned before this cell runs, otherwise the next line raises NameError.
foldername = 'historical_data1_' + str(year)
# os.path.join replaces the hand-built "\\" separators (and the invalid "\h"
# escape) so the paths also resolve on non-Windows systems.
Historicalpath = os.path.join(os.getcwd(), foldername)
train = os.path.join(Historicalpath, "historical_data1_time_" + str(train) + ".txt")
test = os.path.join(Historicalpath, "historical_data1_time_" + str(test) + ".txt")
print("Creating Training Dataframe...")
train_df = createDataFrame(train)
print("Creating Test Dataframe...")
test_df = createDataFrame(test)

In [None]:
# Add indicator (dummy) columns to both the train and the test frame
print("Creating Dummy vars...")
train_df, test_df = createDummies(train_df), createDummies(test_df)

In [None]:
# Run transformDF on both frames (adds the `delinquent` target, drops delq_sts and cd_zero_bal)
print("Transforming Dataframes...")
train_df, test_df = transformDF(train_df), transformDF(test_df)

In [None]:
# Keep only the numeric columns of each frame
print("Converting to Numeric...")
train_num_df, test_num_df = (frame._get_numeric_data() for frame in (train_df, test_df))

In [None]:
# The 10 feature columns used as model inputs
cols_to_keep = [
    'cd_zero_bal6',
    'cd_zero_bal1',
    'repch_flagX',
    'cd_zero_bal0',
    'repch_flagN',
    'repch_flagY',
    'current_int_rt',
    'cd_zero_bal3',
    'flag_mod_n',
    'loan_age',
]
print("Checking all required columns in train and test dataframes")
# Any feature column a frame lacks is created by checkAllReqColumns.
train_num_df, test_num_df = checkAllReqColumns(train_num_df), checkAllReqColumns(test_num_df)

In [None]:
# Split each numeric frame into the feature matrix (X) and the `delinquent` target (y)
print("Creating X and y variables for Train and Test Dataframes")
train_num_df_X, delinquent_train_y = train_num_df[cols_to_keep], train_num_df['delinquent']
test_num_df_X, delinquent_test_y = test_num_df[cols_to_keep], test_num_df['delinquent']

In [None]:
#Train the data using stratified_cross_validation technique
def stratified_cv(X, y, clf_class, shuffle=True, n_folds=2, **kwargs):
    """Return out-of-fold predictions from stratified k-fold cross-validation.

    Args:
        X: Feature matrix (NumPy array or DataFrame), one row per sample.
        y: Target labels (NumPy array or pandas Series) aligned with ``X``.
        clf_class: Classifier class; a fresh instance is built for each fold.
        shuffle: Passed to ``StratifiedKFold``.
        n_folds: Number of folds.
        **kwargs: Forwarded to the ``clf_class`` constructor.

    Returns:
        A copy of ``y`` (same type) whose values are the predictions made for
        each sample while that sample was in the held-out fold.
    """
    stratified_k_fold = cross_validation.StratifiedKFold(y, n_folds=n_folds, shuffle=shuffle)

    # The fold indices are positions, so index plain arrays. Indexing a Series
    # with them (as the original did) selects by label and is only correct
    # when the index happens to be 0..n-1; X[ii] also failed for a DataFrame.
    features = np.asarray(X)
    labels = np.asarray(y)
    fold_predictions = labels.copy()

    for train_idx, test_idx in stratified_k_fold:
        clf = clf_class(**kwargs)
        clf.fit(features[train_idx], labels[train_idx])
        fold_predictions[test_idx] = clf.predict(features[test_idx])

    # Hand the predictions back in the caller's container type.
    y_pred = y.copy()
    y_pred[:] = fold_predictions
    return y_pred

In [None]:
def confusion_matrix_data(conf_matrix):
    """Draw each confusion matrix as an annotated heatmap on a 2x2 grid.

    Args:
        conf_matrix: Mapping of 1-based grid position -> dict with keys
            ``'matrix'`` (the matrix to plot) and ``'title'`` (panel title).
    """
    # Start from an empty figure. plt.subplots() would also create one
    # full-size axes that the grid panels then overlap.
    fig = plt.figure(figsize=(16, 12))
    fig.suptitle('Confusion Matrix  on Data Set')
    for position, entry in conf_matrix.items():
        ax = fig.add_subplot(2, 2, position)  # grid positions start from 1
        ax.set_title(entry['title'])
        sns.heatmap(entry['matrix'], annot=True, fmt='', ax=ax)

In [None]:
def build_logistic_Regression(train_num_df_X, train_y, test_num_df_X, test_y ):
    """Fit a logistic regression and report train (CV) and test performance.

    Args:
        train_num_df_X: Training feature frame.
        train_y: Training labels.
        test_num_df_X: Test feature frame.
        test_y: Test labels.

    Prints cross-validated accuracy and a classification report for the
    training data, the model coefficients, and accuracy plus a classification
    report for the test data. Plots both confusion matrices and the test ROC
    curve. Returns nothing.

    NOTE(review): the final model is fitted on the raw features while the
    cross-validation runs on standardized features, as in the original.
    """
    # Final model, fitted on the full training set.
    model = LogisticRegression()
    model = model.fit(train_num_df_X, train_y)

    # Cross-validate once and reuse the predictions. The original ran three
    # independent shuffled CV passes, so accuracy, report and confusion matrix
    # could describe three different sets of predictions.
    scaler = preprocessing.StandardScaler()
    X = scaler.fit_transform(train_num_df_X)
    cv_predictions = stratified_cv(X, train_y, linear_model.LogisticRegression)
    logistic_reg_acc_matrix = metrics.accuracy_score(train_y, cv_predictions)
    logistic_reg_class_matrix = metrics.classification_report(train_y, cv_predictions)
    logistic_reg_conf_matrix = metrics.confusion_matrix(train_y, cv_predictions)
    print('Logistic Regression accuracy on train data:           {:.2f}'.format(logistic_reg_acc_matrix))
    print('Logistic Regression classification reprot on train data:\n {}\n'.format(logistic_reg_class_matrix))

    # Predict the test set once; used for the matrix, accuracy and report.
    expected = test_y
    predicted = model.predict(test_num_df_X)
    logistic_reg_conf_matrix_test = confusion_matrix(expected, predicted)
    print('Creating confusion Matrix on Train and Test data')
    conf_matrix = {
                    1: {
                        'matrix': logistic_reg_conf_matrix,
                        'title': 'Logistic Regression on Train Data',
                       },
                    2: {
                        'matrix': logistic_reg_conf_matrix_test,
                        'title': 'Logistic Regression on Test Data',
                       },
                 }
    confusion_matrix_data(conf_matrix)

    acc = np.sum(predicted == expected)/len(expected)
    print("")
    print("Model Coeffiecient")
    print(model.coef_)
    print("")
    print('accuracy on Test data={}'.format(acc))
    print("")
    print("Classification report for Test data %s:\n%s\n"
     % (model, metrics.classification_report(expected, predicted)))

    # Build the ROC from class-1 probabilities; hard 0/1 labels give a single
    # operating point instead of a curve.
    scores = model.predict_proba(test_num_df_X)[:, 1]
    fpr, tpr, _ = roc_curve(test_y, scores)
    #Plot ROC Curve
    print('Creating ROC curve on Test data')
    plt.figure()
    plt.plot(fpr,tpr,label="ROC Curve")
    plt.plot([0,1],[0,1],'k--')
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.05])
    plt.xlabel("1-Specificity")
    plt.ylabel("Sensitivity")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.show()


In [None]:
# Train and evaluate the logistic regression model
print("Calling Logistic Regression with train and test dataframes")
build_logistic_Regression(
    train_num_df_X=train_num_df_X,
    train_y=delinquent_train_y,
    test_num_df_X=test_num_df_X,
    test_y=delinquent_test_y,
)


In [None]:
def build_Random_Forest(train_num_df_X, train_y, test_num_df_X, test_y ):
    """Fit a random forest and report train (CV) and test performance.

    Args:
        train_num_df_X: Training feature frame.
        train_y: Training labels.
        test_num_df_X: Test feature frame.
        test_y: Test labels.

    Prints cross-validated accuracy and a classification report for the
    training data, then accuracy and a classification report for the test
    data. Plots both confusion matrices and the test ROC curve (built from
    class-1 probabilities). Returns nothing.
    """
    n_trees = 10
    model = RandomForestClassifier(n_estimators = n_trees)

    # Cross-validate once, with the same forest size as the final model, and
    # reuse the predictions. The original ran three independent shuffled CV
    # passes, so the three train metrics could disagree.
    scaler = preprocessing.StandardScaler()
    X = scaler.fit_transform(train_num_df_X)
    cv_predictions = stratified_cv(X, train_y, ensemble.RandomForestClassifier, n_estimators=n_trees)
    random_acc_matrix = metrics.accuracy_score(train_y, cv_predictions)
    random_class_matrix = metrics.classification_report(train_y, cv_predictions)
    random_conf_matrix = metrics.confusion_matrix(train_y, cv_predictions)
    print('Random Forest accuracy on train data:           {:.2f}'.format(random_acc_matrix))
    print('Random Forest classification reprot on train data:\n {}\n'.format(random_class_matrix))

    # Final model, fitted on the full training set; predict the test set once.
    model = model.fit(train_num_df_X, train_y)
    expected = test_y
    predicted = model.predict(test_num_df_X)
    random_conf_matrix_test = confusion_matrix(expected, predicted)
    print('Creating confusion Matrix on Train and Test data')
    conf_matrix = {
                    1: {
                        'matrix': random_conf_matrix,
                        'title': 'Ramdom Forest on Train Data',
                       },
                    2: {
                        'matrix': random_conf_matrix_test,
                        'title': 'Ramdom Forest on Test Data',
                       },
                 }
    confusion_matrix_data(conf_matrix)

    acc = np.sum(predicted == expected)/len(expected)
    print("")
    print('accuracy on Test data={}'.format(acc))
    print("")
    print("Classification report for Test data %s:\n%s\n"
     % (model, metrics.classification_report(expected, predicted)))

    # ROC from the predicted probability of class 1.
    preds = model.predict_proba(test_num_df_X)[:,1]
    fpr, tpr, _ = roc_curve(test_y, preds)

    #Plot ROC Curve
    print('Creating ROC curve on Test data')
    plt.figure()
    plt.plot(fpr,tpr,label="ROC Curve")
    plt.plot([0,1],[0,1],'k--')
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.05])
    plt.xlabel("1-Specificity")
    plt.ylabel("Sensitivity")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.show()


In [None]:
# Train and evaluate the random forest classifier
print("Calling Random Forest Classification with train and test dataframes")
build_Random_Forest(
    train_num_df_X=train_num_df_X,
    train_y=delinquent_train_y,
    test_num_df_X=test_num_df_X,
    test_y=delinquent_test_y,
)


In [None]:
def build_SVM(train_num_df_X, train_y, test_num_df_X, test_y ):
    """Fit a linear SVM and report train (CV) and test performance.

    Args:
        train_num_df_X: Training feature frame.
        train_y: Training labels.
        test_num_df_X: Test feature frame.
        test_y: Test labels.

    Prints cross-validated accuracy and a classification report for the
    training data, then accuracy and a classification report for the test
    data. Plots both confusion matrices and the test ROC curve. Returns
    nothing.

    NOTE(review): the final classifier is fitted on the raw features while
    the cross-validation runs on standardized features, as in the original.
    """
    classifier = svm.LinearSVC(C=1)

    # Cross-validate once and reuse the predictions. The original ran three
    # independent shuffled CV passes, so the three train metrics could
    # disagree.
    scaler = preprocessing.StandardScaler()
    X = scaler.fit_transform(train_num_df_X)
    cv_predictions = stratified_cv(X, train_y, svm.LinearSVC)
    svc_acc_matrix = metrics.accuracy_score(train_y, cv_predictions)
    svc_class_matrix = metrics.classification_report(train_y, cv_predictions)
    svc_conf_matrix = metrics.confusion_matrix(train_y, cv_predictions)
    print('Support Vector accuracy on train data:           {:.2f}'.format(svc_acc_matrix))
    print('Support Vector classification reprot on train data:\n {}\n'.format(svc_class_matrix))

    # Final classifier, fitted on the full training set; predict the test set once.
    classifier.fit(train_num_df_X, train_y)
    expected = test_y
    predicted = classifier.predict(test_num_df_X)
    svm_conf_matrix_test = confusion_matrix(expected, predicted)
    print('Creating confusion Matrix on Train and Test data')
    conf_matrix = {
                    1: {
                        'matrix': svc_conf_matrix,
                        'title': 'SVM on Train Data',
                       },
                    2: {
                        'matrix': svm_conf_matrix_test,
                        'title': 'SVM on Test Data',
                       },
                 }
    confusion_matrix_data(conf_matrix)

    acc = np.sum(predicted == expected)/len(expected)
    print("")
    print('accuracy on Test data={}'.format(acc))
    print("")
    print("Classification report for Test data %s:\n%s\n"
     % (classifier, metrics.classification_report(expected, predicted)))

    # LinearSVC has no predict_proba, so the signed distance to the decision
    # boundary is used as the ROC score. Hard 0/1 labels give a single
    # operating point instead of a curve.
    preds = classifier.decision_function(test_num_df_X)
    fpr, tpr, _ = roc_curve(test_y, preds)
    #Plot ROC Curve
    print('Creating ROC curve on Test data')
    plt.figure()
    plt.plot(fpr,tpr,label="ROC Curve")
    plt.plot([0,1],[0,1],'k--')
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.05])
    plt.xlabel("1-Specificity")
    plt.ylabel("Sensitivity")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.show()


In [None]:
# Train and evaluate the linear SVM classifier
print("Calling Support Vector Machine with train and test dataframes")
build_SVM(
    train_num_df_X=train_num_df_X,
    train_y=delinquent_train_y,
    test_num_df_X=test_num_df_X,
    test_y=delinquent_test_y,
)


In [None]:
# build a neural network
def build_neural_network(train_num_df_X, train_y, test_num_df_X, test_y ):
    """Train a small pybrain network and report train and test performance.

    Args:
        train_num_df_X: Training feature frame.
        train_y: Training labels (pandas Series; accessed with ``.iloc``).
        test_num_df_X: Test feature frame.
        test_y: Test labels (pandas Series; accessed with ``.iloc``).

    Builds pybrain classification datasets row by row, trains a network with
    one hidden layer of 5 units and a softmax output for a single epoch, and
    prints percent error, accuracy and a classification report for the test
    data. Plots train and test confusion matrices and the test ROC curve.
    Returns nothing.
    """
    # Number of feature columns defines each dataset's input dimension.
    trn_cols = train_num_df_X.shape[1]
    tst_cols = test_num_df_X.shape[1]

    # build train dataset
    print("Inside build_neural_network : ")
    print("Building train dataset")
    train_data = ClassificationDataSet(trn_cols, 1 , nb_classes=2)
    for k in range(len(train_num_df_X)):
        train_data.addSample(train_num_df_X.iloc[k],train_y.iloc[k])

    # build test dataset
    print("Building test dataset")
    test_data = ClassificationDataSet(tst_cols, 1 , nb_classes=2)
    for k in range(len(test_num_df_X)):
        test_data.addSample(test_num_df_X.iloc[k],test_y.iloc[k])

    print("Train Dataset input length: {}".format(len(train_data['input'])))
    print("Train Dataset output length: {}".format(len(train_data['target'])))
    print("Train Dataset input|output dimensions are {}|{}".format(train_data.indim, train_data.outdim))

    print("Train Data length: {}".format(len(train_data)))
    print("Test Data length: {}".format(len(test_data)))

    # encode with one output neuron per class
    train_data._convertToOneOfMany()
    test_data._convertToOneOfMany()

    print("Train Data input|output dimensions are {}|{}".format(train_data.indim, train_data.outdim))
    print("Test Data input|output dimensions are {}|{}".format(test_data.indim, test_data.outdim))

    # build network (INPUT=10,HIDDEN=5,CLASSES=2,outclass=SoftmaxLayer)
    print("Building Neural network with 5 hidden layer")
    network = buildNetwork(train_data.indim,5,train_data.outdim,outclass=SoftmaxLayer)

    # train network
    print("Training the network, it may take a while...")
    trainer = BackpropTrainer(network,dataset=train_data,momentum=0.1,verbose=True,weightdecay=0.01)
    trainer.trainOnDataset(train_data, 1) #training model on One epoch

    print("Total epochs: {}".format(trainer.totalepochs))

    # test network
    print("Predicting the output array with the trained model")
    # Keep the raw per-class activations; the class-1 column feeds the ROC.
    test_activation = network.activateOnDataset(test_data)
    output = test_activation.argmax(axis=1)

    #Neural network Percent error and accuracy
    print("Percent error: {}".format(percentError(output, test_data['class'])))
    accuracy=Validator.classificationPerformance(output, test_y)
    print("Model Accuracy: {}".format(accuracy))
    print("Classification report for Test data %s:\n%s\n"
     % (network, metrics.classification_report(test_y, output)))

    #Compute confusion metrics
    # The train matrix now comes from predictions on the training data; the
    # original computed it from the test data under the 'Train Data' title.
    train_output = network.activateOnDataset(train_data).argmax(axis=1)
    cm_train = confusion_matrix(train_y,train_output)
    cm_test = confusion_matrix(test_y,output)
    conf_matrix = {
                    1: {
                        'matrix': cm_train,
                        'title': 'Neural Network on Train Data',
                       },
                    2: {
                        'matrix': cm_test,
                        'title': 'Neural Network on Test Data',
                       },
                 }
    confusion_matrix_data(conf_matrix)

    # ROC from the class-1 output activation; hard 0/1 labels give a single
    # operating point instead of a curve.
    fpr, tpr, _ = roc_curve(test_y,test_activation[:, 1])
    #Plot ROC Curve
    print('Creating ROC curve on Test data')
    plt.figure()
    plt.plot(fpr,tpr,label="ROC Curve")
    plt.plot([0,1],[0,1],'k--')
    plt.xlim([0.0,1.0])
    plt.ylim([0.0,1.05])
    plt.xlabel("1-Specificity")
    plt.ylabel("Sensitivity")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.show()


In [None]:
# Train and evaluate the neural network
print("Calling neural network with train and test dataframes")
build_neural_network(
    train_num_df_X=train_num_df_X,
    train_y=delinquent_train_y,
    test_num_df_X=test_num_df_X,
    test_y=delinquent_test_y,
)


In [None]:
def CopyToCSV(delinquent_test,cm,q1):
    """Append one summary row for a quarter to ``DelinquentStatus.csv``.

    Args:
        delinquent_test: Actual test labels (1 = delinquent).
        cm: 2x2 confusion matrix with actual classes on rows and predicted
            classes on columns (the layout scikit-learn's
            ``confusion_matrix`` produces).
        q1: Label of the quarter being summarized.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls accumulate one row per quarter.

    NOTE(review): the record count now comes from ``len(delinquent_test)``
    instead of the global ``test_num_df``; confirm both always have the same
    length in the calling notebook.
    """
    # The original overwrote q1 with "2005" here, ignoring the argument.
    columns=["Quarter","Total Number of actual Deliquent","Total Number of Predicted Deliquent","Total Number of Records in Dataset","Total Number of Deliquent Properly classified","Total Number of Deliquent improperly classified"]
    row = [
        q1,
        np.count_nonzero(delinquent_test==1),
        # Predicted delinquent is the predicted-class-1 COLUMN total; the
        # original summed row 1, which is the actual-delinquent count.
        cm[0][1] + cm[1][1],
        len(delinquent_test),
        cm[1][1],
        cm[1][0],
    ]
    # Build the frame from an ordered list with named columns. The original
    # appended a set literal, which loses order and drops duplicate values.
    df = pd.DataFrame([row], columns=columns)

    filename= "DelinquentStatus.csv"
    write_header = (not os.path.exists(filename)) or os.path.getsize(filename) == 0
    # Open in append mode; 'w' truncated the file on every call.
    with open(filename, 'a',encoding='utf-8',newline="") as f:
        df.to_csv(f, header=write_header,index=False)