# Final Project

Josep Puig Ruiz

Due December 11th, 2019

EN.553.688: Financial Computing, Fall 2019

Instructor: Dr. Naiman

----

# Importing Packages

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import statistics
from sklearn import tree
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

----

# Preprocessing

In [2]:
# Reading data from CSV
dfA0 = pd.read_csv("A0.csv").set_index("LID")
dfA1 = pd.read_csv("A1.csv").set_index("LID")
dfA2 = pd.read_csv("A2.csv").set_index("LID")
dfCF0 = pd.read_csv("CASH_FLOW0.csv").set_index("LID")
dfCF1 = pd.read_csv("CASH_FLOW1.csv").set_index("LID")

# Function to preprocess Acquisition DataFrames
def preprocessdfA(dfA):
    # Convert ORIG_DATE into days from "2000-01-01"
    dfA["ORIG_DATE"] = pd.to_datetime(dfA["ORIG_DATE"]).apply(findDays)
    
    # Delete features that I do not consider for my model 
    del dfA['FIRST_PMT_DATE']
    del dfA['MORT_INS_PCT']
    del dfA['MORT_INS_TYPE']
    del dfA['SELLER']
    del dfA['ZIP']
    del dfA['CO_CREDIT_SCORE']
    del dfA['STATE']
    del dfA['OCHANNEL']
    del dfA['PROD_TYPE']
    del dfA['OCC_STATUS']
    
    # Fill missing values 
    dfA['CLTV'].fillna((dfA['CLTV'].mean()), inplace=True)
    dfA['LTV'].fillna((dfA['CLTV'].mean()), inplace=True)
    dfA['NBORROWERS'].fillna((dfA['NBORROWERS'].mean()), inplace=True)
    dfA['DTI'].fillna((dfA['DTI'].mean()), inplace=True)
    dfA['CREDIT_SCORE'].fillna((dfA['CREDIT_SCORE'].mean()), inplace=True)
    
    # Encode categorical features
    categorical_feature_mask = dfA.dtypes==object
    categorical_cols = dfA.columns[categorical_feature_mask].tolist()
    le = LabelEncoder()
    dfA[categorical_cols] = dfA[categorical_cols].apply(lambda col: le.fit_transform(col))
    
    return

# Function to find days from arbitrary date
def findDays(a):
    date_format = "%Y-%d-%m"
    b = datetime.strptime("2000-01-01", date_format)
    return (a-b).days

# Preprocess the Acquisition DataFrames
preprocessdfA(dfA0)
preprocessdfA(dfA1)
preprocessdfA(dfA2)

# Function to preprocess Performance DataFrames
def preprocessCF(dfCF):    
    Y = pd.pivot_table(dfCF, index = ['LID'], 
                            aggfunc={'LOAN_AGE':[p12, p24, p36, p48, p60], 
                                     'PAYMENT':[r12, r24, r36, r48, r60]})
    return Y

#Required functions to preprocess Performance DataFrames
def p12(a):
    return 1 if len(a)>=12 else 0
def p24(a):
    return 1 if len(a)>=24 else 0
def p36(a):
    return 1 if len(a)>=36 else 0
def p48(a):
    return 1 if len(a)>=48 else 0
def p60(a):
    return 1 if len(a)>=60 else 0
def r12(a):
    return sum([a.iloc[i] for i in range(min(13, len(a)))]) 
def r24(a):
    return sum([a.iloc[i] for i in range(min(25, len(a)))])
def r36(a):
    return sum([a.iloc[i] for i in range(min(37, len(a)))])
def r48(a):
    return sum([a.iloc[i] for i in range(min(49, len(a)))])
def r60(a):
    return sum([a.iloc[i] for i in range(min(61, len(a)))])


# Preprocess Performance Data Frames
X0 = dfA0
y0 = preprocessCF(dfCF0)
X0.sort_index(ascending=True, inplace=True)
y0.sort_index(ascending=True, inplace=True)

X1 = dfA1
y1 = preprocessCF(dfCF1)
X1.sort_index(ascending=True, inplace=True)
y1.sort_index(ascending=True, inplace=True)

-----

# Estimating loan active chance, p(m), from A0, CF0, A1, and CF1

In [3]:
# Function to find a score in case y_predicted is a 0 or a 1
def score(y_real, y_predicted):
    return (sum([1 for i in range(len(y_real)) if y_real[i] != y_predicted[i] ])/len(y_real))

# Function to find a score in case y_predicted is a continuous value between 0 and 1
def scoreProb(y_real, y_predicted):
    return statistics.mean([abs(y_real[i]-y_predicted[i][1]) for i in range(len(y_real))])
    
# Function to find scores to predict algorithm performance
def getScores(dataSetNumber, pX):
    if dataSetNumber == 0:
        X, y = X0, y0
    elif dataSetNumber == 1:
        X, y = X1, y1
    else: 
        print("dataSetNumber can be only 0 or 1. Error")
        return
    # Split data into Train & Test
    Xtrain, Xtest, ytrain, ytest = train_test_split(X.values, y.loc[:,("LOAN_AGE", pX)].values, 
                                                    test_size=0.2, random_state=0)
    
    
    # Logistic Regression    
    logmodel = LogisticRegression(solver="lbfgs")
    logmodel.fit(Xtrain, ytrain)
    ypred = logmodel.predict(Xtest)
    sLR = score(ytest, ypred)
    
    # Logistic Regression with probability    
    ypred = logmodel.predict_proba(Xtest)
    sLRp = scoreProb(ytest, ypred)
    
    # Random Forest
    rfclass = RandomForestClassifier(n_estimators = 10, random_state = 42)
    rfclass.fit(Xtrain, ytrain)
    ypred = rfclass.predict(Xtest)
    sRF = score(ytest, ypred)
    
    # Decision Tree
    treeclass = tree.DecisionTreeClassifier(min_samples_split=25,min_samples_leaf=25)
    treeclass = treeclass.fit(Xtrain, ytrain)
    ypred = treeclass.predict(Xtest)
    sDT = score(ytest, ypred)

    # Gaussian NB
    gnb = GaussianNB()
    gnbclass = gnb.fit(Xtrain, ytrain)
    ypred = gnbclass.predict(Xtest)
    sGNB = score(ytest, ypred)
    
    return [sLR, sLRp, sRF, sDT, sGNB]


# Create a dataframe to store results
scores = {'ML Algorithm':['Logistic Regression', 'Logistic Regression with Prob','Random Forest', 
                          'Decision Tree', 'Gaussian NB']}
dfP = pd.DataFrame(scores) 

pXlist = ["p60", "p48", "p36", "p24", "p12"]

# Double loop to use find scores for algorithms, using both datasets 0 and 1, and also 12, 24, ..., 60 months
for pX in pXlist:
    for i in [1, 0]:
        nameColumn = "score"+str(i)+pX
        dfP.insert(1, nameColumn, getScores(i, pX))

#
dfP

Unnamed: 0,ML Algorithm,score0p12,score1p12,score0p24,score1p24,score0p36,score1p36,score0p48,score1p48,score0p60,score1p60
0,Logistic Regression,0.145754,0.146201,0.392712,0.397535,0.218472,0.219845,0.141028,0.140676,0.101491,0.101779
1,Logistic Regression with Prob,0.244411,0.245538,0.468857,0.470413,0.315654,0.317952,0.223259,0.224152,0.169683,0.170887
2,Random Forest,0.159455,0.1607,0.378629,0.377224,0.222208,0.223454,0.146457,0.146329,0.105068,0.10593
3,Decision Tree,0.14997,0.151598,0.37106,0.372018,0.226519,0.22802,0.148085,0.147894,0.105835,0.107559
4,Gaussian NB,0.157666,0.161307,0.441574,0.446109,0.230256,0.234982,0.16284,0.164436,0.129595,0.132118


----

# Estimating cumulative revenues, r(m), from A0, CF0, A1, and CF1

In [36]:
# Function to find a score in case y_predicted is a 0 or a 1
def score(y_real, y_predicted):
    return statistics.mean([abs(y_real[i]-y_predicted[i]) for i in range(len(y_real))])

# Function to find scores to predict algorithm performance
def getScores(dataSetNumber, rX):
    if dataSetNumber == 0:
        X, y = X0, y0
    elif dataSetNumber == 1:
        X, y = X1, y1
    else: 
        print("dataSetNumber can be only 0 or 1. Error")
        return
    
    # Split data into Train & Test
    Xtrain, Xtest, ytrain, ytest = train_test_split(X.values, y.loc[:,("PAYMENT", rX)].values, 
                                                    test_size=0.2, random_state=0)
    
    
    # Linear Regression without polynomial feature
    reg = LinearRegression().fit(Xtrain, ytrain)
    ypred = reg.predict(Xtest)
    lr0 = score(ytest, ypred)
    
    # Linear Regression without polynomial feature
    poly = PolynomialFeatures(2, interaction_only = False)
    X2=poly.fit_transform(Xtrain)
    Xtest2 = poly.transform(Xtest)
    reg = LinearRegression().fit(X2, ytrain)
    ypred = reg.predict(Xtest2)
    lr1 = score(ytest, ypred)
    
    # Linear Regression with polynomial feature, only interaction
    poly = PolynomialFeatures(2, interaction_only = True)
    X3=poly.fit_transform(Xtrain)
    Xtest3 = poly.transform(Xtest)
    reg = LinearRegression().fit(X3, ytrain)
    ypred = reg.predict(Xtest3)
    lr2 = score(ytest, ypred)

    # Linear Regression with polynomial feature, degree 3
    poly = PolynomialFeatures(3, interaction_only = False)
    X4=poly.fit_transform(Xtrain)
    Xtest4 = poly.transform(Xtest)
    reg = LinearRegression().fit(X4, ytrain)
    ypred = reg.predict(Xtest4)
    lr3 = score(ytest, ypred)

    # Linear Regression with polynomial feature, degree 4
    poly = PolynomialFeatures(4, interaction_only = False)
    X4=poly.fit_transform(Xtrain)
    Xtest4 = poly.transform(Xtest)
    reg = LinearRegression().fit(X4, ytrain)
    ypred = reg.predict(Xtest4)
    lr4 = score(ytest, ypred)

    
    return [lr0, lr1, lr2, lr3, lr4]

# Create a dataframe to store results
scores = {'ML Algorithm':['No Polyn Feat', 'Polyn Feat=2', 'Polyn Feat=2, interactions only', 'Polyn Feat=3', 'Polyn Feat=4']}
dfR = pd.DataFrame(scores) 


# Double loop to use find scores for algorithms, using both datasets 0 and 1, and also 12, 24, ..., 60 months
rXlist = ["r60", "r48", "r36", "r24", "r12"]
for rX in rXlist:
    for i in [1, 0]:
        nameColumn = "error"+str(i)+rX
        dfR.insert(1, nameColumn, getScores(i, rX))

dfR

Unnamed: 0,ML Algorithm,error0r12,error1r12,error0r24,error1r24,error0r36,error1r36,error0r48,error1r48,error0r60,error1r60
0,No Polyn Feat,45146.482315,44921.199498,52235.489734,52072.389364,26949.938751,27006.449521,17574.723187,17597.281899,13260.524131,13300.445677
1,Polyn Feat=2,41309.782589,40984.623562,50790.067303,50784.255857,26570.419453,26555.996499,17290.061858,17282.126608,13070.907878,13103.745736
2,"Polyn Feat=2, interactions only",41629.78691,41378.536269,51141.600388,51116.554357,26972.102978,27000.679034,17470.170961,17489.102394,13123.272003,13167.677025
3,3,40642.218351,40362.870625,50323.84459,50249.051099,26477.067487,26481.922653,17322.87481,17324.209036,13101.98767,13163.846933
4,4,40782.356901,40528.019144,50759.435038,50890.275622,26725.777727,26694.735586,17349.238495,17404.920785,13097.156791,13205.929485


-------

# Predict performance for A2

In [5]:
# Concatenate data from datasets 0 and 1 to use it all for training
Xtrain = pd.concat([X0,X1], axis=0)
ytrain = pd.concat([y0,y1], axis=0)
Xtrain.sort_index(ascending=True, inplace=True)
ytrain.sort_index(ascending=True, inplace=True)

# dfA2 stores the acquisition data for dataset 2
X2 = dfA2


# Apply the best performing algorithms to calculate p12, p24, ..., p60

logmodel = LogisticRegression(solver="lbfgs")
logmodel.fit(Xtrain.values, ytrain.loc[:,("LOAN_AGE", "p12")].values)
p12 = logmodel.predict(X2)

treeclass = tree.DecisionTreeClassifier(min_samples_split=25,min_samples_leaf=25)
treeclass = treeclass.fit(Xtrain, ytrain.loc[:,("LOAN_AGE", "p24")].values)
p24 = treeclass.predict(X2)

logmodel = LogisticRegression(solver="lbfgs")
logmodel.fit(Xtrain.values, ytrain.loc[:,("LOAN_AGE", "p36")].values)
p36 = logmodel.predict(X2)

logmodel = LogisticRegression(solver="lbfgs")
logmodel.fit(Xtrain.values, ytrain.loc[:,("LOAN_AGE", "p48")].values)
p48 = logmodel.predict(X2)

logmodel = LogisticRegression(solver="lbfgs")
logmodel.fit(Xtrain.values, ytrain.loc[:,("LOAN_AGE", "p60")].values)
p60 = logmodel.predict(X2)


# Apply the best performing algorithms to calculate r12, r24, ..., r60

poly = PolynomialFeatures(3, interaction_only = False)
XtrainPF=poly.fit_transform(Xtrain)
X2PF = poly.transform(X2)
reg = LinearRegression().fit(XtrainPF, ytrain.loc[:,("PAYMENT", "r12")].values)
r12 = reg.predict(X2PF)

poly = PolynomialFeatures(3, interaction_only = False)
XtrainPF=poly.fit_transform(Xtrain)
X2PF = poly.transform(X2)
reg = LinearRegression().fit(XtrainPF, ytrain.loc[:,("PAYMENT", "r12")].values)
r24 = reg.predict(X2PF)

poly = PolynomialFeatures(3, interaction_only = False)
XtrainPF=poly.fit_transform(Xtrain)
X2PF = poly.transform(X2)
reg = LinearRegression().fit(XtrainPF, ytrain.loc[:,("PAYMENT", "r12")].values)
r36 = reg.predict(X2PF)

poly = PolynomialFeatures(2, interaction_only = False)
XtrainPF=poly.fit_transform(Xtrain)
X2PF = poly.transform(X2)
reg = LinearRegression().fit(XtrainPF, ytrain.loc[:,("PAYMENT", "r12")].values)
r48 = reg.predict(X2PF)

poly = PolynomialFeatures(2, interaction_only = False)
XtrainPF=poly.fit_transform(Xtrain)
X2PF = poly.transform(X2)
reg = LinearRegression().fit(XtrainPF, ytrain.loc[:,("PAYMENT", "r12")].values)
r60 = reg.predict(X2PF)


# Save the predicted values into a DataFrame
dfOutput = pd.DataFrame(index=X2.index)
dfOutput.insert(0, "r(60)", r60)
dfOutput.insert(0, "r(48)", r48)
dfOutput.insert(0, "r(36)", r36)
dfOutput.insert(0, "r(24)", r24)
dfOutput.insert(0, "r(12)", r12)
dfOutput.insert(0, "p(60)", p60)
dfOutput.insert(0, "p(48)", p48)
dfOutput.insert(0, "p(36)", p36)
dfOutput.insert(0, "p(24)", p24)
dfOutput.insert(0, "p(12)", p12)

# Export predicted values into csv file
dfOutput.to_csv(r'predictions.csv')

In [7]:
# Save the predicted values into a DataFrame
dfOutput = pd.DataFrame(index=X2.index)
dfOutput.insert(0, "r(60)", r60)
dfOutput.insert(0, "r(48)", r48)
dfOutput.insert(0, "r(36)", r36)
dfOutput.insert(0, "r(24)", r24)
dfOutput.insert(0, "r(12)", r12)
dfOutput.insert(0, "p(60)", p60)
dfOutput.insert(0, "p(48)", p48)
dfOutput.insert(0, "p(36)", p36)
dfOutput.insert(0, "p(24)", p24)
dfOutput.insert(0, "p(12)", p12)

# Export predicted values into csv file
dfOutput.to_csv(r'predictions.csv')

dfOutput.head()

Unnamed: 0_level_0,p(12),p(24),p(36),p(48),p(60),r(12),r(24),r(36),r(48),r(60)
LID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
899993094470,1.0,1.0,0.0,0.0,0.0,19220.285672,19220.285672,19220.285672,34832.315102,34832.315102
899984345565,1.0,0.0,0.0,0.0,0.0,20298.503803,20298.503803,20298.503803,36344.393849,36344.393849
899962467067,1.0,0.0,0.0,0.0,0.0,66227.309269,66227.309269,66227.309269,92823.555975,92823.555975
899956562768,1.0,0.0,0.0,0.0,0.0,41100.489221,41100.489221,41100.489221,89393.455914,89393.455914
899955704832,1.0,1.0,0.0,0.0,0.0,21953.316045,21953.316045,21953.316045,41179.30605,41179.30605


---
---
# END OF PROJECT

The remaining code snippets are from alternative approaches that I tried.

In [None]:

# Decision Tree
treeclass = tree.DecisionTreeClassifier(min_samples_split=25,min_samples_leaf=25)
treeclass = treeclass.fit(X_train, y_train)
y_tree = treeclass.predict(X_test)
CFtree = confusion_matrix(y_test, y_tree)
print("Confusion Matrix for DECISION TREE: \n", CFtree)
print("Accuracy: ", accuracy_CF(CFtree))

# Gaussian NB
gnb = GaussianNB()
gnbclass = gnb.fit(X_train, y_train)
y_gnb = gnbclass.predict(X_test)
CFgnb = confusion_matrix(y_test, y_gnb)
print("Confusion Matrix for Gaussian Naive Bayes: \n", CFgnb)
print("Accuracy: ", accuracy_CF(CFgnb))

# Neural Network
nnwclass = MLPClassifier().fit(X_train, y_train)
y_nnw = nnwclass.predict(X_test)
CFnnw = confusion_matrix(y_test, y_nnw)
print("Confusion Matrix for NEURAL NETWORK: \n", CFnnw)
print("Accuracy: ", accuracy_CF(CFnnw))

# SVM
svmclass = svm.SVC()
svmclass.fit(X_train, y_train)
y_svm = svmclass.predict(X_test)
CFsvm = confusion_matrix(y_test, y_svm)
print("Confusion Matrix for SVM: \n", CFsvm)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(Xtrain0.values, Ytrain0.loc[:,("LOAN_AGE", "p60")].values, 
                                                    test_size=0.25, random_state=0)
avg_score = [-1]*100


# Logistic Regession
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
y_log_prob = logmodel.predict_proba(X_test)
y_log = logmodel.predict(X_test)

y_log_method= classifyLogistic(y_log_prob, 0.5)


CFlog = confusion_matrix(y_test, y_log_method)
print("Confusion Matrix for LOGISTIC REGRESSION: \n", CFlog)
print("Accuracy: ", accuracy_CF(CFlog))



X_score = Xtrain1
y_score = Ytrain1.loc[:,("LOAN_AGE", "p60")].values
y_log_proba_score = logmodel.predict_proba(X_score)

import statistics    
def findScore(y_score, y_log_proba_score, threshold):
    y_log = classifyLogistic(y_log_proba_score, threshold)
    absDifference = statistics.mean([abs(y_score[i]-y_log[i]) for i in range(len(y_score))])
    return absDifference


def classifyLogistic(y_log_prob, threshold):
    y_log = []
    for [a,b] in y_log_prob:
        if a<threshold:
            y_log.append(1)
        else:
            y_log.append(0)
    return y_log

score = []
threshold = [0.01+0.01*i for i in range(99)]
for t in threshold:
    print("Threshold: ", t, "Mean of deviation: ", findScore(y_score, y_log_proba_score, t))
    score.append(findScore(y_score, y_log_proba_score, t))


plt.plot(threshold, score)
print(y_log_prob)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(Xtrain0.values, Ytrain0.loc[:,("LOAN_AGE", "p12")].values, 
                                                    test_size=0.25, random_state=0)


def accuracy_CF(CF):
    return (CF[0][0]+CF[1][1])/(CF[0][0]+CF[1][1]+CF[1][0]+CF[0][1])

def meanDifference(y_prob, y_real):
    return(statistics.mean([abs(y_prob[i][1]-y_real[i]) for i in range(len(y_prob))]))


    
    
# Logistic Regession
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
y_log_prob = logmodel.predict_proba(X_test)
y_log = logmodel.predict(X_test)
CFlog = confusion_matrix(y_test, y_log)
print("Confusion Matrix for LOGISTIC REGRESSION: \n", CFlog)
print("Accuracy: ", accuracy_CF(CFlog))
print("Mean difference: ", meanDifference(y_log_prob, y_test))