# Project MAP 569 - Credit Default Swap

In [1]:
## Import packages

import numpy as np
import math 
import os 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
#from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [3]:
# we import the dataframe
dataframe_train = pd.read_csv('/Users/Romain/Desktop/Master_ITE/Courses_P2/MAP569-MLII/Project-CDS/CreditTraining.csv')
dataframe_train.head()

Unnamed: 0,Id_Customer,Y,Customer_Type,BirthDate,Customer_Open_Date,P_Client,Educational_Level,Marital_Status,Number_Of_Dependant,Years_At_Residence,Net_Annual_Income,Years_At_Business,Prod_Sub_Category,Prod_Decision_Date,Source,Type_Of_Residence,Nb_Of_Products,Prod_Closed_Date,Prod_Category
0,7440,0,Non Existing Client,07/08/1977,13/02/2012,NP_Client,University,Married,3.0,1,36,1.0,C,14/02/2012,Sales,Owned,1,,B
1,573,0,Existing Client,13/06/1974,04/02/2009,P_Client,University,Married,0.0,12,18,2.0,C,30/06/2011,Sales,Parents,1,,G
2,9194,0,Non Existing Client,07/11/1973,03/04/2012,NP_Client,University,Married,2.0,10,36,1.0,C,04/04/2012,Sales,Owned,1,,B
3,3016,1,Existing Client,08/07/1982,25/08/2011,NP_Client,University,Married,3.0,3,36,1.0,C,07/09/2011,Sales,New rent,1,31/12/2012,L
4,6524,0,Non Existing Client,18/08/1953,10/01/2012,NP_Client,University,Married,2.0,1,36,1.0,C,11/01/2012,Sales,Owned,1,,D


In [4]:
print('The shape of our features is:', dataframe_train.shape)
# 19 features originally! -> we need to expand it to deal with numeriacal values (one hot encoding)

The shape of our features is: (5380, 19)


In [5]:
# One-hot encode for categorical features
customer_type = pd.get_dummies(dataframe_train.Customer_Type)
p_client = pd.get_dummies(dataframe_train.P_Client)
educational_level = pd.get_dummies(dataframe_train.Educational_Level)
marital_status = pd.get_dummies(dataframe_train.Marital_Status)
source = pd.get_dummies(dataframe_train.Source)
type_of_residence = pd.get_dummies(dataframe_train.Type_Of_Residence)
prod_category = pd.get_dummies(dataframe_train.Prod_Category)

# concatenation of multiple dataframes
new_df = pd.concat([customer_type, p_client, educational_level, marital_status, source, type_of_residence, prod_category], axis=1, sort=False)
new_df.head()


Unnamed: 0,Existing Client,Non Existing Client,NP_Client,P_Client,Diploma,Master/PhD,Secondary or Less,University,Divorced,Married,...,D,E,F,G,H,I,J,K,L,M
0,0,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
2,0,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0,1,1,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0


In [7]:
# Specific case of Prod_Closed_Date dealing with NaT values. NaT values handled with the 1900-01-01 value for now.
null_date = pd.to_datetime('19000101', format='%Y%m%d', errors='ignore')
dataframe_train.replace({pd.NaT : null_date}, inplace=True)

new_df["BirthDate"] = pd.to_datetime(dataframe_train['BirthDate']).map(lambda BirthDate: BirthDate.timestamp())
new_df["Customer_Open_Date"] = pd.to_datetime(dataframe_train['Customer_Open_Date']).map(lambda Customer_Open_Date: Customer_Open_Date.timestamp())
new_df["Prod_Decision_Date"] = pd.to_datetime(dataframe_train['Prod_Decision_Date']).map(lambda Prod_Decision_Date: Prod_Decision_Date.timestamp())
new_df["Prod_Closed_Date"] = pd.to_datetime(dataframe_train['Prod_Closed_Date']).map(lambda Prod_Closed_Date: Prod_Closed_Date.timestamp())
new_df["Y"] = dataframe_train['Y'] # the labels!
new_df.head()

Unnamed: 0,Existing Client,Non Existing Client,NP_Client,P_Client,Diploma,Master/PhD,Secondary or Less,University,Divorced,Married,...,I,J,K,L,M,BirthDate,Customer_Open_Date,Prod_Decision_Date,Prod_Closed_Date,Y
0,0,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,237168000.0,1329091000.0,1329178000.0,-2208989000.0,0
1,1,0,0,1,0,0,0,1,0,1,...,0,0,0,0,0,140313600.0,1238630000.0,1309392000.0,-2208989000.0,0
2,0,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,111196800.0,1330819000.0,1333498000.0,-2208989000.0,0
3,1,0,1,0,0,0,0,1,0,1,...,0,0,0,1,0,397526400.0,1314230000.0,1310170000.0,1356912000.0,1
4,0,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,-516672000.0,1349050000.0,1351728000.0,-2208989000.0,0


## Features and labels

In [8]:
# Use numpy to convert to arrays
import numpy as np

# Labels are the values we want to predict
labels = np.array(new_df['Y'])

# Remove the labels from the features
# axis 1 refers to the columns
features= new_df.drop('Y', axis = 1)

# Saving feature names for later use
feature_list = list(features.columns)

# Convert to numpy array
features = np.array(features)

In [9]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25,
                                                                           random_state = 42)

In [10]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (4035, 37)
Training Labels Shape: (4035,)
Testing Features Shape: (1345, 37)
Testing Labels Shape: (1345,)


## Training the Forest

In [26]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

# Instantiate model 
rf = RandomForestClassifier(n_estimators=1000, random_state=42)

# Train the model on training data
rf.fit(train_features, train_labels);

## Make Predictions on Test Data

In [27]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))


Mean Absolute Error: 0.06


In [32]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(test_labels, predictions))
pd.DataFrame(confusion_matrix(test_labels, predictions), 
             columns=['Predicted Negative', 'Predicted Positive'], 
             index=['Actual Negative', 'Actual Positive'])

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1236
           1       0.83      0.37      0.51       109

    accuracy                           0.94      1345
   macro avg       0.89      0.68      0.74      1345
weighted avg       0.94      0.94      0.93      1345



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,1228,8
Actual Positive,69,40


## Visualizing a Single Decision Tree

In [34]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot

# Pull out one tree from the forest
tree = rf.estimators_[5]

# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)

# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')

# Write graph to a png file
graph.write_png('tree.png'); 

In [35]:
print('The depth of this tree is:', tree.tree_.max_depth)

The depth of this tree is: 21


In [36]:
# Limit depth of tree to 2 levels
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3, random_state=42)
rf_small.fit(train_features, train_labels)

# Extract the small tree
tree_small = rf_small.estimators_[5]

# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_list, rounded = True, precision = 1)

(graph, ) = pydot.graph_from_dot_file('small_tree.dot')

graph.write_png('small_tree.png')