# Customer defaulting likelihood

In [79]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix

## Data preprocessing

In [138]:
# Raise the column display limit so the wide frames shown below are not truncated
pd.options.display.max_columns = 500

In [139]:
# Import data
# Open the Excel workbook as an ExcelFile handle; the sheet itself is read from
# this handle in a following cell. The path is relative, so it is resolved
# against the kernel's current working directory.
data_0716 = pd.ExcelFile('models/FE data - 0716S.xlsx')

In [140]:
# Read the '0716SDL' sheet of the opened workbook into a DataFrame
all_data = pd.read_excel(data_0716, '0716SDL')
# Preview the first rows (rendered as the cell output)
all_data.head()

Unnamed: 0,NRIC,Age,Race,Sex,Nation,Address,Tel,Mth(s),Anklet,Bangle,Bracelet,Chain,Earring,Earstud,Necklace,Pendant,Ring,O,24K,22K,20K,18K,14K,9K,W,Value ($),Interest Payable ($),Status,Defaulted
0,S,51,C,F,S,H,M,1,0,0,0,0,0,0,0,1,0,0,0.0,20.0,0.0,0.0,0,0.0,0.0,1100,11.0,C,False
1,S,25,I,M,S,H,M,7,0,0,0,0,0,2,0,0,0,0,0.0,2.5,0.0,0.0,0,0.0,0.0,110,11.55,L,False
2,S,51,C,F,S,H,M,6,0,0,0,0,0,0,1,0,0,0,0.0,49.3,0.0,0.0,0,0.0,0.0,2170,195.3,C,False
3,S,51,C,F,S,H,M,6,0,0,0,1,0,0,0,0,0,0,0.0,37.9,0.0,0.0,0,0.0,0.0,1550,139.5,C,False
4,S,51,C,M,S,C,M,1,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0,0.0,129.6,10000,100.0,C,False


In [141]:
# Summarise column dtypes and non-null counts of the loaded data
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 29 columns):
NRIC                    73 non-null object
Age                     73 non-null int64
Race                    73 non-null object
Sex                     73 non-null object
Nation                  73 non-null object
Address                 73 non-null object
Tel                     73 non-null object
Mth(s)                  73 non-null object
Anklet                  73 non-null int64
Bangle                  73 non-null int64
Bracelet                73 non-null int64
Chain                   73 non-null int64
Earring                 73 non-null int64
Earstud                 73 non-null int64
Necklace                73 non-null int64
Pendant                 73 non-null int64
Ring                    73 non-null int64
O                       73 non-null int64
24K                     73 non-null float64
22K                     73 non-null float64
20K                     73 non-null float64
18

## Model 1: Including item types

In [142]:
# Drop unnecessary features
# Columns listed here are excluded from the Model 1 feature set
features_to_drop = ['Mth(s)', 'Defaulted', 'Value ($)', 'Interest Payable ($)']
model_1_df = all_data.drop(labels=features_to_drop, axis='columns')

In [143]:
# Get dummies and save dependent variable
# Read the target from the untouched source frame (it has the same rows as
# model_1_df), matching the Model 2 and Model 3 cells. Reading it from
# model_1_df would fail when this cell is re-run, because 'Status' is removed
# from model_1_df just below.
status_list = all_data.Status.tolist()
# errors='ignore' makes the drop a no-op on re-runs instead of raising KeyError
model_1_df = model_1_df.drop(['Status'], axis=1, errors='ignore')
# One-hot encode the remaining categorical (object) columns
model_1_dummied_df = pd.get_dummies(model_1_df)
# Append the target as the last column; later cells split it off by position
model_1_dummied_df['Status'] = status_list
model_1_dummied_df.head()

Unnamed: 0,Age,Anklet,Bangle,Bracelet,Chain,Earring,Earstud,Necklace,Pendant,Ring,O,24K,22K,20K,18K,14K,9K,W,NRIC_F,NRIC_S,Race_C,Race_I,Race_M,Race_O,Sex_F,Sex_M,Nation_F,Nation_I,Nation_M,Nation_P,Nation_S,Nation_T,Address_C,Address_H,Address_N,Tel_H,Tel_M,Status
0,51,0,0,0,0,0,0,0,1,0,0,0.0,20.0,0.0,0.0,0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
1,25,0,0,0,0,0,2,0,0,0,0,0.0,2.5,0.0,0.0,0,0.0,0.0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,L
2,51,0,0,0,0,0,0,1,0,0,0,0.0,49.3,0.0,0.0,0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
3,51,0,0,0,1,0,0,0,0,0,0,0.0,37.9,0.0,0.0,0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
4,51,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0,0.0,129.6,0,1,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,C


In [144]:
# Preview the Model 1 features before one-hot encoding ('Status' has been removed by this point)
model_1_df.head()

Unnamed: 0,NRIC,Age,Race,Sex,Nation,Address,Tel,Anklet,Bangle,Bracelet,Chain,Earring,Earstud,Necklace,Pendant,Ring,O,24K,22K,20K,18K,14K,9K,W
0,S,51,C,F,S,H,M,0,0,0,0,0,0,0,1,0,0,0.0,20.0,0.0,0.0,0,0.0,0.0
1,S,25,I,M,S,H,M,0,0,0,0,0,2,0,0,0,0,0.0,2.5,0.0,0.0,0,0.0,0.0
2,S,51,C,F,S,H,M,0,0,0,0,0,0,1,0,0,0,0.0,49.3,0.0,0.0,0,0.0,0.0
3,S,51,C,F,S,H,M,0,0,0,1,0,0,0,0,0,0,0.0,37.9,0.0,0.0,0,0.0,0.0
4,S,51,C,M,S,C,M,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0,0.0,129.6


In [145]:
# Separate the encoded frame into the target vector and the feature matrix.
# 'Status' was appended as the final column, so selecting it by name is
# equivalent to slicing it off by position.
y = model_1_dummied_df['Status'].values
x = model_1_dummied_df.drop('Status', axis=1).values

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

In [146]:
# Build a 300-tree random forest using the entropy split criterion;
# the fixed seed makes training repeatable
classifier = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    random_state=0,
)
# Train on the training split (the fitted estimator is rendered as the cell output)
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [147]:
# Predict the held-out rows and print predictions side by side with the true labels
y_pred = classifier.predict(x_test)
print(pd.DataFrame(dict(predicted=y_pred, actual=y_test)))

   predicted actual
0          C      D
1          D      D
2          C      L
3          D      D
4          D      C
5          D      D
6          C      D
7          D      D
8          D      D
9          C      L
10         D      D
11         C      D
12         D      D
13         C      D
14         D      D
15         L      D
16         C      C
17         L      D
18         D      D


In [148]:
# Print accuracy score
cm = confusion_matrix(y_test, y_pred)
# Status has three classes (C, D, L), so the confusion matrix is 3x3. Adding only
# cm[0][0] + cm[1][1] would leave out correct predictions of the third class;
# the trace sums the whole diagonal, i.e. every correctly classified sample.
print('Accuracy =', float(np.trace(cm)) / np.sum(cm) * 100, '%')

Accuracy = 52.63157894736842 %


## Model 2: Excluding item types (except watch)

In [158]:
# Drop unnecessary features
# The four general exclusions plus the item-type columns; 'W' is not listed,
# so it stays in the Model 2 feature set
features_to_drop = [
    'Mth(s)', 'Defaulted', 'Value ($)', 'Interest Payable ($)',
] + [
    'Anklet', 'Bangle', 'Bracelet', 'Chain', 'Earring',
    'Earstud', 'Necklace', 'Pendant', 'Ring', 'O',
]
model_2_df = all_data.drop(labels=features_to_drop, axis='columns')

In [159]:
# Get dummies and save dependent variable
# The target is read from the untouched source frame, which has the same rows as model_2_df
status_list = all_data.Status.tolist()
# errors='ignore' keeps this cell re-runnable: on a second run 'Status' is already
# gone from model_2_df and a plain drop would raise KeyError
model_2_df = model_2_df.drop(['Status'], axis=1, errors='ignore')
# One-hot encode the remaining categorical (object) columns
model_2_dummied_df = pd.get_dummies(model_2_df)
# Append the target as the last column; later cells split it off by position
model_2_dummied_df['Status'] = status_list
model_2_dummied_df.head()

Unnamed: 0,Age,24K,22K,20K,18K,14K,9K,W,NRIC_F,NRIC_S,Race_C,Race_I,Race_M,Race_O,Sex_F,Sex_M,Nation_F,Nation_I,Nation_M,Nation_P,Nation_S,Nation_T,Address_C,Address_H,Address_N,Tel_H,Tel_M,Status
0,51,0.0,20.0,0.0,0.0,0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
1,25,0.0,2.5,0.0,0.0,0,0.0,0.0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,L
2,51,0.0,49.3,0.0,0.0,0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
3,51,0.0,37.9,0.0,0.0,0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
4,51,0.0,0.0,0.0,0.0,0,0.0,129.6,0,1,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,C


In [160]:
# Separate the encoded frame into the target vector and the feature matrix.
# 'Status' was appended as the final column, so selecting it by name is
# equivalent to slicing it off by position.
y = model_2_dummied_df['Status'].values
x = model_2_dummied_df.drop('Status', axis=1).values

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

In [161]:
# Build a 300-tree random forest using the entropy split criterion;
# the fixed seed makes training repeatable
classifier = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    random_state=0,
)
# Train on the training split (the fitted estimator is rendered as the cell output)
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [162]:
# Predict the held-out rows and print predictions side by side with the true labels
y_pred = classifier.predict(x_test)
print(pd.DataFrame(dict(predicted=y_pred, actual=y_test)))

   predicted actual
0          C      D
1          C      D
2          C      L
3          D      D
4          D      C
5          D      D
6          C      D
7          D      D
8          L      D
9          C      L
10         D      D
11         L      D
12         D      D
13         C      D
14         D      D
15         C      D
16         C      C
17         L      D
18         D      D


In [163]:
# Print accuracy score
cm = confusion_matrix(y_test, y_pred)
# Status has three classes (C, D, L), so the confusion matrix is 3x3. Adding only
# cm[0][0] + cm[1][1] would leave out correct predictions of the third class;
# the trace sums the whole diagonal, i.e. every correctly classified sample.
print('Accuracy =', float(np.trace(cm)) / np.sum(cm) * 100, '%')

Accuracy = 42.10526315789473 %


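**Observations**: Model 2 performs worse than Model 1 (42.1% vs. 52.6% accuracy), suggesting that the type of item being pawned carries predictive signal. Note that the test set holds only 19 samples, so this difference is indicative rather than conclusive.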

## Model 3: Excluding all item types

In [165]:
# Drop unnecessary features
# Each label is listed exactly once (the first four entries were previously
# duplicated by copy-paste). Compared with Model 2, 'W' is dropped as well.
features_to_drop = [
    'Mth(s)',
    'Defaulted',
    'Value ($)',
    'Interest Payable ($)',
    'Anklet',
    'Bangle',
    'Bracelet',
    'Chain',
    'Earring',
    'Earstud',
    'Necklace',
    'Pendant',
    'Ring',
    'O',
    'W'
]
model_3_df = all_data.drop(features_to_drop, axis=1)

In [166]:
# Get dummies and save dependent variable
# The target is read from the untouched source frame, which has the same rows as model_3_df
status_list = all_data.Status.tolist()
# errors='ignore' keeps this cell re-runnable: on a second run 'Status' is already
# gone from model_3_df and a plain drop would raise KeyError
model_3_df = model_3_df.drop(['Status'], axis=1, errors='ignore')
# One-hot encode the remaining categorical (object) columns
model_3_dummied_df = pd.get_dummies(model_3_df)
# Append the target as the last column; later cells split it off by position
model_3_dummied_df['Status'] = status_list
model_3_dummied_df.head()

Unnamed: 0,Age,24K,22K,20K,18K,14K,9K,NRIC_F,NRIC_S,Race_C,Race_I,Race_M,Race_O,Sex_F,Sex_M,Nation_F,Nation_I,Nation_M,Nation_P,Nation_S,Nation_T,Address_C,Address_H,Address_N,Tel_H,Tel_M,Status
0,51,0.0,20.0,0.0,0.0,0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
1,25,0.0,2.5,0.0,0.0,0,0.0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,L
2,51,0.0,49.3,0.0,0.0,0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
3,51,0.0,37.9,0.0,0.0,0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
4,51,0.0,0.0,0.0,0.0,0,0.0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,C


In [167]:
# Separate the encoded frame into the target vector and the feature matrix.
# 'Status' was appended as the final column, so selecting it by name is
# equivalent to slicing it off by position.
y = model_3_dummied_df['Status'].values
x = model_3_dummied_df.drop('Status', axis=1).values

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

In [168]:
# Build a 300-tree random forest using the entropy split criterion;
# the fixed seed makes training repeatable
classifier = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    random_state=0,
)
# Train on the training split (the fitted estimator is rendered as the cell output)
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [169]:
# Predict the held-out rows and print predictions side by side with the true labels
y_pred = classifier.predict(x_test)
print(pd.DataFrame(dict(predicted=y_pred, actual=y_test)))

   predicted actual
0          C      D
1          C      D
2          C      L
3          D      D
4          D      C
5          D      D
6          C      D
7          D      D
8          L      D
9          C      L
10         D      D
11         L      D
12         D      D
13         C      D
14         D      D
15         C      D
16         C      C
17         L      D
18         D      D


In [170]:
# Print accuracy score
cm = confusion_matrix(y_test, y_pred)
# Status has three classes (C, D, L), so the confusion matrix is 3x3. Adding only
# cm[0][0] + cm[1][1] would leave out correct predictions of the third class;
# the trace sums the whole diagonal, i.e. every correctly classified sample.
print('Accuracy =', float(np.trace(cm)) / np.sum(cm) * 100, '%')

Accuracy = 42.10526315789473 %


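**Observations**: As expected, Model 3 performs as badly as Model 2 (42.1% accuracy). Its test-set predictions are identical to Model 2's, so additionally dropping the watch column `W` made no difference on this split.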