# Customer defaulting likelihood

In [225]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20. Fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the
# standalone joblib package and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

## Data preprocessing

In [201]:
# Raise the pandas display limit so frames show up to 500 columns
pd.options.display.max_columns = 500

In [202]:
# Open the source workbook; sheets are read from this handle afterwards
workbook_path = 'models/FE data - 0716S.xlsx'
data_0716 = pd.ExcelFile(workbook_path)

In [203]:
# Load the '0716SDL' sheet into a frame and preview its first rows
sheet_label = '0716SDL'
all_data = pd.read_excel(data_0716, sheet_label)
all_data.head()

Unnamed: 0,NRIC,Age,Race,Sex,Nation,Address,Tel,Mth(s),Anklet,Bangle,Bracelet,Chain,Earring,Earstud,Necklace,Pendant,Ring,O,24K,22K,20K,18K,14K,9K,W,Value ($),Interest Payable ($),Status,Defaulted
0,S,51,C,F,S,H,M,1,0,0,0,0,0,0,0,1,0,0,0.0,20.0,0.0,0.0,0,0.0,0.0,1100,11.0,C,False
1,S,25,I,M,S,H,M,7,0,0,0,0,0,2,0,0,0,0,0.0,2.5,0.0,0.0,0,0.0,0.0,110,11.55,L,False
2,S,51,C,F,S,H,M,6,0,0,0,0,0,0,1,0,0,0,0.0,49.3,0.0,0.0,0,0.0,0.0,2170,195.3,C,False
3,S,51,C,F,S,H,M,6,0,0,0,1,0,0,0,0,0,0,0.0,37.9,0.0,0.0,0,0.0,0.0,1550,139.5,C,False
4,S,51,C,M,S,C,M,1,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0,0.0,129.6,10000,100.0,C,False


In [204]:
# Summarise each column's dtype and non-null count
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 29 columns):
NRIC                    73 non-null object
Age                     73 non-null int64
Race                    73 non-null object
Sex                     73 non-null object
Nation                  73 non-null object
Address                 73 non-null object
Tel                     73 non-null object
Mth(s)                  73 non-null object
Anklet                  73 non-null int64
Bangle                  73 non-null int64
Bracelet                73 non-null int64
Chain                   73 non-null int64
Earring                 73 non-null int64
Earstud                 73 non-null int64
Necklace                73 non-null int64
Pendant                 73 non-null int64
Ring                    73 non-null int64
O                       73 non-null int64
24K                     73 non-null float64
22K                     73 non-null float64
20K                     73 non-null float64
18

## Model 1: Including item types

In [205]:
# Drop unnecessary features
# Model 1 keeps the item-type columns and removes only these four.
features_to_drop = ['Mth(s)', 'Defaulted', 'Value ($)', 'Interest Payable ($)']
model_1_df = all_data.drop(labels=features_to_drop, axis=1)

In [206]:
# Get dummies and save dependent variable
# Status is read from all_data (model_1_df has the same rows; only columns
# were dropped) and removed with errors='ignore', so re-running this cell
# after model_1_df has already lost its Status column does not raise.
status_list = all_data.Status.tolist()
model_1_df = model_1_df.drop(['Status'], axis=1, errors='ignore')
model_1_dummied_df = pd.get_dummies(model_1_df)
# Re-attach the target as the last column; the split cell slices it off by position
model_1_dummied_df['Status'] = status_list
model_1_dummied_df.head()

Unnamed: 0,Age,Anklet,Bangle,Bracelet,Chain,Earring,Earstud,Necklace,Pendant,Ring,O,24K,22K,20K,18K,14K,9K,W,NRIC_F,NRIC_S,Race_C,Race_I,Race_M,Race_O,Sex_F,Sex_M,Nation_F,Nation_I,Nation_M,Nation_P,Nation_S,Nation_T,Address_C,Address_H,Address_N,Tel_H,Tel_M,Status
0,51,0,0,0,0,0,0,0,1,0,0,0.0,20.0,0.0,0.0,0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
1,25,0,0,0,0,0,2,0,0,0,0,0.0,2.5,0.0,0.0,0,0.0,0.0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,L
2,51,0,0,0,0,0,0,1,0,0,0,0.0,49.3,0.0,0.0,0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
3,51,0,0,0,1,0,0,0,0,0,0,0.0,37.9,0.0,0.0,0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
4,51,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0,0.0,129.6,0,1,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,C


In [208]:
# Separate the feature columns (all but the last) from the target (last column)
feature_frame = model_1_dummied_df.iloc[:, :-1]
target_series = model_1_dummied_df.iloc[:, -1]
x, y = feature_frame.values, target_series.values

# Hold out 25% of the rows for testing, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.25
)

In [209]:
# Build a 300-tree, entropy-criterion random forest with a fixed seed and fit it
forest_params = dict(n_estimators=300, criterion='entropy', random_state=0)
classifier = RandomForestClassifier(**forest_params)
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [210]:
# Score the test set: per-class probabilities and the predicted label
y_pred_probabilties = classifier.predict_proba(x_test)
y_pred_class = classifier.predict(x_test)

# Show probability columns 0, 1, 2 (labelled C, D, L) next to predicted and actual labels
result_columns = {
    'C%_pred': y_pred_probabilties[:, 0],
    'D%_pred': y_pred_probabilties[:, 1],
    'L%_pred': y_pred_probabilties[:, 2],
    'most_likely_status': y_pred_class,
    'actual': y_test,
}
print(pd.DataFrame(result_columns))

     C%_pred   D%_pred   L%_pred most_likely_status actual
0   0.726667  0.116667  0.156667                  C      D
1   0.250000  0.520000  0.230000                  D      D
2   0.660000  0.126667  0.213333                  C      L
3   0.016667  0.923333  0.060000                  D      D
4   0.113333  0.736667  0.150000                  D      C
5   0.260000  0.620000  0.120000                  D      D
6   0.670000  0.270000  0.060000                  C      D
7   0.190000  0.790000  0.020000                  D      D
8   0.113333  0.573333  0.313333                  D      D
9   0.510000  0.396667  0.093333                  C      L
10  0.033333  0.873333  0.093333                  D      D
11  0.346667  0.336667  0.316667                  C      D
12  0.103333  0.856667  0.040000                  D      D
13  0.630000  0.240000  0.130000                  C      D
14  0.013333  0.953333  0.033333                  D      D
15  0.363333  0.156667  0.480000                  L     

In [211]:
# Print accuracy score of absolute prediction
# Accuracy is the share of samples on the confusion-matrix diagonal. Status has
# three classes (C, D, L), so the whole diagonal is summed with np.trace;
# adding only cm[0][0] + cm[1][1] would leave out correctly predicted 'L' rows.
cm = confusion_matrix(y_test, y_pred_class)
print('Accuracy =', float(np.trace(cm))/np.sum(cm) * 100, '%')

Accuracy = 52.63157894736842 %


In [226]:
# Persist the fitted model 1 classifier to disk
model_1_path = 'models/model_1.pkl'
joblib.dump(classifier, model_1_path)

['models/model_1.pkl']

## Model 2: Excluding item types (except watch)

In [212]:
# Drop unnecessary features
# Same four columns removed for model 1 ...
base_columns = ['Mth(s)', 'Defaulted', 'Value ($)', 'Interest Payable ($)']
# ... plus the item-type columns ('W' is kept for this model)
item_type_columns = [
    'Anklet', 'Bangle', 'Bracelet', 'Chain', 'Earring',
    'Earstud', 'Necklace', 'Pendant', 'Ring', 'O',
]
features_to_drop = base_columns + item_type_columns
model_2_df = all_data.drop(labels=features_to_drop, axis=1)

In [213]:
# Get dummies and save dependent variable
status_list = all_data.Status.tolist()
# errors='ignore' keeps the cell re-runnable: on a second run model_2_df no
# longer has a Status column, and a plain drop would raise KeyError.
model_2_df = model_2_df.drop(['Status'], axis=1, errors='ignore')
model_2_dummied_df = pd.get_dummies(model_2_df)
# Re-attach the target as the last column; the split cell slices it off by position
model_2_dummied_df['Status'] = status_list
model_2_dummied_df.head()

Unnamed: 0,Age,24K,22K,20K,18K,14K,9K,W,NRIC_F,NRIC_S,Race_C,Race_I,Race_M,Race_O,Sex_F,Sex_M,Nation_F,Nation_I,Nation_M,Nation_P,Nation_S,Nation_T,Address_C,Address_H,Address_N,Tel_H,Tel_M,Status
0,51,0.0,20.0,0.0,0.0,0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
1,25,0.0,2.5,0.0,0.0,0,0.0,0.0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,L
2,51,0.0,49.3,0.0,0.0,0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
3,51,0.0,37.9,0.0,0.0,0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
4,51,0.0,0.0,0.0,0.0,0,0.0,129.6,0,1,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,C


In [214]:
# Separate the feature columns (all but the last) from the target (last column)
feature_frame = model_2_dummied_df.iloc[:, :-1]
target_series = model_2_dummied_df.iloc[:, -1]
x, y = feature_frame.values, target_series.values

# Hold out 25% of the rows for testing, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.25
)

In [215]:
# Build a 300-tree, entropy-criterion random forest with a fixed seed and fit it
forest_params = dict(n_estimators=300, criterion='entropy', random_state=0)
classifier = RandomForestClassifier(**forest_params)
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [216]:
# Score the test set: per-class probabilities and the predicted label
y_pred_probabilties = classifier.predict_proba(x_test)
y_pred_class = classifier.predict(x_test)

# Show probability columns 0, 1, 2 (labelled C, D, L) next to predicted and actual labels
result_columns = {
    'C%_pred': y_pred_probabilties[:, 0],
    'D%_pred': y_pred_probabilties[:, 1],
    'L%_pred': y_pred_probabilties[:, 2],
    'most_likely_status': y_pred_class,
    'actual': y_test,
}
print(pd.DataFrame(result_columns))

     C%_pred   D%_pred   L%_pred most_likely_status actual
0   0.690000  0.203333  0.106667                  C      D
1   0.520000  0.350000  0.130000                  C      D
2   0.686667  0.093333  0.220000                  C      L
3   0.003333  0.950000  0.046667                  D      D
4   0.080000  0.626667  0.293333                  D      C
5   0.163333  0.600000  0.236667                  D      D
6   0.573333  0.390000  0.036667                  C      D
7   0.226667  0.746667  0.026667                  D      D
8   0.060000  0.456667  0.483333                  L      D
9   0.686667  0.200000  0.113333                  C      L
10  0.046667  0.890000  0.063333                  D      D
11  0.183333  0.273333  0.543333                  L      D
12  0.030000  0.963333  0.006667                  D      D
13  0.766667  0.096667  0.136667                  C      D
14  0.006667  0.966667  0.026667                  D      D
15  0.653333  0.086667  0.260000                  C     

In [217]:
# Print accuracy score of absolute prediction
# Accuracy is the share of samples on the confusion-matrix diagonal. Status has
# three classes (C, D, L), so the whole diagonal is summed with np.trace;
# adding only cm[0][0] + cm[1][1] would leave out correctly predicted 'L' rows.
cm = confusion_matrix(y_test, y_pred_class)
print('Accuracy =', float(np.trace(cm))/np.sum(cm) * 100, '%')

Accuracy = 42.10526315789473 %


**Observations**: Model 2 performs worse than Model 1, suggesting that the type of item being pawned is an important predictor.

In [227]:
# Persist the fitted model 2 classifier to disk
model_2_path = 'models/model_2.pkl'
joblib.dump(classifier, model_2_path)

['models/model_2.pkl']

## Model 3: Excluding all item types

In [218]:
# Drop unnecessary features
# Each label appears exactly once: the four columns removed for every model,
# followed by all item-type columns including 'W'.
features_to_drop = [
    'Mth(s)',
    'Defaulted',
    'Value ($)',
    'Interest Payable ($)',
    'Anklet',
    'Bangle',
    'Bracelet',
    'Chain',
    'Earring',
    'Earstud',
    'Necklace',
    'Pendant',
    'Ring',
    'O',
    'W'
]
model_3_df = all_data.drop(features_to_drop, axis=1)

In [219]:
# Get dummies and save dependent variable
status_list = all_data.Status.tolist()
# errors='ignore' keeps the cell re-runnable: on a second run model_3_df no
# longer has a Status column, and a plain drop would raise KeyError.
model_3_df = model_3_df.drop(['Status'], axis=1, errors='ignore')
model_3_dummied_df = pd.get_dummies(model_3_df)
# Re-attach the target as the last column; the split cell slices it off by position
model_3_dummied_df['Status'] = status_list
model_3_dummied_df.head()

Unnamed: 0,Age,24K,22K,20K,18K,14K,9K,NRIC_F,NRIC_S,Race_C,Race_I,Race_M,Race_O,Sex_F,Sex_M,Nation_F,Nation_I,Nation_M,Nation_P,Nation_S,Nation_T,Address_C,Address_H,Address_N,Tel_H,Tel_M,Status
0,51,0.0,20.0,0.0,0.0,0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
1,25,0.0,2.5,0.0,0.0,0,0.0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,L
2,51,0.0,49.3,0.0,0.0,0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
3,51,0.0,37.9,0.0,0.0,0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,C
4,51,0.0,0.0,0.0,0.0,0,0.0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,C


In [220]:
# Separate the feature columns (all but the last) from the target (last column)
feature_frame = model_3_dummied_df.iloc[:, :-1]
target_series = model_3_dummied_df.iloc[:, -1]
x, y = feature_frame.values, target_series.values

# Hold out 25% of the rows for testing, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.25
)

In [221]:
# Build a 300-tree, entropy-criterion random forest with a fixed seed and fit it
forest_params = dict(n_estimators=300, criterion='entropy', random_state=0)
classifier = RandomForestClassifier(**forest_params)
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [222]:
# Score the test set: per-class probabilities and the predicted label
y_pred_probabilties = classifier.predict_proba(x_test)
y_pred_class = classifier.predict(x_test)

# Show probability columns 0, 1, 2 (labelled C, D, L) next to predicted and actual labels
result_columns = {
    'C%_pred': y_pred_probabilties[:, 0],
    'D%_pred': y_pred_probabilties[:, 1],
    'L%_pred': y_pred_probabilties[:, 2],
    'most_likely_status': y_pred_class,
    'actual': y_test,
}
print(pd.DataFrame(result_columns))

     C%_pred   D%_pred   L%_pred most_likely_status actual
0   0.733333  0.146667  0.120000                  C      D
1   0.520000  0.343333  0.136667                  C      D
2   0.726667  0.060000  0.213333                  C      L
3   0.003333  0.953333  0.043333                  D      D
4   0.070000  0.646667  0.283333                  D      C
5   0.270000  0.573333  0.156667                  D      D
6   0.726667  0.256667  0.016667                  C      D
7   0.256667  0.723333  0.020000                  D      D
8   0.070000  0.423333  0.506667                  L      D
9   0.813333  0.093333  0.093333                  C      L
10  0.040000  0.913333  0.046667                  D      D
11  0.213333  0.223333  0.563333                  L      D
12  0.026667  0.960000  0.013333                  D      D
13  0.883333  0.010000  0.106667                  C      D
14  0.010000  0.990000  0.000000                  D      D
15  0.690000  0.060000  0.250000                  C     

In [223]:
# Print accuracy score of absolute prediction
# Accuracy is the share of samples on the confusion-matrix diagonal. Status has
# three classes (C, D, L), so the whole diagonal is summed with np.trace;
# adding only cm[0][0] + cm[1][1] would leave out correctly predicted 'L' rows.
cm = confusion_matrix(y_test, y_pred_class)
print('Accuracy =', float(np.trace(cm))/np.sum(cm) * 100, '%')

Accuracy = 42.10526315789473 %


**Observations**: As expected, model 3 performs as badly as model 2.

In [228]:
# Persist the fitted model 3 classifier to disk
model_3_path = 'models/model_3.pkl'
joblib.dump(classifier, model_3_path)

['models/model_3.pkl']

## Model 4: Including item information, value and interest

In [229]:
# Drop unnecessary features
# Model 4 keeps the value and interest columns; only these two are removed.
features_to_drop = ['Mth(s)', 'Defaulted']
model_4_df = all_data.drop(labels=features_to_drop, axis=1)

In [230]:
# Get dummies and save dependent variable
# Status is read from all_data (model_4_df has the same rows; only columns
# were dropped) and removed with errors='ignore', so re-running this cell
# after model_4_df has already lost its Status column does not raise.
status_list = all_data.Status.tolist()
model_4_df = model_4_df.drop(['Status'], axis=1, errors='ignore')
# NOTE(review): the resulting frame contains columns such as
# 'Interest Payable ($)_11.55' and 'Interest Payable ($)_-', i.e. get_dummies
# one-hot encodes 'Interest Payable ($)' as a categorical -- presumably a '-'
# placeholder makes the column non-numeric. Confirm whether it should be
# converted to a number before encoding.
model_4_dummied_df = pd.get_dummies(model_4_df)
# Re-attach the target as the last column; the split cell slices it off by position
model_4_dummied_df['Status'] = status_list
model_4_dummied_df.head()

Unnamed: 0,Age,Anklet,Bangle,Bracelet,Chain,Earring,Earstud,Necklace,Pendant,Ring,O,24K,22K,20K,18K,14K,9K,W,Value ($),NRIC_F,NRIC_S,Race_C,Race_I,Race_M,Race_O,Sex_F,Sex_M,Nation_F,Nation_I,Nation_M,Nation_P,Nation_S,Nation_T,Address_C,Address_H,Address_N,Tel_H,Tel_M,Interest Payable ($)_2.5,Interest Payable ($)_4.2,Interest Payable ($)_7.8,Interest Payable ($)_11,Interest Payable ($)_11.55,Interest Payable ($)_16.8,Interest Payable ($)_21,Interest Payable ($)_21.6,Interest Payable ($)_23.5,Interest Payable ($)_25.2,Interest Payable ($)_27,Interest Payable ($)_27.3,Interest Payable ($)_33.3,Interest Payable ($)_47.25,Interest Payable ($)_54,Interest Payable ($)_57.75,Interest Payable ($)_60,Interest Payable ($)_84,Interest Payable ($)_100,Interest Payable ($)_115.5,Interest Payable ($)_139.5,Interest Payable ($)_157.5,Interest Payable ($)_168,Interest Payable ($)_195.3,Interest Payable ($)_210,Interest Payable ($)_-,Status
0,51,0,0,0,0,0,0,0,1,0,0,0.0,20.0,0.0,0.0,0,0.0,0.0,1100,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,C
1,25,0,0,0,0,0,2,0,0,0,0,0.0,2.5,0.0,0.0,0,0.0,0.0,110,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,L
2,51,0,0,0,0,0,0,1,0,0,0,0.0,49.3,0.0,0.0,0,0.0,0.0,2170,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,C
3,51,0,0,0,1,0,0,0,0,0,0,0.0,37.9,0.0,0.0,0,0.0,0.0,1550,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,C
4,51,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0,0.0,129.6,10000,0,1,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,C


In [231]:
# Separate the feature columns (all but the last) from the target (last column)
feature_frame = model_4_dummied_df.iloc[:, :-1]
target_series = model_4_dummied_df.iloc[:, -1]
x, y = feature_frame.values, target_series.values

# Hold out 25% of the rows for testing, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.25
)

In [232]:
# Build a 300-tree, entropy-criterion random forest with a fixed seed and fit it
forest_params = dict(n_estimators=300, criterion='entropy', random_state=0)
classifier = RandomForestClassifier(**forest_params)
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [233]:
# Score the test set: per-class probabilities and the predicted label
y_pred_probabilties = classifier.predict_proba(x_test)
y_pred_class = classifier.predict(x_test)

# Show probability columns 0, 1, 2 (labelled C, D, L) next to predicted and actual labels
result_columns = {
    'C%_pred': y_pred_probabilties[:, 0],
    'D%_pred': y_pred_probabilties[:, 1],
    'L%_pred': y_pred_probabilties[:, 2],
    'most_likely_status': y_pred_class,
    'actual': y_test,
}
print(pd.DataFrame(result_columns))

     C%_pred   D%_pred   L%_pred most_likely_status actual
0   0.263333  0.696667  0.040000                  D      D
1   0.120000  0.810000  0.070000                  D      D
2   0.626667  0.130000  0.243333                  C      L
3   0.000000  0.990000  0.010000                  D      D
4   0.376667  0.316667  0.306667                  C      C
5   0.060000  0.840000  0.100000                  D      D
6   0.096667  0.860000  0.043333                  D      D
7   0.060000  0.916667  0.023333                  D      D
8   0.060000  0.793333  0.146667                  D      D
9   0.550000  0.220000  0.230000                  C      L
10  0.043333  0.936667  0.020000                  D      D
11  0.123333  0.750000  0.126667                  D      D
12  0.036667  0.956667  0.006667                  D      D
13  0.136667  0.766667  0.096667                  D      D
14  0.020000  0.963333  0.016667                  D      D
15  0.130000  0.743333  0.126667                  D     

In [234]:
# Print accuracy score of absolute prediction
# Accuracy is the share of samples on the confusion-matrix diagonal. Status has
# three classes (C, D, L), so the whole diagonal is summed with np.trace;
# adding only cm[0][0] + cm[1][1] would leave out correctly predicted 'L' rows.
cm = confusion_matrix(y_test, y_pred_class)
print('Accuracy =', float(np.trace(cm))/np.sum(cm) * 100, '%')

Accuracy = 89.47368421052632 %


In [235]:
# Persist the fitted model 4 classifier to disk
model_4_path = 'models/model_4.pkl'
joblib.dump(classifier, model_4_path)

['models/model_4.pkl']