In [26]:
# Import important library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

### Q1 Read the input file and check the data dimension

In [2]:
#Read input file and understand the data
# "default" is my dependent variable

In [27]:
# Read source data, use the delimiter parameter as the file is separated by ;
data = pd.read_csv('german_credit.csv', delimiter=',')
data.head()

Unnamed: 0,default,account_check_status,duration_in_month,credit_history,purpose,credit_amount,savings,present_emp_since,installment_as_income_perc,personal_status_sex,...,present_res_since,property,age,other_installment_plans,housing,credits_this_bank,job,people_under_maintenance,telephone,foreign_worker
0,0,< 0 DM,6,critical account/ other credits existing (not ...,domestic appliances,1169,unknown/ no savings account,.. >= 7 years,4,male : single,...,4,real estate,67,none,own,2,skilled employee / official,1,"yes, registered under the customers name",yes
1,1,0 <= ... < 200 DM,48,existing credits paid back duly till now,domestic appliances,5951,... < 100 DM,1 <= ... < 4 years,2,female : divorced/separated/married,...,2,real estate,22,none,own,1,skilled employee / official,1,none,yes
2,0,no checking account,12,critical account/ other credits existing (not ...,(vacation - does not exist?),2096,... < 100 DM,4 <= ... < 7 years,2,male : single,...,3,real estate,49,none,own,1,unskilled - resident,2,none,yes
3,0,< 0 DM,42,existing credits paid back duly till now,radio/television,7882,... < 100 DM,4 <= ... < 7 years,2,male : single,...,4,if not A121 : building society savings agreeme...,45,none,for free,1,skilled employee / official,2,none,yes
4,1,< 0 DM,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male : single,...,4,unknown / no property,53,none,for free,2,skilled employee / official,2,none,yes


In [28]:
# Lets build a Ensemble model but need to modify the dataset first
# Print attribute names which are not numerical
print(data.select_dtypes(exclude =['float64','int64']).columns)

Index(['account_check_status', 'credit_history', 'purpose', 'savings',
       'present_emp_since', 'personal_status_sex', 'other_debtors', 'property',
       'other_installment_plans', 'housing', 'job', 'telephone',
       'foreign_worker'],
      dtype='object')


In [29]:
# Check for data types of all columns
print(data.dtypes)

default                        int64
account_check_status          object
duration_in_month              int64
credit_history                object
purpose                       object
credit_amount                  int64
savings                       object
present_emp_since             object
installment_as_income_perc     int64
personal_status_sex           object
other_debtors                 object
present_res_since              int64
property                      object
age                            int64
other_installment_plans       object
housing                       object
credits_this_bank              int64
job                           object
people_under_maintenance       int64
telephone                     object
foreign_worker                object
dtype: object


In [30]:
## Check missing values
data.isnull().values.any()

False

### Q2.Prepare the model data by converting non-numeric to dummy ( 1 Marks)
##### Hint: Use get_dummies

In [31]:
cols_to_transform = ['account_check_status', 'credit_history', 'purpose', 'savings', 'present_emp_since', 'installment_as_income_perc','personal_status_sex','other_debtors','present_res_since','property','other_installment_plans','housing','credits_this_bank','job','people_under_maintenance','telephone','foreign_worker']
df_with_dummies = pd.get_dummies( data, columns = cols_to_transform )
df_with_dummies.head()

Unnamed: 0,default,duration_in_month,credit_amount,age,account_check_status_0 <= ... < 200 DM,account_check_status_< 0 DM,account_check_status_>= 200 DM / salary assignments for at least 1 year,account_check_status_no checking account,credit_history_all credits at this bank paid back duly,credit_history_critical account/ other credits existing (not at this bank),...,job_management/ self-employed/ highly qualified employee/ officer,job_skilled employee / official,job_unemployed/ unskilled - non-resident,job_unskilled - resident,people_under_maintenance_1,people_under_maintenance_2,telephone_none,"telephone_yes, registered under the customers name",foreign_worker_no,foreign_worker_yes
0,0,6,1169,67,0,1,0,0,0,1,...,0,1,0,0,1,0,0,1,0,1
1,1,48,5951,22,1,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,1
2,0,12,2096,49,0,0,0,1,0,1,...,0,0,0,1,0,1,1,0,0,1
3,0,42,7882,45,0,1,0,0,0,0,...,0,1,0,0,0,1,1,0,0,1
4,1,24,4870,53,0,1,0,0,0,0,...,0,1,0,0,0,1,1,0,0,1


In [32]:
# Print Shape of model data
df_with_dummies.shape


(1000, 72)

In [33]:
# Check for data types of all columns
print(df_with_dummies.dtypes)

default                                                                       int64
duration_in_month                                                             int64
credit_amount                                                                 int64
age                                                                           int64
account_check_status_0 <= ... < 200 DM                                        uint8
account_check_status_< 0 DM                                                   uint8
account_check_status_>= 200 DM / salary assignments for at least 1 year       uint8
account_check_status_no checking account                                      uint8
credit_history_all credits at this bank paid back duly                        uint8
credit_history_critical account/ other credits existing (not at this bank)    uint8
credit_history_delay in paying off in the past                                uint8
credit_history_existing credits paid back duly till now                     

### Check for highly correlated variables but don't required any treatment for this use case

In [34]:

# Get the correlation 
corr = data.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,default,duration_in_month,credit_amount,installment_as_income_perc,present_res_since,age,credits_this_bank,people_under_maintenance
default,1.0,0.214927,0.154739,0.0724039,0.00296716,-0.0911274,-0.0457325,-0.00301485
duration_in_month,0.214927,1.0,0.624984,0.0747488,0.0340672,-0.0361364,-0.0112836,-0.0238345
credit_amount,0.154739,0.624984,1.0,-0.271316,0.0289263,0.0327164,0.0207946,0.0171422
installment_as_income_perc,0.0724039,0.0747488,-0.271316,1.0,0.0493024,0.0582657,0.0216687,-0.0712069
present_res_since,0.00296716,0.0340672,0.0289263,0.0493024,1.0,0.266419,0.0896252,0.0426434
age,-0.0911274,-0.0361364,0.0327164,0.0582657,0.266419,1.0,0.149254,0.118201
credits_this_bank,-0.0457325,-0.0112836,0.0207946,0.0216687,0.0896252,0.149254,1.0,0.109667
people_under_maintenance,-0.00301485,-0.0238345,0.0171422,-0.0712069,0.0426434,0.118201,0.109667,1.0


### Drop the original variables which are converted to dummy

In [35]:
model_data=data.drop(cols_to_transform, axis = 1)
model_data.dtypes

default              int64
duration_in_month    int64
credit_amount        int64
age                  int64
dtype: object

### Q3 Split Train/Test data 70:30 ratio( 1 Marks)
##### Hint:from sklearn.model_selection import train_test_split

In [36]:
# Split independent and dependent variables into X and y
from sklearn.model_selection import train_test_split
X = model_data.values[:,0:3]
y = model_data.values[:,3]
# Training and testing data split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70, random_state=1211)

### Q4 Build Random Forest Model( 1 Marks)
#### Hint:from sklearn.ensemble import RandomForestClassifier using n_jobs=2,n_estimators=500,criterion="entropy",random_state=9999

In [37]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_jobs=2,n_estimators=500, criterion="entropy",random_state=9999)
clf.fit(X_train, y_train)
print(clf.feature_importances_)

# Use the forest's predict method on the test data
predictions = clf.predict(X_test)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae)
print('Mean  absolute error', round(np.mean(errors), 2), ' Percentage')

# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Precision:', round(accuracy, 2), '%.')

[0.04681982 0.28240831 0.67077187]
Mean  absolute error 12.57  Percentage
Precision: 62.88 %.


### Q5 Calculate Confusion Matrix and Accuracy score (1 Marks)
##### Hint: Use confusion_matrix and accuracy_score

In [38]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
conf_mat = confusion_matrix(y_test, predictions)
print ("Confusion Matrix :: ",(conf_mat))
# Train and Test Accuracy
print ("Train Accuracy :: ", accuracy_score(y_train, clf.predict(X_train)))
print ("Test Accuracy  :: ", accuracy_score(y_test, predictions))
 


Confusion Matrix ::  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Train Accuracy ::  0.9957142857142857
Test Accuracy  ::  0.04


### Q6 Show the list of the features importance( 1 Marks)

In [39]:
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

Feature ranking:
1. feature 2 (0.670772)
2. feature 1 (0.282408)
3. feature 0 (0.046820)


### Q7 K-fold cross-validation( 2 Marks)
##### k-fold cross validation( without stratification)
##### Usually k is set as 10-20 in practical settings, depends on data set size

In [40]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [41]:
# Use below values
num_folds = 10
seed = 77

In [42]:
#Validate the Random Forest model build above using k 
score = cross_val_score(clf, X, y, cv=10)
print(score)

[0.024      0.01652893 0.         0.02777778 0.03960396 0.
 0.03191489 0.03488372 0.0625     0.05263158]


In [0]:
#Calculate Mean score

In [43]:
mean_score = np.mean(score)
print("The mean score on cross validation is", mean_score)


The mean score on cross validation is 0.028984085728827436


In [0]:
# Calculate score standard deviation using std()

In [44]:
std_score = np.std(score)
print("The std dev of the score on cross validation is", std_score)

The std dev of the score on cross validation is 0.0192361099683011


# Q8 Print the confusion matrix( 1 Marks)

In [45]:
conf_mat = confusion_matrix(y_test, predictions)
print(conf_mat)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# Q9.Classification accuracy: 
percentage of correct predictions and Calculate sensitivity (or True Positive Rate or Recall) and Precision.
(2 Marks)

In [46]:
# calculate classification accuracy
from sklearn import metrics
print(metrics.accuracy_score(y_test, predictions))

0.04


In [47]:
# save confusion matrix and slice into four pieces
confusion = metrics.confusion_matrix(y_test, predictions)
print(confusion)
#[row, column]
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [48]:
sensitivity = TP / float(FN + TP)
print(sensitivity)

nan


In [49]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          19       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         6
          21       0.00      0.00      0.00         7
          22       0.00      0.00      0.00        10
          23       0.11      0.05      0.07        20
          24       0.00      0.00      0.00        13
          25       0.00      0.00      0.00         9
          26       0.00      0.00      0.00        17
          27       0.12      0.13      0.13        15
          28       0.07      0.08      0.07        13
          29       0.00      0.00      0.00        11
          30       0.00      0.00      0.00         8
          31       0.07      0.09      0.08        11
          32       0.00      0.00      0.00         6
          33       0.11      0.10      0.11        10
          34       0.00      0.00      0.00         7
          35       0.08      0.10      0.09        10
          36       0.00    

[link text](https://)### Bootstrapping ( Bonus and Optional)
##### Given a dataset of size n, a bootstrap sample is created by sampling n instances uniformly from the data (with/without replacement)
##### Create a model with each bootstrap sample and validate it with the test set
##### Final result is calculated by averaging the accuracy of models

In [50]:
# Number of iterations for bootstrapping
bootstrap_iteration = 10
accuracy = []

In [53]:
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

for i in range(bootstrap_iteration):
    X_, y_ = resample(X_train, y_train)
    clf.fit(X_, y_)
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_pred, y_test)
    accuracy.append(acc)

In [54]:
accuracy = np.array(accuracy)
print('Accuracy Score')
print('Avearge: ', accuracy.mean())
print('Standard deviation: ', accuracy.std())

Accuracy Score
Avearge:  0.03333333333333333
Standard deviation:  0.005055250296034367
