In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
data = pd.read_csv('MyCreditData.csv')

In [3]:
data.head()

Unnamed: 0,checking_account,duration,credit_history,purpose,amount,savings_account,employment_duration,installment_rate,other_debtors,present_residence,...,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,gender,profit
0,3,18,0,2,1049,4,2,2,2,3,...,21,1,0,0,1,0,0,0,female,242
1,3,9,0,5,2799,4,0,1,2,0,...,36,1,0,1,1,1,0,0,male,596
2,0,12,4,8,841,0,1,1,2,3,...,23,1,0,0,3,0,0,0,female,25
3,3,12,0,5,2122,4,0,0,2,0,...,39,1,0,1,3,1,0,1,male,568
4,3,12,0,5,2171,4,0,2,2,3,...,38,0,2,1,3,0,0,1,male,782


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   checking_account         1000 non-null   int64 
 1   duration                 1000 non-null   int64 
 2   credit_history           1000 non-null   int64 
 3   purpose                  1000 non-null   int64 
 4   amount                   1000 non-null   int64 
 5   savings_account          1000 non-null   int64 
 6   employment_duration      1000 non-null   int64 
 7   installment_rate         1000 non-null   int64 
 8   other_debtors            1000 non-null   int64 
 9   present_residence        1000 non-null   int64 
 10  property                 1000 non-null   int64 
 11  age                      1000 non-null   int64 
 12  other_installment_plans  1000 non-null   int64 
 13  housing                  1000 non-null   int64 
 14  number_credits           1000 non-null   

In [5]:
data.isnull().sum()

checking_account           0
duration                   0
credit_history             0
purpose                    0
amount                     0
savings_account            0
employment_duration        0
installment_rate           0
other_debtors              0
present_residence          0
property                   0
age                        0
other_installment_plans    0
housing                    0
number_credits             0
job                        0
people_liable              0
telephone                  0
foreign_worker             0
gender                     0
profit                     0
dtype: int64

In [6]:
data.columns

Index(['checking_account', 'duration', 'credit_history', 'purpose', 'amount',
       'savings_account', 'employment_duration', 'installment_rate',
       'other_debtors', 'present_residence', 'property', 'age',
       'other_installment_plans', 'housing', 'number_credits', 'job',
       'people_liable', 'telephone', 'foreign_worker', 'gender', 'profit'],
      dtype='object')

In [7]:
numerical_col = ['duration', 'amount', 'age', 'profit']

for column in data.columns:
    if column not in numerical_col:
        data[column] = pd.Categorical(data[column])

In [9]:
data.head()

Unnamed: 0,checking_account,duration,credit_history,purpose,amount,savings_account,employment_duration,installment_rate,other_debtors,present_residence,...,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,gender,profit
0,3,18,0,2,1049,4,2,2,2,3,...,21,1,0,0,1,0,0,0,female,242
1,3,9,0,5,2799,4,0,1,2,0,...,36,1,0,1,1,1,0,0,male,596
2,0,12,4,8,841,0,1,1,2,3,...,23,1,0,0,3,0,0,0,female,25
3,3,12,0,5,2122,4,0,0,2,0,...,39,1,0,1,3,1,0,1,male,568
4,3,12,0,5,2171,4,0,2,2,3,...,38,0,2,1,3,0,0,1,male,782


#### Data Preparation for classification algorithms

In [8]:
data['profit']

0       242
1       596
2        25
3       568
4       782
       ... 
995    -582
996    -341
997   -1419
998   -1853
999   -3086
Name: profit, Length: 1000, dtype: int64

In [10]:
data['is_profitable'] = np.where(data['profit'] > 0, 1, 0)

In [11]:
data.head()

Unnamed: 0,checking_account,duration,credit_history,purpose,amount,savings_account,employment_duration,installment_rate,other_debtors,present_residence,...,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,gender,profit,is_profitable
0,3,18,0,2,1049,4,2,2,2,3,...,1,0,0,1,0,0,0,female,242,1
1,3,9,0,5,2799,4,0,1,2,0,...,1,0,1,1,1,0,0,male,596,1
2,0,12,4,8,841,0,1,1,2,3,...,1,0,0,3,0,0,0,female,25,1
3,3,12,0,5,2122,4,0,0,2,0,...,1,0,1,3,1,0,1,male,568,1
4,3,12,0,5,2171,4,0,2,2,3,...,0,2,1,3,0,0,1,male,782,1


In [12]:
X = data.drop(columns = ['profit', 'is_profitable']) 
y = data['is_profitable']

In [13]:
X = pd.get_dummies(X, drop_first = True)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)

### Applying Decision Tree to the data

In [56]:
dec_tree = DecisionTreeClassifier(max_depth = 6)
dec_tree.fit(X_train, y_train)

In [57]:
y_pred = dec_tree.predict(X_test)

In [58]:
accuracy_score(y_pred, y_test)

0.725

In [59]:
y_pred_train = dec_tree.predict(X_train)

In [60]:
accuracy_score(y_pred_train, y_train)

0.8475

### Pruning the tree using GridSearchCV

In [66]:
#base model
dec_tree_new = DecisionTreeClassifier(random_state = 42)

In [79]:
param_grid = {'max_depth': [4,5,6,7,8,9],
             'criterion': ['gini', 'entropy'],
             'min_samples_split': [10, 15, 20, 25, 30]}

In [68]:
grid = GridSearchCV(dec_tree_new, param_grid, scoring = 'roc_auc')

In [71]:
grid_search = grid.fit(X_train, y_train)

In [72]:
grid_search.best_params_

{'criterion': 'entropy', 'max_depth': 4}

In [73]:
dec_tree_new = DecisionTreeClassifier(random_state = 42, criterion = 'entropy', max_depth = 4)

In [74]:
dec_tree_new.fit(X_train, y_train)

In [75]:
y_pred = dec_tree.predict(X_test)

In [76]:
accuracy_score(y_pred, y_test)

0.725

In [77]:
y_pred_train = dec_tree_new.predict(X_train)

In [78]:
accuracy_score(y_pred_train, y_train)

0.7575

### Apply Random Forest Classifier on the data

In [61]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

In [62]:
y_pred = rf_classifier.predict(X_test)

In [63]:
accuracy_score(y_pred, y_test)

0.75

In [64]:
y_pred_train = rf_classifier.predict(X_train)

In [65]:
accuracy_score(y_pred_train, y_train)

1.0