In [26]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [27]:
df = pd.read_csv('credit.csv')
df.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes


In [28]:
# Converting all 'object' columns into 'int' to make use of classifier models
for i in df.columns:
    if df[i].dtype == 'object':
        df[i] = pd.Categorical(df[i]).codes

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   checking_balance      1000 non-null   int8 
 1   months_loan_duration  1000 non-null   int64
 2   credit_history        1000 non-null   int8 
 3   purpose               1000 non-null   int8 
 4   amount                1000 non-null   int64
 5   savings_balance       1000 non-null   int8 
 6   employment_duration   1000 non-null   int8 
 7   percent_of_income     1000 non-null   int64
 8   years_at_residence    1000 non-null   int64
 9   age                   1000 non-null   int64
 10  other_credit          1000 non-null   int8 
 11  housing               1000 non-null   int8 
 12  existing_loans_count  1000 non-null   int64
 13  job                   1000 non-null   int8 
 14  dependents            1000 non-null   int64
 15  phone                 1000 non-null   int8 
 16  default

In [30]:
df['default'].value_counts()

default
0    700
1    300
Name: count, dtype: int64

Let's do Decision Tree classifier on this dataset in simple terms

In [31]:
X_train = df.drop("default",axis=1).head(700)

In [32]:
y_train = df[['default']].head(700)

In [33]:
X_test = df.drop('default',axis=1).tail(300)

In [34]:
y_test = df[['default']].tail(300)

In [35]:
model = DecisionTreeClassifier(criterion='entropy')
model.fit(X_train,y_train)

In [36]:
model.score(X_test,y_test)

0.67

In [37]:
model.score(X_train,y_train)   # Classic example of Overfit. Giving full accuracy on training set and dropping like a bomb on Test set

1.0

Bagging - Bootstrap Aggregation (Ensemble Learning)

In [38]:
# Splitting into train and test set not required
Independent_Labels = df.drop("default",axis=1)
Target_Label = df['default']

In [39]:
# Out of Box is by default false, we made it true so that we can get score of the model based on 37% data not used in training
# Max_Samples and Number of estimators can also be tweaked.
# This basically creates 50 variations of datasets using random generators, from each dataset, 80% of it is used for training a model.
# We would end up with 50 different models and ensemble/aggregation is taken to classify/predict. In this case unused data becomes test set.   
from sklearn.ensemble import BaggingClassifier
bgcl = BaggingClassifier(n_estimators=50, max_samples=0.8, oob_score=True)
bgcl.fit(Independent_Labels,Target_Label)

bgcl.oob_score_

0.745

Let's check score with regularized Decision Tree Model

In [40]:
model_reg = DecisionTreeClassifier(max_depth=5, max_leaf_nodes=5, criterion='entropy')
model_reg.fit(X_train,y_train)

In [41]:
print(model_reg.score(X_train,y_train))
print(model_reg.score(X_test,y_test))   # Relatively less overfitting. Also check 0.749 > 0.72. Ensemble worked better

0.7371428571428571
0.72


Ensemble Learning - AdaBoosting (Adaptive Boosting)

In [42]:
# Type of Ensemble Learning in which models are sequentially generated giving more weightage to incorrectly classified records
# from previous model iterations. By default 100 models are generated and classified as a whole. Below I have given 50
# Classifier by default is DecisionTree.
from sklearn.ensemble import AdaBoostClassifier
dt_model = DecisionTreeClassifier(max_depth=3)
model_ad = AdaBoostClassifier(n_estimators=50, estimator=dt_model)

model_ad.fit(X_train,y_train)
model_ad_predict = model_ad.predict(X_test)

  y = column_or_1d(y, warn=True)


In [43]:
model_ad.score(X_test,y_test)

0.7466666666666667

In [44]:
metrics.confusion_matrix(y_test,model_ad_predict)    # Not that good though

array([[180,  27],
       [ 49,  44]], dtype=int64)

Ensemble Learning - Gradient Boosting

In [45]:
from sklearn.ensemble import GradientBoostingClassifier
model_gb = GradientBoostingClassifier(n_estimators=50)

model_gb.fit(X_train,y_train)
model_gb_predict = model_gb.predict(X_test)

  y = column_or_1d(y, warn=True)


In [46]:
model_gb.score(X_test,y_test)

0.7566666666666667

In [47]:
metrics.confusion_matrix(y_test,model_gb_predict)

array([[185,  22],
       [ 51,  42]], dtype=int64)

Random Forest

In [48]:
# Since there is a chance of datasets in each variation being not so different from one another which might decrease the overall model score,
# Random Forests bring in an attempt to make these variations different. In each variation of dataset, certain number of columns are specifically selected
# randomly by which the model should use them inorder to divide it into different branches and make predictions. Since different columns set is selected
# each time, not only it ensures variation across all estimators but also helps in improving overall ensemble score.
# All estimators won't be doing similar errors to an extent  

from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(n_estimators=50,criterion='entropy',max_depth=5)

model_rf.fit(X_train,y_train)

  return fit_method(estimator, *args, **kwargs)


In [49]:
model_rf.score(X_test,y_test)

0.74

In [50]:
rf_predict = model_rf.predict(X_test)
metrics.confusion_matrix(y_test,rf_predict)

array([[197,  10],
       [ 68,  25]], dtype=int64)

In [None]:
fgfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffggggggggggggggggggggggggggggggggggggg