# Classification and Regression Tree

$\underline{Problem Statement}$: Given the Independent variables, we have to predict whether the customer will respond to the marketing campaign or not.

In [1]:
# Importing the necessary Modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split # to split data into training and testing sets
from sklearn.model_selection import cross_val_score # for cross validation
from sklearn.metrics import confusion_matrix # to create a confusion matrix
from sklearn.metrics import plot_confusion_matrix # to draw a confusion matrix

In [2]:
import os
os.getcwd()

'C:\\Users\\vivek\\Google Drive\\Training\\Data Mining\\CART\\CART folder - Instructor using Python\\Class Demo Files\\Example 1'

In [3]:
df = pd.read_csv('DATASET - Original Copy.csv')
df.head()

Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period
0,C16505,0,41,M,91519.92,SELF-EMP,38,41-45,926,15
1,C17241,0,52,M,117288.96,SAL,17,>50,768,13
2,C18802,0,31,F,259827.44,SENP,8,31-35,816,5
3,C19289,0,45,F,26677.55,PROF,14,41-45,353,18
4,C14028,0,39,F,43440.31,SENP,1,36-40,751,31


Let us now check for the missing values.

In [4]:
df.isnull().sum()

Cust_ID           0
Target            0
Age               0
Gender            0
Balance           0
Occupation        0
No_OF_CR_TXNS     0
AGE_BKT           0
SCR               0
Holding_Period    0
dtype: int64

There are no missing values in the dataset and we can go ahead with building the model.

Let us check the number of rows and the number of columns in the dataframe.

In [5]:
print('The number of rows (observations) is:',df.shape[0],'\n''The number of columns(variables) is:',df.shape[1])

The number of rows (observations) is: 20000 
The number of columns(variables) is: 10


We will drop the 'Cust_ID' variable.
Also dropping AGE_BKT as we already have the Age as a seperate variable

In [6]:
df=df.drop(['Cust_ID','AGE_BKT'], axis=1)
df.head()

Unnamed: 0,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,SCR,Holding_Period
0,0,41,M,91519.92,SELF-EMP,38,926,15
1,0,52,M,117288.96,SAL,17,768,13
2,0,31,F,259827.44,SENP,8,816,5
3,0,45,F,26677.55,PROF,14,353,18
4,0,39,F,43440.31,SENP,1,751,31


Let us check the data types of each of the variables in the data.

In [7]:
df.dtypes

Target              int64
Age                 int64
Gender             object
Balance           float64
Occupation         object
No_OF_CR_TXNS       int64
SCR                 int64
Holding_Period      int64
dtype: object

There are two variables (Gender & Occupation) which has the object data type.

sklearn in Python does not take the input of object data types when building Classification Trees. So, we need to convert these variables into some numerical form.

We have a choice of converting objects into categories if there are only two levels in a variable like Gender (Male / Female) or if the data type is suuposedly ordinal in nature whereby assigning numbers will represent their corresponding weightage.

The category data type in pandas is a $\underline{hybrid}$ data type. It looks and behaves like a string in many instances but internally is represented by an array of integers. This way, Python will treat it as a numerical variable. An interesting article worth reading
https://benalexkeen.com/mapping-categorical-data-in-pandas/


If this is not the case where the catagory is nominal, One hot encoding is the recommended way forward.

The following code is provided to you to convert the 'object' type variables into categories(Hybrid data types) to numerical variables by assigning ranks/numbers to each category. Though we are not using it here in this case. Our Categoricalvariables have multiple levels and therefore "One hot encoding it is"

In [8]:
#We could use the following code snippet in the loop. 
#df['Gender']=pd.Categorical(df['Gender']).codes #code used for assigning numerical value to each category

You can read more about the Categorical function in the pandas library 

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Categorical.html

https://pbpython.com/pandas_dtypes_cat.html

In [9]:
#Not using this code here hence commented them
#for i in df.columns:#looping through all of the columns or variables
 #   if df[i].dtype == 'object': #checking if the data type of the variable is 'object'
  #      df[i] = pd.Categorical(df[i]).codes #converting the identified variable into Categorical and extracting the ranks

In [10]:
#We are prepping the data by segregating them into Target and independent variables to runt his model going forward

# Let us define the X(predictor) and Y(target) variables

X = df.drop("Target" , axis=1)

Y = df.pop("Target")
#we have made a copy of the data frame as the 'pop' function removes that particular variable from the data frame and stores 
# in another variable

In [11]:
# This line of code is to perform one hot encoding for Categorical Features
X = pd.get_dummies( X, drop_first = False )

In [12]:
list(X.columns)

['Age',
 'Balance',
 'No_OF_CR_TXNS',
 'SCR',
 'Holding_Period',
 'Gender_F',
 'Gender_M',
 'Gender_O',
 'Occupation_PROF',
 'Occupation_SAL',
 'Occupation_SELF-EMP',
 'Occupation_SENP']

In [13]:
X.dtypes

Age                      int64
Balance                float64
No_OF_CR_TXNS            int64
SCR                      int64
Holding_Period           int64
Gender_F                 uint8
Gender_M                 uint8
Gender_O                 uint8
Occupation_PROF          uint8
Occupation_SAL           uint8
Occupation_SELF-EMP      uint8
Occupation_SENP          uint8
dtype: object

We see that the data types of all the variables have been changed to either of $\underline{int64}$ or $\underline{uint8}$ or $\underline{float64}$.

In [14]:
#You shall notice One hot encoding converts catagorical variable (Levels) into seperate columns with binary values 
X.head()

Unnamed: 0,Age,Balance,No_OF_CR_TXNS,SCR,Holding_Period,Gender_F,Gender_M,Gender_O,Occupation_PROF,Occupation_SAL,Occupation_SELF-EMP,Occupation_SENP
0,41,91519.92,38,926,15,0,1,0,0,0,1,0
1,52,117288.96,17,768,13,0,1,0,0,1,0,0
2,31,259827.44,8,816,5,1,0,0,0,0,0,1
3,45,26677.55,14,353,18,1,0,0,1,0,0,0
4,39,43440.31,1,751,31,1,0,0,0,0,0,1


## Split the data into Train and Test.

Before building the model we should split the data into Train and Test. We will thus build a model on the training data and use this model to predict on the test data.

We will be doing a 70:30 split.
70% of the whole data will be used to train the data and then 30% of the data will be used for testing the model thus built.

Before splitting the data, we shall make a copy of the data frame.

In [15]:
data = df.copy()
data.head()
#Just keepipng the copy of the original dataset.

Unnamed: 0,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,SCR,Holding_Period
0,41,M,91519.92,SELF-EMP,38,926,15
1,52,M,117288.96,SAL,17,768,13
2,31,F,259827.44,SENP,8,816,5
3,45,F,26677.55,PROF,14,353,18
4,39,F,43440.31,SENP,1,751,31


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.30, random_state=1)

Now that we have split the data into Train and Test, let us go ahead and build our Decision Tree Model.

## Building the Decision Tree

We will start by building a very basic Decision Tree model.

In [17]:
from sklearn import tree

In [18]:
dt_model = tree.DecisionTreeClassifier(criterion = 'gini',random_state=1)

In the above code snippet we have defined a Decision Tree (which is to be used for classification problems) with the splitting criteria for each node as 'gini'. The 'random_state' parameter ensures that each time we run the code snippet the values remains the same. 

In the above code snippet default values of 'min_samples_split' and 'min_samples_leaf' is taken as 2 and 1 respectively.

To understand about the various other parameters that can be passed into this DecisionTreeClassifier function please refer to the scikitlearn documentation over $\href{https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html}{here}$.

Now, that we have defined a Decision Tree, let us go ahead and build the model on the training data.

In [19]:
dt_model.fit(X_train, Y_train)

DecisionTreeClassifier(random_state=1)

## Visualizing the Decision Tree

Now, that we have built the tree let us go ahead and visualize the tree to understand the various nuances of the Classification Tree that we just built.

In [20]:
train_char_label = ['No', 'Yes']# defining the classes of the target variable for the ease of the 

Now, we need to create a dot file which will contains all the instructions on how build this graphical visualization of the Classification Tree that we had built. In the following code, inside the Open command, please replace the path with your working directory.

C:\\Users\\vivek\\OneDrive\\Desktop\\Decission Tree 19th April 2020\\Example 1

In [21]:
os.getcwd()

'C:\\Users\\vivek\\Google Drive\\Training\\Data Mining\\CART\\CART folder - Instructor using Python\\Class Demo Files\\Example 1'

In [22]:
#We are opening a 'dot' file by the name of 'decision_tree' in the working directory and we are providing the write permission 
#to this particular file.
CART_File = open('C:\\Users\\vivek\\Google Drive\\Training\\Data Mining\\CART\\Python Codes Decission Tree\\Example 1\decision_tree.dot','w')


#Now that we have created the 'dot' file, we need to pass the appropriate parameters to export the graphical visualization.

dot_data = tree.export_graphviz(dt_model, #passing the model that we had built earlier
                                out_file=CART_File, #the output should be saved in the 'dot' file that we created earlier
                                feature_names = list(X_train), #names of the independent variables
                                class_names = list(train_char_label)) #passing the names of the classes that we had defined

#After opening the dot file, all the instructions have been passed and written in the 'dot' file, so we will go ahead and 
#close the file
CART_File.close()


Let us check the file in the DWorking Directory by the name of 'decision_tree' and visualize it. 

We can go to the following $\href{http://www.webgraphviz.com/}{link}$ and paste the contents of the 'dot' file that we have created to visualize the Classification Tree.

In case the Decision Tree built is very large, the above link takes a lot of time to give us the output.

But the following codes helps us in visualizing a very large tree comparatively faster.

In [23]:
#If the below two libraries are not installed do install them using the following code snippet in the Jupyter Notebook
# !pip install 'package name'
import pydotplus
import graphviz

In [24]:
#Optional - Install only if the prev code throws error else skip running this line
!pip install pydotplus



You should consider upgrading via the 'c:\users\vivek\anaconda3\python.exe -m pip install --upgrade pip' command.


In [25]:
#Optional - Install only if the prev code throws error else skip running this line
!pip install graphviz



You should consider upgrading via the 'c:\users\vivek\anaconda3\python.exe -m pip install --upgrade pip' command.


Before running the below codes we need to make sure that the Graphviz software is installed on the system. Please refer to the following steps to understand how to install the Graphviz software:

1) Got to this $\href{https://www.graphviz.org/}{link}$ and click on the 'Download' tab.

2) After downloading the Graphviz software, install the Graphviz in the default location. You can change the location but then you have to edit the subsequent codes as well.

In [26]:
#The following is a modified version of the above code where we tried create a dot file and visualize

os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/' #setting the path 
dot_data = tree.export_graphviz(dt_model, out_file=None,#passing the model built and setting the output to None as we do
                                #not need the dot file separately to visualize the graph
                         feature_names=list(X_train), #names of the independent variables 
                         class_names=list(train_char_label),  
                         filled=True) #colours the nodes for classification for the ease of visualization

graph = pydotplus.graph_from_dot_data(dot_data) #extracting the visuals from the above file to plot it

In [27]:
from IPython.display import Image
Image(graph.create_png())#creating '.png' file 

InvocationException: GraphViz's executables not found

Since this tree is too large to visualize, we can right-click on the above visual and save it as an image to visualize it or we can pass the following code snippet to save it in the directory that has been set above.

In [None]:
graph.write_jpeg("tree.jpeg")

Let us check importance of the variables in the Classification Tree that we just built. The importance of a feature or variable is computed as the (normalized) total reduction of the gini criterion brought by that feature. It is also known as the Gini importance. 

In [28]:
pd.Series(dt_model.feature_importances_,index=X_train.columns).sort_values(ascending=False)

SCR                    0.242872
Balance                0.237118
No_OF_CR_TXNS          0.156245
Holding_Period         0.139365
Age                    0.132393
Occupation_SENP        0.022496
Gender_F               0.016805
Occupation_SELF-EMP    0.015588
Gender_M               0.013593
Occupation_PROF        0.012369
Occupation_SAL         0.008398
Gender_O               0.002759
dtype: float64

From the above output, we can see that 'SCR' is the most important variable followed by 'Balance' and so on.

Let us take a look at the overall accuracy of the train and test data using the model that we just built.

In [None]:
#Train Data
dt_model.score(X_train,Y_train)

In [None]:
#Test Data
dt_model.score(X_test,Y_test)

The accuracy on the Training Data is 100% and the accuracy on the Test Data is lesser substantially. The model has surely been overfitted. 
Thus, we need to prune or regularize the tree.

## Pruning/Regularizing the Tree

For Pruning/Regularizing the Tree we need to be sure as to what parameters and how to prune the tree.

# Method 1 for Pruning:

#### (by visualizing the tree)

In [None]:
reg_dt_model = tree.DecisionTreeClassifier(criterion = 'gini', 
                                       max_depth=13,#upto this depth is where the tree has grown uniformly
                                      min_samples_leaf=30, #ensures that every terminal node (leaf node) have at least 10
                                      #observations in it
                                      min_samples_split=10) #for every node to be split into two child nodes that particular
                                      #node should have at least 30 observations  
    
#Genral Thumb rule: 1% to 3% of the data should be the 'min_samples_split' and one third times the 'min_samples_split'
#should be 'min_samples_leaf.These are only a rough guideline value.
#Here, we have chose 1% of our training data which has 14,000 observations (140 observations for min_samples_split)

In [None]:
reg_dt_model.fit(X_train, Y_train)

Now that we have built the Pruned/Regularized Classification Tree let us visualize the tree to understand the nuances of the tree.

### Method 1:

In [None]:
# Define the path of the dot.file from the output of this code below
os.getcwd()

In [None]:
#We are opening a 'dot' file by the name of 'decision_tree' in the D:Drive and we are providing the write permission 
#to this particular file.
CART_File = open('C:\\Users\\vivek\\Google Drive\\Training\\Data Mining\\CART\\Python Codes Decission Tree\\Example 1\decision_tree.dot','w')

#Now that we have created the 'dot' file, we need to pass the appropriate parameters to export the graphical visualization.

dot_data = tree.export_graphviz(reg_dt_model, #passing the model that we had built earlier
                                out_file=CART_File, #the output should be saved in the 'dot' file that we created earlier
                                feature_names = list(X_train), #names of the independent variables
                                class_names = list(train_char_label)) #passing the names of the classes that we had defined

#After opening the dot file, all the instructions have been passed and written in the 'dot' file, so we will go ahead and 
#close the file
CART_File.close()


We can go to the following $\href{http://www.webgraphviz.com/}{link}$ and paste the contents of the 'dot' file that we have created to visualize the Classification Tree.

Since,we have pruned the tree we see that the tree can be visualized properly with the help of the above link.

### Method 2:

In [None]:
#The following is a modified version of the above code where we tried create a dot file and visualize

os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/' #setting the path 
dot_data = tree.export_graphviz(reg_dt_model, out_file=None,#passing the model built and setting the output to None as we do
                                #not need the dot file separately to visualize the graph
                         feature_names=list(X_train), #names of the independent variables 
                         class_names=list(train_char_label),  
                         filled=True) #colours the nodes for classification for the ease of visualization

graph = pydotplus.graph_from_dot_data(dot_data) #extracting the visuals from the above file to plot it

In [None]:
from IPython.display import Image
Image(graph.create_png())#creating '.png' file 

As mentioned, we can go ahead and save the above tree as a an image file.

In [None]:
graph.write_jpeg("tree.jpeg")

Let us now go ahead and predict both the classes and the probability values on the test data using the Pruned/Regularized Decision Tree.

In [None]:
#We are only predicting the classes over here. Python by default takes the 0.5 cutoff of the probability values while
#predicting the classes

Y_train_predict_class = reg_dt_model.predict(X_train)
Y_test_predict_class = reg_dt_model.predict(X_test)

In [None]:
#Here, we are predicting the probabilities and we can manually input a cutoff value which is different than 0.5.

Y_train_predict_prob = reg_dt_model.predict_proba(X_train)
Y_test_predict_prob = reg_dt_model.predict_proba(X_test)

# Evaluation of the Train and Test Models.

Let us first build the confusion matrix, followed by the Classification Report and then the ROC (Receiver Operator Charateristic Curve) curve and the AUC (Area Under the Curve) value.

In [None]:
from sklearn import metrics

First we will evaluate the model on the Training Data.

In [None]:
print(metrics.confusion_matrix(Y_train,Y_train_predict_class))

In [None]:
tn, fp, fn, tp = metrics.confusion_matrix(Y_train,Y_train_predict_class).ravel()
print('True Negative:',tn,'\n''False Positives:' ,fp,'\n''False Negatives:', fn,'\n''True Positives:', tp)

Let us now go ahead and print the classification report to check the various other parameters.

In [None]:
print(metrics.classification_report(Y_train,Y_train_predict_class))

We have been able to predict 92% of the target variables correctly.

We will now calculate the Area Under the Curve (AUC) of the Receiver Operating Characteristic (ROC) curve and plot the ROC curve as well.

# AUC and ROC for the training data


# calculating on the basis of a person not defaulting
# calculate AUC
auc = metrics.roc_auc_score(Y_train,Y_train_predict_prob[:, 1])#keeping only the probabilities for the desired class outcome
print('AUC: %.3f' % auc)
# # calculate roc curve
# from sklearn.metrics import roc_curve
fpr, tpr, thresholds = metrics.roc_curve(Y_train,Y_train_predict_prob[:, 1])#keeping only the probabilities for the desired 
#class outcome
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()

Let us check the confusion matrix for the test data.

In [None]:
print(metrics.confusion_matrix(Y_test,Y_test_predict_class),'\n')
tn, fp, fn, tp = metrics.confusion_matrix(Y_test,Y_test_predict_class).ravel()
print('True Negative:',tn,'\n''False Positives:' ,fp,'\n''False Negatives:', fn,'\n''True Positives:', tp)

Let us now go ahead and print the classification report for the test data and compare between train and test.

In [None]:
print(metrics.classification_report(Y_test,Y_test_predict_class))

We will now calculate the Area Under the Curve (AUC) of the Receiver Operating Characteristic (ROC) curve and plot the ROC curve for the Test data.

# AUC and ROC for the training data


# calculating on the basis of a person not defaulting
# calculate AUC
auc = metrics.roc_auc_score(Y_test,Y_test_predict_prob[:, 1])
print('AUC: %.3f' % auc)
# # calculate roc curve
# from sklearn.metrics import roc_curve
fpr, tpr, thresholds = metrics.roc_curve(Y_test,Y_test_predict_prob[:, 1])
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()

# END