In [1]:
#By using decision tree classification, you can build models between a response variable and many feature variables.
#Make sure that each split gives us highest informational gain and results in least entropy ie decreases entropy after splitting by applying certain conditions 
''' collect_data
Prepare data-- feature scaling, encoding categorical data, feature selection
split dataset
build predictive model/algorithm
make prediction
evaluation--calculate errors
visualisation of data''' 
#importing dataset and set independent features matrix X and dependent variable vector y
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataset=pd.read_csv('Social_Network_Ads.csv')
print(dataset.shape)
X=dataset.iloc[ : , 2:4].values
y=dataset.iloc[ : , -1].values

(400, 5)


In [3]:
dataset.head(10)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
5,15728773,Male,27,58000,0
6,15598044,Female,27,84000,0
7,15694829,Female,32,150000,1
8,15600575,Male,25,33000,0
9,15727311,Female,35,65000,0


In [4]:
dataset.tail()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0
399,15594041,Female,49,36000,1


In [5]:
# DATA PREPROCESSING 

# 1. checking and handling missing values
print("No. of missing values in data:",dataset.isnull().sum()) #gives the number of np.NaN values in each column
#checks for missing values in the dataset--if any, replace with mean of the column

No. of missing values in data: User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64


Decision trees can handle categorical data so encoding is not reqd

Decision trees are not alogirthms based on Euclidean distance so we do not have to do feature scaling as it is reqd in Euclidean distance dependent ML algorithms

In [6]:
#SPLITTING DATASET INTO TRAINING AND TEST SET
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=0)

In [7]:
# APPLYING DECISION TREE CLASSIFICATION MODEL
# import DecisionTreeClassifier from sklearn
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier( criterion='entropy')  #entropy is the measure of randomness and we split in such a way to bring order into the classifications after split and thus reduce entropy
classifier.fit(X_train,y_train)   
# Informational gain = entropy(before split) - entropy(after split)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [8]:
# PREDICTING TEST SET RESULTS
# prediction with sklearn
y_pred=classifier.predict(X_test) #predict() method is used to make test set predictions
y_pred #y_pred stores the predicted value of test set target output

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [9]:
# COMPARING PREDICTED OUTPUT VALUES y_pred WITH ACTUAL TARGET TESTING SET VALUES y_test
df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
print(df.head(60))
print(df.tail(20))

    Actual  Predicted
0        0          0
1        0          0
2        0          0
3        0          0
4        0          0
5        0          0
6        0          0
7        1          1
8        0          0
9        0          0
10       0          0
11       0          0
12       0          0
13       0          1
14       0          0
15       0          1
16       0          1
17       0          0
18       1          1
19       0          0
20       0          0
21       1          1
22       0          0
23       1          1
24       0          0
25       1          0
26       0          0
27       0          0
28       0          0
29       0          0
30       0          0
31       1          0
32       1          1
33       0          0
34       0          0
35       0          0
36       0          0
37       0          0
38       0          0
39       1          1
40       0          0
41       0          0
42       0          0
43       0          0
44       1

Error and Accuracy Prediction --> Confusion Matrix

Since decision trees create boundaries and not compare predicted and actual values based on Euclidean distance, we cannot use r2_score or mean absolute error

In [10]:
# CALCULTING ERROR AND ACCURACY OF PREDICTION MODEL USING CONFUSION MATRIX
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(y_test,y_pred)
''' [ TP FN
      FP TN]'''
print("Confusion Matrix:",cm)
print('Training score:',classifier.score(X_train,y_train)*100)
print('Testing score:',classifier.score(X_test,y_test)*100)
print("Importance of each feature:" ,classifier.feature_importances_) 
#Features included are Age and Estimated Salary

Confusion Matrix: [[53  5]
 [ 3 19]]
Training score: 99.6875
Testing score: 90.0
Importance of each feature: [0.46867356 0.53132644]


In [None]:
'''from sklearn.tree import export_graphviz #graphviz works only on 32 bit python version
import graphviz
import pydotplus
from IPython.display import Image 

dot_data=export_graphviz(classifier,
               out_file = None,
               feature_names = X,
               impurity=False, # impurity- 1 - (gini index)
               class_names = y,
               filled=True #applies color to the decision tree graph
               ) 

graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())'''

In [None]:
# VISUALISE TRAINING SET RESULTS
from matplotlib.colors import ListedColormap
plt.figure(figsize = (7,7))
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01), np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape), alpha = 0.75, cmap = ListedColormap(('black', 'white')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
 plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], c = ListedColormap(('red', 'orange'))(i), label = j)
plt.title('Social_Network_Ads')
plt.xlabel('Age and Estimated Salary')
plt.ylabel('Purchased or not?')
plt.legend()
plt.show()

The decision tree boundary for the test set and training set will be the same only

In [None]:
# VISUALISE TEST SET RESULTS
from matplotlib.colors import ListedColormap
plt.figure(figsize = (7,7))
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01), np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape), alpha = 0.75, cmap = ListedColormap(('black', 'white')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
 plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], c = ListedColormap(('red', 'orange'))(i), label = j)
plt.title('Social_Network_Ads')
plt.xlabel('Age and Estimated Salary')
plt.ylabel('Purchased or not?')
plt.legend()
plt.show()