In [1]:
#Social_Network_Ads.csv

**Importing the libraries**

In [2]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import datasets, neighbors
from mlxtend.plotting import plot_decision_regions 
from sklearn.model_selection import cross_val_score 

**Importing the dataset**

In [3]:
ds = pd.read_csv('Social_Network_Ads.csv')
ds.head()
ds = ds.drop('User ID', axis=1)
ds

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0
...,...,...,...,...
395,Female,46,41000,1
396,Male,51,23000,1
397,Female,50,20000,1
398,Male,36,33000,0


In [4]:
ds['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [5]:
ds.isnull().sum()

Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

**Splitting the dataset into the Training set and Test set**

In [6]:
feature = ds.drop('Purchased', axis = 1)

target = ds['Purchased']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature, target, shuffle = True, test_size=0.2, random_state=1)

print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
print('Shape of testing label:', y_test.shape)

Shape of training feature: (320, 3)
Shape of testing feature: (80, 3)
Shape of training label: (320,)
Shape of testing label: (80,)


**Feature Scaling**

In [7]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
cat_cols = ['Gender']

# Encode Categorical Data
df_encoded = pd.DataFrame(encoder.fit_transform(ds[cat_cols]))
df_encoded.columns = encoder.get_feature_names(cat_cols)

# Replace Categotical Data with Encoded Data
ds = ds.drop(cat_cols ,axis=1)
ds = pd.concat([df_encoded, ds], axis=1)



In [8]:
from sklearn.preprocessing import StandardScaler

# Copying original dataframe
ds_new = ds.copy()

scaler = StandardScaler()
num_cols = ['Age', 'EstimatedSalary']
ds_new[num_cols] = scaler.fit_transform(ds[num_cols])

ds_new.head()

Unnamed: 0,Gender_Female,Gender_Male,Age,EstimatedSalary,Purchased
0,0.0,1.0,-1.781797,-1.490046,0
1,0.0,1.0,-0.253587,-1.460681,0
2,1.0,0.0,-1.113206,-0.78529,0
3,1.0,0.0,-1.017692,-0.374182,0
4,0.0,1.0,-1.781797,0.183751,0


**Fitting Decision Tree Classification to the Training set**

In [9]:
def evaluate_model(model, x_test, y_test):
    from sklearn import metrics

    # Predict Test Data 
    y_pred = model.predict(x_test)

    # Calculate accuracy, precision, recall, f1-score, and kappa score
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)

    # Display confussion matrix
    cm = metrics.confusion_matrix(y_test, y_pred)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 
             'cm': cm}

In [10]:
feature = ds.drop('Purchased', axis=1)

# Select Target
target = ds['Purchased']

# Set Training and Testing Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature , target, 
                                                    shuffle = True, 
                                                    test_size=0.2, 
                                                    random_state=1)

# Show the Training and Testing Data
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
print('Shape of testing label:', y_test.shape)

dtc = tree.DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)

Shape of training feature: (320, 4)
Shape of testing feature: (80, 4)
Shape of training label: (320,)
Shape of testing label: (80,)


DecisionTreeClassifier(random_state=0)

**Predicting the Test set results**

In [11]:
# Evaluate Model
dtc_eval = evaluate_model(dtc, X_test, y_test)

# Print result
print('Accuracy:', dtc_eval['acc'])
print('Precision:', dtc_eval['prec'])
print('Recall:', dtc_eval['rec'])
print('F1 Score:', dtc_eval['f1'])

Accuracy: 0.775
Precision: 0.71875
Recall: 0.71875
F1 Score: 0.71875


**Making the Confusion Matrix**

In [12]:
print('Confusion Matrix:\n', dtc_eval['cm'])

Confusion Matrix:
 [[39  9]
 [ 9 23]]


**Visualising the Training set results**

In [None]:
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, dtc.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('DT (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

**Visualising the Test set results**

In [None]:
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, dtc.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('DT (Testing set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()