In [166]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,GridSearchCV
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np 
import seaborn as sns

In [167]:
# Load the Carseats data; the relative path is resolved against the kernel's working directory.
data = pd.read_csv('Carseats.csv')

In [168]:
# Plain-text preview of the first five rows to check that the columns loaded as expected.
print(data.head())


   Sales  CompPrice  Income  Advertising  Population  Price ShelveLoc  Age  \
0   9.50        138      73           11         276    120       Bad   42   
1  11.22        111      48           16         260     83      Good   65   
2  10.06        113      35           10         269     80    Medium   59   
3   7.40        117     100            4         466     97    Medium   55   
4   4.15        141      64            3         340    128       Bad   38   

   Education Urban   US  
0         17   Yes  Yes  
1         10   Yes  Yes  
2         12   Yes  Yes  
3         14   Yes  Yes  
4         13   Yes   No  


In [169]:
# Derive a binary label: 'Yes' when Sales is strictly above 8, otherwise 'No'.
sales_above_threshold = data['Sales'] > 8
data['HighSales'] = np.where(sales_above_threshold, 'Yes', 'No')


In [170]:
# Preview again to confirm the new HighSales column was added.
print(data.head())

   Sales  CompPrice  Income  Advertising  Population  Price ShelveLoc  Age  \
0   9.50        138      73           11         276    120       Bad   42   
1  11.22        111      48           16         260     83      Good   65   
2  10.06        113      35           10         269     80    Medium   59   
3   7.40        117     100            4         466     97    Medium   55   
4   4.15        141      64            3         340    128       Bad   38   

   Education Urban   US HighSales  
0         17   Yes  Yes       Yes  
1         10   Yes  Yes       Yes  
2         12   Yes  Yes       Yes  
3         14   Yes  Yes        No  
4         13   Yes   No        No  


In [171]:
# Replace the text-valued feature columns with integer category codes.
# Codes follow the sorted order of the distinct values, so ShelveLoc becomes
# Bad=0, Good=1, Medium=2 (alphabetical, not quality order) and Yes/No become 1/0.
# 'US' is deliberately left as text because it is used as the target below.
for column in ('ShelveLoc', 'Urban', 'HighSales'):
    data[column] = pd.Categorical(data[column]).codes
print(data.head())

   Sales  CompPrice  Income  Advertising  Population  Price  ShelveLoc  Age  \
0   9.50        138      73           11         276    120          0   42   
1  11.22        111      48           16         260     83          1   65   
2  10.06        113      35           10         269     80          2   59   
3   7.40        117     100            4         466     97          2   55   
4   4.15        141      64            3         340    128          0   38   

   Education  Urban   US  HighSales  
0         17      1  Yes          1  
1         10      1  Yes          1  
2         12      1  Yes          1  
3         14      1  Yes          0  
4         13      1   No          0  


In [172]:
# Target: the 'US' column. Features: every other column.
# NOTE(review): X therefore contains both Sales and HighSales, and HighSales is
# derived directly from Sales (Sales > 8) — confirm that keeping both is intended.
X = data.drop(columns='US')
y = data['US']


In [173]:
# Hold out 40% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=20,
)


In [174]:
# Sanity-check the split sizes: features first (train, test), then targets (train, test).
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(240, 11)
(160, 11)
(240,)
(160,)


In [175]:
# Baseline: a tree with no depth limit, grown until its leaves are pure.
# random_state is fixed (same seed as the train/test split) because scikit-learn
# permutes the features at every split; without a seed, ties between equally good
# splits are broken differently on each run and the reported accuracy drifts.
dt_withoutpruining = DecisionTreeClassifier(random_state=20)

In [176]:
# Grow the unrestricted tree on the training split.
dt_withoutpruining.fit(X=X_train, y=y_train)

In [177]:
# Score the unrestricted tree on the held-out test split.
y_pred_without_pruning = dt_withoutpruining.predict(X_test)
accuracy_without_pruning = accuracy_score(
    y_true=y_test,
    y_pred=y_pred_without_pruning,
)
print("Accuracy without pre-pruning:", accuracy_without_pruning)

Accuracy without pre-pruning: 0.83125


### Pre-Pruning



In [178]:
# Pre-pruned tree: growth is stopped once the tree reaches depth 3.
# random_state is fixed (same seed as the train/test split) so that ties between
# equally good splits are resolved the same way on every run.
dt_withprepruining = DecisionTreeClassifier(max_depth=3, random_state=20)

In [179]:
# Fit the depth-limited tree on the training split.
dt_withprepruining.fit(X=X_train, y=y_train)

In [180]:
# Compare training and test accuracy for the depth-limited tree; a large gap
# between the two would indicate that the tree is still overfitting.
train_accuracy_with_prepruning = dt_withprepruining.score(X_train, y_train)
y_pred_with_prepruning = dt_withprepruining.predict(X_test)
accuracy_with_prepruning = accuracy_score(
    y_true=y_test,
    y_pred=y_pred_with_prepruning,
)
print("Accuracy with pre-pruning on training set :", train_accuracy_with_prepruning)
print("Accuracy with pre-pruning on test set:", accuracy_with_prepruning)

Accuracy with pre-pruning on training set : 0.9375
Accuracy with pre-pruning on test set: 0.875


In [181]:
from sklearn.model_selection import cross_val_score
import numpy as np


In [182]:
# Candidate values for max_depth: every integer depth from 1 through 10.
shallowest_depth, deepest_depth = 1, 10
max_depth_values = range(shallowest_depth, deepest_depth + 1)

In [183]:
# Mean 5-fold cross-validated accuracy on the training split for each candidate depth.
# The tree is seeded so that repeated runs give the same scores and therefore the
# same selected depth; with an integer cv the fold assignment itself is not shuffled,
# so the unseeded tree was the remaining source of run-to-run variation.
cv_scores = []
for max_depth in max_depth_values:
    tree_cv = DecisionTreeClassifier(max_depth=max_depth, random_state=20)
    scores = cross_val_score(tree_cv, X_train, y_train, cv=5)
    cv_scores.append(np.mean(scores))

In [184]:
# Mean cross-validated accuracy per candidate depth, in the same order as max_depth_values.
cv_scores

[0.9291666666666666,
 0.9291666666666666,
 0.9041666666666666,
 0.8958333333333333,
 0.8791666666666667,
 0.8666666666666668,
 0.8666666666666668,
 0.875,
 0.8791666666666667,
 0.875]

In [185]:
# Position of the highest mean CV score; when scores tie, np.argmax returns the first one,
# which here means the shallowest of the tied depths.
np.argmax(cv_scores)

0

In [186]:
# Translate that position into the max_depth value it corresponds to.
max_depth_values[np.argmax(cv_scores)]

1

In [187]:
# Keep the depth with the best mean CV accuracy (first maximum wins on ties).
best_index = np.argmax(cv_scores)
best_max_depth = max_depth_values[best_index]

In [188]:
# Report the depth chosen by the cross-validation search.
print("Best max depth selected through cross-validation:", best_max_depth)

Best max depth selected through cross-validation: 1


In [189]:
# Refit on the whole training split with the depth chosen by cross-validation,
# then score on the held-out test split.
# NOTE: capping max_depth stops the tree while it is growing, i.e. this is
# pre-pruning with a tuned depth. Post-pruning in scikit-learn is cost-complexity
# pruning (the ccp_alpha parameter). The *_post_pruned variable names are kept
# unchanged so that any later cells referring to them keep working.
# random_state is fixed (same seed as the train/test split) for reproducible results.
tree_post_pruned = DecisionTreeClassifier(max_depth=best_max_depth, random_state=20)
tree_post_pruned.fit(X_train, y_train)
y_pred_post_pruned = tree_post_pruned.predict(X_test)
accuracy_post_pruned = accuracy_score(y_test, y_pred_post_pruned)
print("Accuracy with cross-validated max_depth (pre-pruning):", accuracy_post_pruned)

Accuracy with post-pruning (cross-validation): 0.88125


In [190]:
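# NOTE(review): the commented-out block below appears to be left over from another notebook:
# it references `clf` and `feature_cols`, which are not defined in the cells shown here, and
# writes 'diabetes.png'. Adapt it to one of the trees fitted above (or use
# sklearn.tree.plot_tree) before re-enabling, otherwise delete it.
#from sklearn.tree import export_graphviz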
#from sklearn.externals.six import StringIO  
#from IPython.display import Image  
#import pydotplus

#dot_data = StringIO()
#export_graphviz(clf, out_file=dot_data,filled=True, rounded=True,special_characters=True,feature_names = feature_cols,class_names=['0','1'])
#graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
#graph.write_png('diabetes.png')
#Image(graph.create_png())
