In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
# NOTE(review): no category or module filter is given, so this hides every
# warning raised for the rest of the session, not just known-noisy ones.
warnings.filterwarnings('ignore')

In [2]:
# The CSV has no header row, so pandas numbers the columns 0..6.
df = pd.read_csv("car_evaluation.csv", header=None)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1728, 7)

In [4]:
# Replace the positional column labels with descriptive names;
# the last column is the target.
col_names = [
    'buying', 'maint', 'doors', 'persons',
    'lug_boot', 'safety', 'class',
]
df.columns = col_names
df.head(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
# Print a frequency table for every column, one Series per column.
for _, column in df.items():
    print(column.value_counts())

med      432
low      432
high     432
vhigh    432
Name: buying, dtype: int64
med      432
low      432
high     432
vhigh    432
Name: maint, dtype: int64
4        432
2        432
3        432
5more    432
Name: doors, dtype: int64
more    576
4       576
2       576
Name: persons, dtype: int64
med      576
small    576
big      576
Name: lug_boot, dtype: int64
med     576
low     576
high    576
Name: safety, dtype: int64
unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64


In [6]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [7]:
# Summarise each column's dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [8]:
# Separate the predictors from the target label.
X = df.drop(columns=["class"])
y = df["class"]

In [9]:
# Feature matrix dimensions: every column of df except the target.
X.shape

(1728, 6)

In [10]:
# Target vector length; should equal the number of rows in X.
y.shape

(1728,)

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, with a fixed random_state so the
# split is repeatable.
# NOTE(review): no `stratify=` is passed, so class proportions in the two
# splits are left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape

((1382, 6), (346, 6))

# Feature Engineering

In [16]:
import category_encoders as ce

# Pass the columns by keyword: OrdinalEncoder's first positional parameter is
# `verbose`, so a bare positional list is taken as the verbosity setting and
# never reaches `cols`.
# NOTE(review): codes appear to follow order of first appearance in X_train
# (its first row encodes to all 1s), not a low < med < high ordering; supply
# `mapping=` if a semantic order is wanted.
encoder = ce.OrdinalEncoder(
    cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
)
# Learn the category -> integer mapping from the training split only, then
# apply that same mapping to the test split.
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [17]:
# Peek at the training features after encoding.
X_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
107,1,1,1,1,1,1
901,2,1,2,2,2,2
1709,3,2,1,3,1,1
706,4,3,3,3,3,2
678,4,3,2,3,3,3


In [18]:
# Peek at the test features after applying the fitted encoder.
X_test.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
599,4,4,3,3,3,1
1201,2,2,4,2,3,2
628,4,4,1,3,1,2
1498,3,4,1,2,3,2
1263,2,2,3,1,3,3


# Decision Tree Classifier with criterion gini index

In [19]:
from sklearn.tree import DecisionTreeClassifier

In [21]:
# Depth-3 tree split on Gini impurity; random_state is fixed so refits are
# repeatable.
clf_gini = DecisionTreeClassifier(
    criterion="gini",
    max_depth=3,
    random_state=0,
)
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=0)

In [22]:
# Predicted class labels for the held-out test rows (gini tree).
y_pred_gini = clf_gini.predict(X_test)

In [23]:
from sklearn.metrics import accuracy_score

In [26]:
# Share of test rows the gini tree labels correctly.
accuracy_score(y_true=y_test, y_pred=y_pred_gini)

0.8179190751445087

# Compare the train-set and test-set accuracy

In [25]:
# Predictions on the rows the gini tree was fitted on, for a train-vs-test comparison.
y_pred_gini_train = clf_gini.predict(X_train)

In [27]:
# Share of training rows the gini tree labels correctly.
accuracy_score(y_true=y_train, y_pred=y_pred_gini_train)

0.8024602026049205

Here, the training-set accuracy is 0.8025 while the test-set accuracy is 0.8179. These two values are quite comparable, so there is no sign of overfitting.

# Decision Tree Classifier with criterion entropy

In [28]:
# Same depth-3 tree, but split on information gain (entropy); random_state
# is fixed so refits are repeatable.
clf_entropy = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=3,
    random_state=0,
)
clf_entropy.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)

In [31]:
# Predicted class labels for the held-out test rows (entropy tree).
y_pred_entropy = clf_entropy.predict(X_test)

In [32]:
# Share of test rows the entropy tree labels correctly.
accuracy_score(y_true=y_test, y_pred=y_pred_entropy)

0.8179190751445087

# Compare the train-set and test-set accuracy

In [33]:
# Predictions on the rows the entropy tree was fitted on, for a train-vs-test comparison.
y_pred_entropy_train = clf_entropy.predict(X_train)

In [34]:
# Share of training rows the entropy tree labels correctly.
accuracy_score(y_true=y_train, y_pred=y_pred_entropy_train)

0.8024602026049205

Here, the training-set accuracy is 0.8025 while the test-set accuracy is 0.8179. These two values are quite comparable, so there is no sign of overfitting.

# Confusion Matrix

In [35]:
from sklearn.metrics import confusion_matrix

# Rows are true classes and columns are predicted classes, for the entropy
# tree on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_entropy)
print(cm)

[[ 65   0  18   0]
 [ 11   0   0   0]
 [ 17   0 218   0]
 [ 17   0   0   0]]


# Classification Report

In [38]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the entropy tree on the test split.
cr = classification_report(y_true=y_test, y_pred=y_pred_entropy)
print(cr)

              precision    recall  f1-score   support

         acc       0.59      0.78      0.67        83
        good       0.00      0.00      0.00        11
       unacc       0.92      0.93      0.93       235
       vgood       0.00      0.00      0.00        17

    accuracy                           0.82       346
   macro avg       0.38      0.43      0.40       346
weighted avg       0.77      0.82      0.79       346



In [39]:
# Score training row 1709 by selecting it from X_train rather than retyping
# its encoded values by hand. The input then stays in sync with the encoder
# and carries the same column names the model was fitted with. The double
# brackets keep the selection a one-row DataFrame.
clf_entropy.predict(X_train.loc[[1709]])

array(['unacc'], dtype=object)

In [40]:
# True label of the row with index label 1709, to compare with the prediction.
y_train.loc[1709]

'unacc'