**The problem statement**
The problem is to predict the safety of the car. In this project, I build a Decision Tree Classifier to predict the safety of the car. I implement Decision Tree Classification with Python and Scikit-Learn. I have used the Car Evaluation Data Set for this project, downloaded from the UCI Machine Learning Repository website.


In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
df=pd.read_csv('/content/car_evaluation.csv')
df.head()

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [None]:
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df.columns = col_names

In [None]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1727 non-null   object
 1   maint     1727 non-null   object
 2   doors     1727 non-null   object
 3   persons   1727 non-null   object
 4   lug_boot  1727 non-null   object
 5   safety    1727 non-null   object
 6   class     1727 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [None]:
df.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'], dtype='object')

In [None]:
df.shape

(1727, 7)

In [14]:
df['buying'].value_counts()

high     432
med      432
low      432
vhigh    431
Name: buying, dtype: int64

In [18]:
for col in col_names:
  print(df[col].value_counts())
  print('*'*30)

high     432
med      432
low      432
vhigh    431
Name: buying, dtype: int64
******************************
high     432
med      432
low      432
vhigh    431
Name: maint, dtype: int64
******************************
3        432
4        432
5more    432
2        431
Name: doors, dtype: int64
******************************
4       576
more    576
2       575
Name: persons, dtype: int64
******************************
med      576
big      576
small    575
Name: lug_boot, dtype: int64
******************************
med     576
high    576
low     575
Name: safety, dtype: int64
******************************
unacc    1209
acc       384
good       69
vgood      65
Name: class, dtype: int64
******************************


In [19]:
df.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

Declare feature vector and target variable


In [26]:
X=df.drop(['class'],axis=1)
Y=df['class']

In [27]:
X.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,med
1,vhigh,vhigh,2,2,small,high
2,vhigh,vhigh,2,2,med,low
3,vhigh,vhigh,2,2,med,med
4,vhigh,vhigh,2,2,med,high


In [28]:
Y.head()

0    unacc
1    unacc
2    unacc
3    unacc
4    unacc
Name: class, dtype: object

# **Split data into separate training and test set**

In [29]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=30)

In [31]:
x_train.shape,x_test.shape

((1208, 6), (519, 6))

In [32]:
y_train.shape,y_test.shape

((1208,), (519,))

# **Feature Engineering**

In [34]:
!pip install category_encoders
import category_encoders as ce

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [39]:
encoder=ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
x_train=encoder.fit_transform(x_train)
x_test=encoder.transform(x_test)

In [40]:
x_train.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
56,1,1,1,1,1,1
210,1,2,2,2,1,2
330,1,3,3,1,2,2
1186,2,4,2,2,2,3
1263,2,3,1,2,1,2


# **Random Forest Classifier model with default parameters**

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

random=RandomForestClassifier()
random.fit(x_train,y_train)

y_pred=random.predict(x_test)

acc=accuracy_score(y_test,y_pred)

acc

0.9730250481695568

In [45]:
d={}
for t in [1,10,20,50,60,70,80,100,500,1000]:
  for t in [2,5,7,11,14,20]:
    random=RandomForestClassifier(n_estimators = t, random_state = 101)
    random.fit(x_train,y_train)
    y_pred=random.predict(x_test)
    acc=accuracy_score(y_test,y_pred)
    print(t," ",acc)
  print('*'*50)

2   0.9267822736030829
5   0.953757225433526
7   0.9556840077071291
11   0.9653179190751445
14   0.9672447013487476
20   0.9730250481695568
**************************************************
2   0.9267822736030829
5   0.953757225433526
7   0.9556840077071291
11   0.9653179190751445
14   0.9672447013487476
20   0.9730250481695568
**************************************************
2   0.9267822736030829
5   0.953757225433526
7   0.9556840077071291
11   0.9653179190751445
14   0.9672447013487476
20   0.9730250481695568
**************************************************
2   0.9267822736030829
5   0.953757225433526
7   0.9556840077071291
11   0.9653179190751445
14   0.9672447013487476
20   0.9730250481695568
**************************************************
2   0.9267822736030829
5   0.953757225433526
7   0.9556840077071291
11   0.9653179190751445
14   0.9672447013487476
20   0.9730250481695568
**************************************************
2   0.9267822736030829
5   0.953757225433526
