In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn import set_config
set_config(display = 'diagram')

In [17]:
df = pd.read_csv('car_evaluation.csv')

In [18]:
df

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good


In [19]:
#Attributes:buying: vhigh, high, med, low.
#maint: vhigh, high, med, low.
#doors: 2, 3, 4, 5more.
#persons: 2, 4, more.
#lug_boot: small, med, big.
#safety: low, med, high.
#Class Values:unacc, acc, good, vgood   

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1727 entries, 0 to 1726
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   vhigh    1727 non-null   object
 1   vhigh.1  1727 non-null   object
 2   2        1727 non-null   object
 3   2.1      1727 non-null   object
 4   small    1727 non-null   object
 5   low      1727 non-null   object
 6   unacc    1727 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [21]:
df.isnull().sum()

vhigh      0
vhigh.1    0
2          0
2.1        0
small      0
low        0
unacc      0
dtype: int64

In [22]:
X = df.drop(columns= ['unacc'])
y = df['unacc']

In [23]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [24]:
numeric_feature = X_train.select_dtypes('number').columns
categorical_features = X_train.select_dtypes('object').columns

In [25]:
numeric_transformer = make_pipeline(SimpleImputer(strategy = 'median'), StandardScaler())
categorical_transformer = make_pipeline(SimpleImputer(strategy = 'constant', fill_value = "missing"), OneHotEncoder())

In [26]:
col_transformer = make_column_transformer(
  (numeric_transformer, numeric_feature), 
    (categorical_transformer, categorical_features),
remainder = 'passthrough')

In [27]:
col_transformer.fit(X_train)

In [28]:
pipe = make_pipeline(col_transformer, DecisionTreeClassifier(criterion = 'entropy', random_state = 0))

In [29]:
pipe.fit(X_train, y_train)

In [30]:
from sklearn.model_selection import cross_validate
with_categorical_score = cross_validate(pipe, X_train, y_train, return_train_score = True)

In [31]:
categorical_score = pd.DataFrame(with_categorical_score)
categorical_score

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.027724,0.0,0.971119,1.0
1,0.012812,0.004999,0.960145,1.0
2,0.014755,0.002999,0.956522,1.0
3,0.015623,0.0,0.981884,1.0
4,0.015625,0.0,0.974638,1.0


In [32]:
categorical_score.mean()

fit_time       0.017308
score_time     0.001600
test_score     0.968862
train_score    1.000000
dtype: float64

In [35]:
pipe.predict(X_test)

array(['acc', 'unacc', 'unacc', 'unacc', 'unacc', 'good', 'unacc',
       'unacc', 'vgood', 'acc', 'unacc', 'unacc', 'acc', 'unacc', 'acc',
       'acc', 'unacc', 'unacc', 'acc', 'unacc', 'unacc', 'unacc', 'acc',
       'acc', 'unacc', 'unacc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc',
       'unacc', 'unacc', 'unacc', 'good', 'acc', 'unacc', 'unacc',
       'unacc', 'unacc', 'acc', 'unacc', 'acc', 'unacc', 'unacc', 'unacc',
       'acc', 'acc', 'unacc', 'acc', 'acc', 'acc', 'unacc', 'acc',
       'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'unacc',
       'unacc', 'acc', 'unacc', 'acc', 'unacc', 'unacc', 'unacc', 'unacc',
       'unacc', 'unacc', 'unacc', 'unacc', 'unacc', 'good', 'vgood',
       'unacc', 'unacc', 'unacc', 'unacc', 'acc', 'acc', 'acc', 'unacc',
       'acc', 'acc', 'unacc', 'vgood', 'unacc', 'unacc', 'unacc', 'unacc',
       'unacc', 'unacc', 'unacc', 'vgood', 'acc', 'acc', 'unacc', 'unacc',
       'unacc', 'unacc', 'unacc', 'unacc', 'vgood', 'acc', 'unacc',


In [37]:
pipe.score(X_train, y_train)

1.0

In [34]:
pipe.score(X_test, y_test)

0.9826589595375722