In [1]:
import numpy as np
import pandas as pd

In [46]:
# Read the car-evaluation CSV into a DataFrame and preview its first rows.
DATA_PATH = '/content/car_evaluation.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1728, 7)

In [47]:
# Display the frame; pandas elides the middle rows of a long frame.
df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5,6,med,med,good
1724,low,low,5,6,med,high,vgood
1725,low,low,5,6,big,low,unacc
1726,low,low,5,6,big,med,good


In [48]:
# Frequency of each target class, to see how imbalanced the labels are.
df['outcome'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: outcome, dtype: int64

In [49]:
# Number of missing values in each column.
df.isna().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
outcome     0
dtype: int64

In [50]:
from sklearn.preprocessing import LabelEncoder
# Encoder instance that maps each distinct label to an integer code.
le = LabelEncoder()

In [51]:
# Encode each categorical feature as integer codes, keeping one fitted
# LabelEncoder per column. Re-fitting a single shared encoder retains only the
# last column's classes, which makes the earlier columns impossible to decode
# (inverse_transform) or to encode consistently for new raw observations.
# NOTE: LabelEncoder assigns codes in sorted label order (e.g. high=0, low=1,
# med=2, vhigh=3), so the codes do not follow the natural low < med < high rank.
feature_encoders = {}
for col in ['buying', 'maint', 'lug_boot', 'safety']:
    feature_encoders[col] = LabelEncoder()
    df[col] = feature_encoders[col].fit_transform(df[col])

In [52]:
# Preview the frame after encoding: the four encoded columns now hold integer codes.
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,3,3,2,2,2,1,unacc
1,3,3,2,2,2,2,unacc
2,3,3,2,2,2,0,unacc
3,3,3,2,2,1,1,unacc
4,3,3,2,2,1,2,unacc


In [53]:
# X and y
# Feature matrix: every column except the target, which is the last column.
X = df.drop(columns='outcome')
X.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3,3,2,2,2,1
1,3,3,2,2,2,2
2,3,3,2,2,2,0
3,3,3,2,2,1,1
4,3,3,2,2,1,2


In [57]:
# Last rows of the feature matrix.
X.tail()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
1723,1,1,5,6,1,2
1724,1,1,5,6,1,0
1725,1,1,5,6,0,1
1726,1,1,5,6,0,2
1727,1,1,5,6,0,0


In [54]:
# Target vector: the (still string-valued) outcome class of each row.
y = df['outcome']
y.head()

0    unacc
1    unacc
2    unacc
3    unacc
4    unacc
Name: outcome, dtype: object

In [55]:
# train test split
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run.
# NOTE(review): the target classes are heavily imbalanced (see the value_counts
# output above) and the split is not stratified, so minority-class proportions
# can differ between train and test — consider passing stratify=y.

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 47)

In [56]:
# Number of training labels.
y_train.shape

(1209,)

In [14]:
# Number of held-out test labels.
y_test.shape

(519,)

In [15]:
# Class distribution within the training split.
y_train.value_counts()

unacc    848
acc      264
good      50
vgood     47
Name: outcome, dtype: int64

In [16]:
# model building
# Baseline k-nearest-neighbours classifier using the library's default settings.
# NOTE(review): KNN is distance-based and the features are fed in unscaled —
# confirm that the differing column ranges are acceptable.

from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

In [17]:
# Fit the baseline model on the (imbalanced) training split.
model.fit(x_train,y_train)

In [18]:
# Baseline predictions for the held-out test rows.
y_pred = model.predict(x_test)

In [19]:
# Baseline predictions for the training rows, used to compare train vs. test accuracy.
y_pred_train  = model.predict(x_train)

In [20]:
# model evaluation

from sklearn.metrics import accuracy_score,classification_report

In [21]:
# Test-set accuracy of the baseline model.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8978805394990366

In [22]:
# Training-set accuracy of the baseline model.
accuracy_score(y_true=y_train, y_pred=y_pred_train)

0.9511993382961125

In [23]:
# Confusion matrix for the baseline model: rows are actual classes, columns are predicted classes.
pd.crosstab(y_test,y_pred)

col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,97,0,22,1
good,9,9,1,0
unacc,11,0,351,0
vgood,8,1,0,9


In [24]:
# recall of good: share of the actual 'good' test rows that were predicted 'good'.
# Computed from the predictions instead of hand-copied confusion-matrix counts,
# so the value stays correct whenever the data, split, or model changes.
actual_good = (y_test == 'good').to_numpy()
float((y_pred[actual_good] == 'good').mean())

0.47368421052631576

In [25]:
# recall of vgood: share of the actual 'vgood' test rows that were predicted 'vgood'.
# Computed from the predictions instead of hand-copied confusion-matrix counts,
# so the value stays correct whenever the data, split, or model changes.
actual_vgood = (y_test == 'vgood').to_numpy()
float((y_pred[actual_vgood] == 'vgood').mean())

0.5

In [27]:
# Per-class precision / recall / F1 for the baseline model on the test set.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         acc       0.78      0.81      0.79       120
        good       0.90      0.47      0.62        19
       unacc       0.94      0.97      0.95       362
       vgood       0.90      0.50      0.64        18

    accuracy                           0.90       519
   macro avg       0.88      0.69      0.75       519
weighted avg       0.90      0.90      0.89       519



In [45]:
# testing on the random observation
# The row is wrapped in a DataFrame carrying the training column names: the
# model was fitted on a DataFrame, and scikit-learn warns when predict() is then
# given unnamed input. Values are listed in the column order of x_train.
model.predict(pd.DataFrame([[1,2,1,2,2,1]], columns=x_train.columns))



array(['unacc'], dtype=object)

In [58]:
# Predict one hand-written observation. It is passed as a DataFrame with the
# training column names so the feature order is explicit and scikit-learn does
# not warn about missing feature names.
model.predict(pd.DataFrame([[1,1,5,6,0,2]], columns=x_train.columns))



array(['good'], dtype=object)

In [59]:
# Predict one hand-written observation. It is passed as a DataFrame with the
# training column names so the feature order is explicit and scikit-learn does
# not warn about missing feature names.
# NOTE(review): persons=1 and lug_boot=4 appear to fall outside the values seen
# in the displayed data (lug_boot codes shown are 0-2) — confirm this row is a
# meaningful input rather than a typo.
model.predict(pd.DataFrame([[1,3,3,1,4,0]], columns=x_train.columns))



array(['unacc'], dtype=object)

## Balancing technique

In [28]:
from imblearn.over_sampling import SMOTE

In [29]:
# Oversampler for the minority classes. random_state is pinned (to the same seed
# as the train/test split) so the synthetic samples — and every metric computed
# from them below — are reproducible across runs.
smote = SMOTE(random_state=47)

In [31]:
# Resample the training split only; the test split is left untouched.
x_train_smote,y_train_smote = smote.fit_resample(x_train,y_train)

In [32]:
# Class counts before resampling, for comparison with the resampled counts below.
y_train.value_counts()

unacc    848
acc      264
good      50
vgood     47
Name: outcome, dtype: int64

In [33]:
# Class counts after resampling.
y_train_smote.value_counts()

unacc    848
acc      848
good     848
vgood    848
Name: outcome, dtype: int64

In [34]:
# model building after balancing
# Second KNN classifier with the same default settings as the baseline, so the
# only difference between the two models is the training data.

model1 = KNeighborsClassifier()

In [35]:
# Fit the second model on the resampled training data.
model1.fit(x_train_smote,y_train_smote)

In [36]:
# Predictions of the balanced model on the original (non-resampled) test rows.
y_pred1 = model1.predict(x_test)

In [37]:
# Predictions of the balanced model on the resampled training rows.
y_pred1_train = model1.predict(x_train_smote)

In [38]:
# Test-set accuracy of the model trained on resampled data.
accuracy_score(y_true=y_test, y_pred=y_pred1)

0.8362235067437379

In [39]:
# Accuracy of the balanced model on the resampled training data.
accuracy_score(y_true=y_train_smote, y_pred=y_pred1_train)

0.9705188679245284

In [40]:
# Confusion matrix for the balanced model: rows are actual classes, columns are predicted classes.
pd.crosstab(y_test,y_pred1)

col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,101,8,8,3
good,4,14,1,0
unacc,50,5,307,0
vgood,6,0,0,12


In [41]:
# recall of good (balanced model): share of the actual 'good' test rows that
# were predicted 'good'. Computed from the predictions instead of hand-copied
# confusion-matrix counts, which go stale whenever the resampling or model changes.
actual_good = (y_test == 'good').to_numpy()
float((y_pred1[actual_good] == 'good').mean())

0.7368421052631579

In [42]:
# recall of vgood (balanced model): share of the actual 'vgood' test rows that
# were predicted 'vgood'. Computed from the predictions instead of hand-copied
# confusion-matrix counts, which go stale whenever the resampling or model changes.
actual_vgood = (y_test == 'vgood').to_numpy()
float((y_pred1[actual_vgood] == 'vgood').mean())

0.6666666666666666

In [43]:
# Per-class precision / recall / F1 for the balanced model on the test set.
print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

         acc       0.63      0.84      0.72       120
        good       0.52      0.74      0.61        19
       unacc       0.97      0.85      0.91       362
       vgood       0.80      0.67      0.73        18

    accuracy                           0.84       519
   macro avg       0.73      0.77      0.74       519
weighted avg       0.87      0.84      0.85       519



In [44]:
# Baseline model's report printed again, for side-by-side comparison with the balanced model's report above.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         acc       0.78      0.81      0.79       120
        good       0.90      0.47      0.62        19
       unacc       0.94      0.97      0.95       362
       vgood       0.90      0.50      0.64        18

    accuracy                           0.90       519
   macro avg       0.88      0.69      0.75       519
weighted avg       0.90      0.90      0.89       519

