## Classification model for Car evaluation

In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

In [14]:
train_data= pd.read_csv("Car_Condition_evaluation_dataset/training_data.csv")
test_data= pd.read_csv("Car_Condition_evaluation_dataset/testing_data.csv")

In [15]:
train_data.shape

(1554, 7)

In [16]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554 entries, 0 to 1553
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Buying_Cost          1554 non-null   object
 1   Maintainance_Cost    1554 non-null   object
 2   Number_of_doors      1554 non-null   object
 3   Number_of_Passenger  1554 non-null   object
 4   Luggage_Space        1554 non-null   object
 5   Safety_Features      1554 non-null   object
 6   How_is_the_deal      1554 non-null   object
dtypes: object(7)
memory usage: 85.1+ KB


In [17]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174 entries, 0 to 173
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Deal_num             174 non-null    int64 
 1   Buying_Cost          174 non-null    object
 2   Maintainance_Cost    174 non-null    object
 3   Number_of_doors      174 non-null    object
 4   Number_of_Passenger  174 non-null    object
 5   Luggage_Space        174 non-null    object
 6   Safety_Features      174 non-null    object
dtypes: int64(1), object(6)
memory usage: 9.6+ KB


In [18]:
test_data=test_data.set_index("Deal_num")

In [19]:
train_data[:5]

Unnamed: 0,Buying_Cost,Maintainance_Cost,Number_of_doors,Number_of_Passenger,Luggage_Space,Safety_Features,How_is_the_deal
0,vhigh,med,2,4,small,low,Bad_deal
1,vhigh,med,5more,4,small,low,Bad_deal
2,med,vhigh,5more,4,small,low,Bad_deal
3,high,high,3,2,med,med,Bad_deal
4,vhigh,vhigh,5more,4,small,med,Bad_deal


In [20]:
for i in train_data.columns:
    print(f"unique ordinals in column {i} are {train_data[i].unique()}")

unique ordinals in column Buying_Cost are ['vhigh' 'med' 'high' 'low']
unique ordinals in column Maintainance_Cost are ['med' 'vhigh' 'high' 'low']
unique ordinals in column Number_of_doors are ['2' '5more' '3' '4']
unique ordinals in column Number_of_Passenger are ['4' '2' 'more']
unique ordinals in column Luggage_Space are ['small' 'med' 'big']
unique ordinals in column Safety_Features are ['low' 'med' 'high']
unique ordinals in column How_is_the_deal are ['Bad_deal' 'Nice_deal']


In [21]:
from sklearn.preprocessing import OrdinalEncoder

x=OrdinalEncoder(categories=[["low","med","high","vhigh"],["low","med","high","vhigh"],
                 ["2","3","4","5more"],["2","4","more"],["small","med","big"],
                  ["low","med","high"]])
y=OrdinalEncoder(categories=[["Bad_deal","Nice_deal"]])

In [22]:
xtrain,ytrain = train_data.drop('How_is_the_deal', axis = 1), train_data[['How_is_the_deal']]

xtrain = pd.DataFrame(x.fit_transform(xtrain), columns = train_data.columns[:-1])

ytrain = pd.DataFrame(y.fit_transform(ytrain), columns = [train_data.columns[-1]])

In [23]:
xtest = pd.DataFrame(x.fit_transform(test_data), columns = test_data.columns)

test_labels= pd.read_csv("Car_Condition_evaluation_dataset/test_labels.csv")

test_labels=test_labels.set_index("Deal_num")

ytest=pd.DataFrame(y.fit_transform(test_labels), columns = test_labels.columns)

In [24]:
#Standard Normalization
xtrain=(xtrain-xtrain.mean())/(xtrain.std()+0.00000001)
xtest=(xtest-xtest.mean())/(xtest.std()+0.00000001)

## Base_model

In [25]:
Baseline_model=SVC(kernel="linear")
Baseline_model.fit(xtrain.values,ytrain.values.reshape(-1))
ypred_train=Baseline_model.predict(xtrain.values)
train_accuracy=np.mean(ypred_train==ytrain.values.reshape(-1))
print("Training accuracy is",train_accuracy)
ypred_test=Baseline_model.predict(xtest.values)
test_accuracy=np.mean(ypred_test==ytest.values.reshape(-1))
print("Test accuracy is",test_accuracy*100)

Training accuracy is 0.9851994851994852
Test accuracy is 93.10344827586206


In [26]:
#Check for the label count in the training data
class_info_train=train_data.pivot_table(index=["How_is_the_deal"],aggfunc="size")

class_info_train

How_is_the_deal
Bad_deal     1434
Nice_deal     120
dtype: int64

This Clearly shows the class imbalance in the training data but still we achived good test accuracy.

## SMOT

In [27]:
smot = SMOTE(sampling_strategy='minority',k_neighbors=5)
x_smot,y_smot=smot.fit_resample(xtrain.values,ytrain.values.reshape(-1))

In [28]:
print(f" Number of Traning data points before applying SMOT are {xtrain.shape[0]}\n\t\t\t\tAfter applying SMOT are {x_smot.shape[0]}")

 Number of Traning data points before applying SMOT are 1554
				After applying SMOT are 2868


## Final Model Using

In [29]:
Final_model=SVC(kernel="linear")
Final_model.fit(x_smot,y_smot)
ypred_train=Final_model.predict(xtrain.values)
train_accuracy=np.mean(ypred_train==ytrain.values.reshape(-1))
print("Training accuracy is",train_accuracy)
ypred_test=Final_model.predict(xtest.values)
test_accuracy=np.mean(ypred_test==ytest.values.reshape(-1))
print("Test accuracy is",test_accuracy*100)

Training accuracy is 0.9787644787644788
Test accuracy is 94.82758620689656
