## Classification model for Car evaluation

In [1396]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier

In [1397]:
train_data= pd.read_csv("Car_Condition_evaluation_dataset/training_data.csv")
test_data= pd.read_csv("Car_Condition_evaluation_dataset/testing_data.csv")

In [1398]:
train_data.shape

(1554, 7)

In [1399]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554 entries, 0 to 1553
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Buying_Cost          1554 non-null   object
 1   Maintainance_Cost    1554 non-null   object
 2   Number_of_doors      1554 non-null   object
 3   Number_of_Passenger  1554 non-null   object
 4   Luggage_Space        1554 non-null   object
 5   Safety_Features      1554 non-null   object
 6   How_is_the_deal      1554 non-null   object
dtypes: object(7)
memory usage: 85.1+ KB


In [1400]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174 entries, 0 to 173
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Deal_num             174 non-null    int64 
 1   Buying_Cost          174 non-null    object
 2   Maintainance_Cost    174 non-null    object
 3   Number_of_doors      174 non-null    object
 4   Number_of_Passenger  174 non-null    object
 5   Luggage_Space        174 non-null    object
 6   Safety_Features      174 non-null    object
dtypes: int64(1), object(6)
memory usage: 9.6+ KB


In [1401]:
test_data=test_data.set_index("Deal_num")

In [1402]:
train_data[:5]

Unnamed: 0,Buying_Cost,Maintainance_Cost,Number_of_doors,Number_of_Passenger,Luggage_Space,Safety_Features,How_is_the_deal
0,vhigh,med,2,4,small,low,Bad_deal
1,vhigh,med,5more,4,small,low,Bad_deal
2,med,vhigh,5more,4,small,low,Bad_deal
3,high,high,3,2,med,med,Bad_deal
4,vhigh,vhigh,5more,4,small,med,Bad_deal


In [1403]:
for i in train_data.columns:
    print(f"unique ordinals in column {i} are {train_data[i].unique()}")

unique ordinals in column Buying_Cost are ['vhigh' 'med' 'high' 'low']
unique ordinals in column Maintainance_Cost are ['med' 'vhigh' 'high' 'low']
unique ordinals in column Number_of_doors are ['2' '5more' '3' '4']
unique ordinals in column Number_of_Passenger are ['4' '2' 'more']
unique ordinals in column Luggage_Space are ['small' 'med' 'big']
unique ordinals in column Safety_Features are ['low' 'med' 'high']
unique ordinals in column How_is_the_deal are ['Bad_deal' 'Nice_deal']


In [1404]:
test_data[:5]

Unnamed: 0_level_0,Buying_Cost,Maintainance_Cost,Number_of_doors,Number_of_Passenger,Luggage_Space,Safety_Features
Deal_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,low,low,5more,more,big,high
1,low,low,2,2,small,high
2,low,med,5more,4,med,low
3,low,low,4,2,big,low
4,low,low,4,2,med,high


In [1405]:
from sklearn.preprocessing import OrdinalEncoder

x=OrdinalEncoder(categories=[["low","med","high","vhigh"],["low","med","high","vhigh"],
                 ["2","3","4","5more"],["2","4","more"],["small","med","big"],
                  ["low","med","high"]])
y=OrdinalEncoder(categories=[["Bad_deal","Nice_deal"]])

In [1406]:
xtrain,ytrain = train_data.drop('How_is_the_deal', axis = 1), train_data[['How_is_the_deal']]

xtrain = pd.DataFrame(x.fit_transform(xtrain), columns = train_data.columns[:-1])

ytrain = pd.DataFrame(y.fit_transform(ytrain), columns = [train_data.columns[-1]])

In [1407]:
xtest = pd.DataFrame(x.fit_transform(test_data), columns = test_data.columns)

test_labels= pd.read_csv("Car_Condition_evaluation_dataset/test_labels.csv")

test_labels=test_labels.set_index("Deal_num")

ytest=pd.DataFrame(y.fit_transform(test_labels), columns = test_labels.columns)

## Baseline_model

In [1408]:
scaler1 = StandardScaler().fit(xtrain)

In [1409]:
#Standard Normalization
xtrain1=scaler1.transform(xtrain)
xtest1=scaler1.transform(xtest)

In [1410]:
Baseline_model=SVC(kernel="rbf")
Baseline_model.fit(xtrain1,ytrain.values.reshape(-1))
ypred_train=Baseline_model.predict(xtrain1)
train_accuracy=np.mean(ypred_train==ytrain.values.reshape(-1))
print("Training accuracy is",train_accuracy)
ypred_test=Baseline_model.predict(xtest1)
test_accuracy=np.mean(ypred_test==ytest.values.reshape(-1))
print("Test accuracy is",test_accuracy*100)

Training accuracy is 0.9974259974259975
Test accuracy is 89.65517241379311


In [1411]:
#Check for the class label count in the training data
class_info_train=train_data.pivot_table(index=["How_is_the_deal"],aggfunc="size")

class_info_train

How_is_the_deal
Bad_deal     1434
Nice_deal     120
dtype: int64

This Clearly shows the class imbalance in the training data but still we achived good test accuracy.

## Ensemble_model

In [1412]:
Ens_model=BaggingClassifier(base_estimator=SVC(),n_estimators=25,random_state=35)
Ens_model.fit(xtrain1,ytrain.values.reshape(-1))
ypred_train=Ens_model.predict(xtrain1)
train_accuracy=np.mean(ypred_train==ytrain.values.reshape(-1))
print("Training accuracy is",train_accuracy)
ypred_test=Ens_model.predict(xtest1)
test_accuracy=np.mean(ypred_test==ytest.values.reshape(-1))
print("Test accuracy is",test_accuracy*100)

Training accuracy is 0.9993564993564994
Test accuracy is 90.22988505747126


#### From this it is clear that there is no significant improvement in accuracy with ensemble model.Now we will try SMOT along with ensemble model

## SMOT

In [1413]:
smot = SMOTE(sampling_strategy='minority',k_neighbors=3)
x_smot,y_smot=smot.fit_resample(xtrain.values,ytrain.values.reshape(-1))

In [1414]:
scaler2 = StandardScaler().fit(x_smot)
x_smot=scaler2.transform(x_smot)
xtest=scaler2.transform(xtest.values)

In [1415]:
print(f" Number of Traning data points before applying SMOT are {xtrain.shape[0]}\n\t\t\t\tAfter applying SMOT are {x_smot.shape[0]}")

 Number of Traning data points before applying SMOT are 1554
				After applying SMOT are 2868


## Final Model Using ensemble model and Data augmentation technique SMOT

In [1416]:
Final_model=BaggingClassifier(n_estimators=20,random_state=30)
Final_model.fit(x_smot,y_smot)
ypred_train=Final_model.predict(x_smot)
train_accuracy=np.mean(ypred_train==y_smot)
print("Training accuracy is",train_accuracy)
ypred_test=Final_model.predict(xtest)
test_accuracy=np.mean(ypred_test==ytest.values.reshape(-1))
print("Test accuracy is",test_accuracy*100)

Training accuracy is 1.0
Test accuracy is 95.40229885057471


### From this we can conclude that SMOT improved the test accuracy by a significant amount