In [2]:
import pandas as pd
import numpy as np

col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

dataset = pd.read_csv('car_evaluation.csv', names=col_names)
X=dataset.iloc[:,:-1].values
Y= dataset.iloc[:,-1].values
dataset.head()


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
feature_names = list(dataset.columns.values[:-1])
for feature in feature_names:
    unique_count = dataset[feature].nunique()
    unique_vals = dataset[feature].unique()
    print("{}: {} values, {}".format(feature, unique_count, unique_vals))

buying: 4 values, ['vhigh' 'high' 'med' 'low']
maint: 4 values, ['vhigh' 'high' 'med' 'low']
doors: 4 values, ['2' '3' '4' '5more']
persons: 3 values, ['2' '4' 'more']
lug_boot: 3 values, ['small' 'med' 'big']
safety: 3 values, ['low' 'med' 'high']


In [4]:
label_name = dataset.columns.values[-1]
print("{}: {} values, {}".format(label_name, dataset[label_name].nunique(), dataset[label_name].unique()))
dataset[label_name].value_counts()


class: 4 values, ['unacc' 'acc' 'vgood' 'good']


unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64

In [5]:
data_encoded = pd.get_dummies(dataset, columns=feature_names, drop_first=True)
data_encoded.tail()


Unnamed: 0,class,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med
1723,good,1,0,0,1,0,0,0,0,1,0,1,1,0,0,1
1724,vgood,1,0,0,1,0,0,0,0,1,0,1,1,0,0,0
1725,unacc,1,0,0,1,0,0,0,0,1,0,1,0,0,1,0
1726,good,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1
1727,vgood,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0


In [6]:
data_encoded['class'], class_uniques = pd.factorize(data_encoded['class'])
data_encoded.tail()

Unnamed: 0,class,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med
1723,3,1,0,0,1,0,0,0,0,1,0,1,1,0,0,1
1724,2,1,0,0,1,0,0,0,0,1,0,1,1,0,0,0
1725,0,1,0,0,1,0,0,0,0,1,0,1,0,0,1,0
1726,3,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1
1727,2,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0


In [7]:
# Save the class column as a Pandas Series.
class_col = data_encoded['class']
# Remove class column from DataFrame.
data_encoded.drop(columns=['class'], inplace=True)
# Get the position of where the last column will be inserted.
last_col_pos = data_encoded.columns.get_loc('safety_med') + 1
# Insert the class column back into the DataFrame.
data_encoded.insert(last_col_pos, 'class', class_col)
data_encoded.tail()

Unnamed: 0,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med,class
1723,1,0,0,1,0,0,0,0,1,0,1,1,0,0,1,3
1724,1,0,0,1,0,0,0,0,1,0,1,1,0,0,0,2
1725,1,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0
1726,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,3
1727,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,2


In [8]:
from sklearn.model_selection import train_test_split
# Features.
X = data_encoded.loc[:, 'buying_low':'safety_med']
# Labels.
y = data_encoded.loc[:, 'class']

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)



In [11]:
from sklearn.svm import SVC
from time import time

t7=time()
svc = SVC()
clf_svc=svc.fit(X_train, Y_train)
print("Acurracy: ", clf_svc.score(X_test,Y_test))
t8=time()
print("time elapsed: ", t8-t7)


Acurracy:  0.9595375722543352
time elapsed:  0.1834723949432373


In [16]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score , classification_report

svc = SVC()

parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

grid = GridSearchCV(svc, parameters, n_jobs=-1, verbose=1, scoring='accuracy')


grid.fit(X_train, Y_train)

print('Best score: %0.3f' % grid.best_score_)

print('Best parameters set:')
best_parameters = grid.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

predictions = grid.predict(X_test)
print(classification_report(Y_test, predictions))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best score: 0.986
Best parameters set:
	C: 10
	kernel: 'rbf'
              precision    recall  f1-score   support

         acc       0.99      0.97      0.98        79
        good       0.88      0.88      0.88        17
       unacc       1.00      1.00      1.00       240
       vgood       0.91      1.00      0.95        10

    accuracy                           0.99       346
   macro avg       0.94      0.96      0.95       346
weighted avg       0.99      0.99      0.99       346

