In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Support Vector Machines - Nonlinear Classification
### with GridSearch

### All needed imports for this notebook

In [None]:
import pandas as pd
pd.options.display.max_colwidth = 80

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from sklearn.svm import SVC # SVM model with kernels
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

#### Fetching Data

In [None]:
# Location of the raw CSV inside the Kaggle input mount.
data = '/kaggle/input/car-evaluation-data-set/car_evaluation.csv'

# Column labels are supplied here because the file is read without a header row.
header_list = [
    'buying', 'maint', 'doors', 'persons',
    'lug_boot', 'safety', 'class value',
]

# header=None is what pandas already assumes when names= is given; spelled out for clarity.
cars = pd.read_csv(data, header=None, names=header_list)

### Exploring Data

In [None]:
# Show the first rows of the loaded frame (DataFrame.head defaults to 5 rows).
cars.head()

In [None]:
# Per-column summary statistics. For string columns pandas reports
# count / unique / top / freq -- presumably all seven columns are strings here;
# confirm with cars.info().
cars.describe()

In [None]:
# DataFrame.info() prints its report and returns None, so it is called as a
# plain statement; bundling it into a tuple would display a stray `None`.
cars.info()

# The shape is left as the last expression so it becomes the cell's output.
cars.shape

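#### **Distribution frequency of values in each variable.** Judging by the output, *stratified sampling* does not seem to be needed, since the values of each variable appear to be evenly distributed. Note that what matters most for this decision is the balance of the `class value` target: if its classes turn out to be imbalanced, a stratified split is the safer choice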

In [None]:
# Print the frequency table of every column, each followed by a blank line.
for name, values in cars.items():
    print(values.value_counts(), '\n')

#### I had an idea that the number of doors could somehow correlate with luggage capacity, but it seems that the *lug_boot* value does not depend on it

In [None]:
# Count how often each luggage-boot size occurs for each door count.
# One contingency table replaces four hand-written filters, so it covers every
# value present in the 'doors' column without hard-coding them, and puts the
# counts side by side: rows are door counts, columns are lug_boot sizes.
doors_vs_lug_boot = pd.crosstab(cars['doors'], cars['lug_boot'])

doors_vs_lug_boot

#### Feature and Target vectors

In [None]:
# Features: every column except the target label.
X = cars.drop(columns='class value')

# Target: the car's evaluation class.
y = cars['class value']

X, y

In [None]:
# Hold out 40% of the rows for testing, with a fixed seed for reproducibility.
# Stratifying on the target keeps the class proportions the same in both
# splits, so rare classes are not over- or under-represented in the test set.
# NOTE(review): the UCI description lists the classes as strongly imbalanced --
# confirm with cars['class value'].value_counts().
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

In [None]:
# (rows, columns) of the two feature splits; test_size=0.4 gives roughly a 60/40 row split.
X_train.shape, X_test.shape

In [None]:
# Lengths of the two target splits; they should match the row counts of X_train / X_test.
y_train.shape, y_test.shape

### Encoding
#### There are a limited number of possible values, each of which represents a category, which means that all the variables in dataset are of ordinal categorical data type. Most Machine Learning algorithms prefer to work with numbers, so let’s convert these categories from text to numbers. For this, we can use Scikit-Learn’s OrdinalEncoder class:

In [None]:
columns_encode = []
columns_encode.append(header_list)
columns_encode

In [None]:
# Level order per feature, from lowest to highest. Without an explicit
# `categories` argument OrdinalEncoder sorts the labels alphabetically
# (e.g. 'high' < 'low' < 'med' < 'vhigh'), which throws away the ordinal
# meaning the encoding is supposed to capture.
# NOTE(review): levels taken from the UCI Car Evaluation description -- confirm
# against the value_counts() output; an unlisted label makes fit() raise.
feature_levels = {
    'buying': ['low', 'med', 'high', 'vhigh'],
    'maint': ['low', 'med', 'high', 'vhigh'],
    'doors': ['2', '3', '4', '5more'],
    'persons': ['2', '4', 'more'],
    'lug_boot': ['small', 'med', 'big'],
    'safety': ['low', 'med', 'high'],
}

# `categories` must follow the column order of the frame being encoded.
ordinal_encoder = OrdinalEncoder(
    categories=[feature_levels[column] for column in X_train.columns]
)

# Fit on the training split only, then reuse the same mapping for the test
# split. The second positional argument of fit_transform is `y` (ignored by
# this encoder), so no column list is passed there.
X_train = ordinal_encoder.fit_transform(X_train)
X_test = ordinal_encoder.transform(X_test)

In [None]:
# Inspect the encoded training features together with their shape.
X_train, X_train.shape

In [None]:
# Training labels and their shape; the target is left as the original strings
# (only the feature matrix was passed through the encoder).
y_train, y_train.shape

#### Using GridSearch to find the best hyperparameters

In [None]:
# Candidate values shared by both kernels (C) and used by the RBF kernel only (gamma).
c_candidates = [3, 5, 7, 9, 10]
gamma_candidates = [2, 4, 6, 8]

# Two sub-grids: the polynomial kernel varies C alone, the RBF kernel varies C x gamma.
param_grid = [
    {'kernel': ['poly'], 'C': list(c_candidates)},
    {'kernel': ['rbf'], 'C': list(c_candidates), 'gamma': list(gamma_candidates)},
]

# Base estimator with default settings; kernel, C and gamma are supplied by the grid search.
svm = SVC()

In [None]:
# Exhaustive search over param_grid using GridSearchCV's default cross-validation
# and scoring; training-fold scores are also recorded in cv_results_.
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    return_train_score=True,
)

# Fit every candidate on the training split only.
grid_search.fit(X_train, y_train)

#### Estimated best hyperparameters for SVM

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

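#### GridSearch estimated the best model to be the one with the polynomial kernel. Note that the grid only varies `kernel`, `C` and `gamma`: the polynomial degree is never searched and stays at SVC's default of 3, so a value of 9 reported for this model is the regularization parameter `C`, not the degree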

In [None]:
# Estimator refit on the full training split with the best parameters
# (GridSearchCV refits by default).
grid_search.best_estimator_

In [None]:
# Predict the held-out rows with the refit best estimator.
svm_y_pred = grid_search.predict(X_test)

# Share of test rows whose predicted class matches the true class.
accuracy_score(y_true=y_test, y_pred=svm_y_pred)

In [None]:
# Predict the rows the model was trained on, for comparison with the test score.
svm_y_pred_train = grid_search.predict(X_train)

# Share of training rows whose predicted class matches the true class.
accuracy_score(y_true=y_train, y_pred=svm_y_pred_train)

### Accuracy on the training set is a little bit higher than on the test set, but the model is clearly not overfitting, so I guess the model did very well

#### Confusion Matrix

In [None]:
# Rows are the true classes and columns the predicted classes, in sorted label order.
confusion_matrix(y_true=y_test, y_pred=svm_y_pred)

In [None]:
# Per-class precision, recall, F1 and support on the test split.
test_report = classification_report(y_true=y_test, y_pred=svm_y_pred)
print(test_report)