### Preparations

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
raw_data = pd.read_csv('/Users/maria/Desktop/vehicle.csv')
raw_data.head()

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,SCALED_VARIANCE_MAJOR,SCALED_VARIANCE_MINOR,SCALED_RADIUS_OF_GYRATION,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,van
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,van
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,saab
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,van
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,bus


In [3]:
raw_data.isnull().sum(axis = 0)

COMPACTNESS                  0
CIRCULARITY                  0
DISTANCE_CIRCULARITY         0
RADIUS_RATIO                 0
PR.AXIS_ASPECT_RATIO         0
MAX.LENGTH_ASPECT_RATIO      0
SCATTER_RATIO                0
ELONGATEDNESS                0
PR.AXIS_RECTANGULARITY       0
MAX.LENGTH_RECTANGULARITY    0
SCALED_VARIANCE_MAJOR        0
SCALED_VARIANCE_MINOR        0
SCALED_RADIUS_OF_GYRATION    0
SKEWNESS_ABOUT_MAJOR         0
SKEWNESS_ABOUT_MINOR         0
KURTOSIS_ABOUT_MAJOR         0
KURTOSIS_ABOUT_MINOR         0
HOLLOWS_RATIO                0
Class                        0
dtype: int64

In [4]:
raw_data.shape

(846, 19)

In [5]:
raw_data.dtypes

COMPACTNESS                   int64
CIRCULARITY                   int64
DISTANCE_CIRCULARITY          int64
RADIUS_RATIO                  int64
PR.AXIS_ASPECT_RATIO          int64
MAX.LENGTH_ASPECT_RATIO       int64
SCATTER_RATIO                 int64
ELONGATEDNESS                 int64
PR.AXIS_RECTANGULARITY        int64
MAX.LENGTH_RECTANGULARITY     int64
SCALED_VARIANCE_MAJOR         int64
SCALED_VARIANCE_MINOR         int64
SCALED_RADIUS_OF_GYRATION     int64
SKEWNESS_ABOUT_MAJOR          int64
SKEWNESS_ABOUT_MINOR          int64
KURTOSIS_ABOUT_MAJOR          int64
KURTOSIS_ABOUT_MINOR          int64
HOLLOWS_RATIO                 int64
Class                        object
dtype: object

In [6]:
raw_data['Class'].nunique()

4

### Naive way

Each class value is represented by a single number. During the prediction, the response received is rounded to the nearest integer

In [7]:
n_data = raw_data
n_data['Class'] = pd.factorize(n_data.Class)[0]
n_data.head()

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,SCALED_VARIANCE_MAJOR,SCALED_VARIANCE_MINOR,SCALED_RADIUS_OF_GYRATION,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,0
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,0
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,1
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,0
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,2


In [8]:
X = pd.DataFrame(n_data.drop(['Class'], axis=1))
y = pd.DataFrame(n_data['Class'])
reg = LinearRegression().fit(X, y)
predictions = reg.predict(X)
predictions = [int(value) for value in predictions]
predictions

[0,
 0,
 1,
 0,
 0,
 2,
 1,
 0,
 0,
 2,
 0,
 1,
 1,
 0,
 2,
 1,
 0,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 2,
 2,
 0,
 1,
 1,
 1,
 2,
 0,
 1,
 0,
 1,
 2,
 2,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 2,
 1,
 2,
 1,
 3,
 0,
 2,
 0,
 1,
 0,
 1,
 2,
 0,
 1,
 2,
 1,
 1,
 2,
 0,
 2,
 1,
 2,
 1,
 1,
 0,
 2,
 2,
 1,
 2,
 0,
 1,
 2,
 0,
 1,
 1,
 2,
 0,
 0,
 2,
 1,
 1,
 1,
 1,
 0,
 2,
 2,
 1,
 2,
 2,
 1,
 0,
 0,
 0,
 0,
 2,
 1,
 2,
 1,
 2,
 2,
 0,
 0,
 2,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 2,
 0,
 2,
 1,
 1,
 1,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 2,
 1,
 1,
 1,
 2,
 2,
 0,
 1,
 2,
 0,
 1,
 2,
 1,
 1,
 0,
 2,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 0,
 1,
 2,
 0,
 3,
 0,
 1,
 3,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 2,
 2,
 1,
 1,
 2,
 2,
 0,
 1,
 2,
 1,
 2,
 2,
 2,
 2,
 1,
 0,
 2,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 0,
 2,
 0,
 2,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 2,
 0,
 0,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 2,
 0,
 0,
 1,
 2,
 2,
 0,


In [9]:
set(predictions)

{-1, 0, 1, 2, 3}

In [10]:
#compare = pd.DataFrame(predictions, n_data['Class']).T
n_data['class_pred'] = predictions
n_data

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,SCALED_VARIANCE_MAJOR,SCALED_VARIANCE_MINOR,SCALED_RADIUS_OF_GYRATION,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class,class_pred
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,0,0
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,0,0
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,1,1
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,0,0
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,93,39,87,183,64,8,169,40,20,134,200,422,149,72,7,25,188,195,1,1
842,89,46,84,163,66,11,159,43,20,159,173,368,176,72,1,20,186,197,0,0
843,106,54,101,222,67,12,222,30,25,173,228,721,200,70,3,4,187,201,1,1
844,86,36,78,146,58,7,135,50,18,124,155,270,148,66,0,25,190,195,1,1


In [11]:
n_data.loc[n_data['class_pred'] == -1]


Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,SCALED_VARIANCE_MAJOR,SCALED_VARIANCE_MINOR,SCALED_RADIUS_OF_GYRATION,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class,class_pred
388,94,47,85,333,138,49,155,43,19,155,320,354,187,135,12,9,188,196,0,-1


In [12]:
n_data = n_data.drop(388)

In [13]:
n_data.loc[n_data['class_pred'] == -1]

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,SCALED_VARIANCE_MAJOR,SCALED_VARIANCE_MINOR,SCALED_RADIUS_OF_GYRATION,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class,class_pred


### OneHot encoding

In [14]:
raw_data = pd.read_csv('/Users/maria/Desktop/vehicle.csv')

In [15]:
o_data = raw_data
o_data.head()

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,SCALED_VARIANCE_MAJOR,SCALED_VARIANCE_MINOR,SCALED_RADIUS_OF_GYRATION,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,van
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,van
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,saab
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,van
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,bus


In [16]:
o_data.Class.value_counts()

bus     218
saab    217
opel    212
van     199
Name: Class, dtype: int64

In [17]:
dum_data = pd.get_dummies(o_data)
dum_data

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,...,SCALED_RADIUS_OF_GYRATION,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class_bus,Class_opel,Class_saab,Class_van
0,95,48,83,178,72,10,162,42,20,159,...,184,70,6,16,187,197,0,0,0,1
1,91,41,84,141,57,9,149,45,19,143,...,158,72,9,14,189,199,0,0,0,1
2,104,50,106,209,66,10,207,32,23,158,...,220,73,14,9,188,196,0,0,1,0
3,93,41,82,159,63,9,144,46,19,143,...,127,63,6,10,199,207,0,0,0,1
4,85,44,70,205,103,52,149,45,19,144,...,188,127,9,11,180,183,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,93,39,87,183,64,8,169,40,20,134,...,149,72,7,25,188,195,0,0,1,0
842,89,46,84,163,66,11,159,43,20,159,...,176,72,1,20,186,197,0,0,0,1
843,106,54,101,222,67,12,222,30,25,173,...,200,70,3,4,187,201,0,0,1,0
844,86,36,78,146,58,7,135,50,18,124,...,148,66,0,25,190,195,0,0,1,0


In [18]:
X = pd.DataFrame(dum_data.drop(['Class_bus', 'Class_opel', 'Class_saab', 'Class_van'], axis=1))
y = pd.DataFrame(dum_data[['Class_bus', 'Class_opel', 'Class_saab', 'Class_van']])
reg = LinearRegression().fit(X, y)
predictions = reg.predict(X)
o_data[['bus_pred', 'opel_pred', 'saab_pred', 'van_pred']] = predictions
o_data

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,...,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class,bus_pred,opel_pred,saab_pred,van_pred
0,95,48,83,178,72,10,162,42,20,159,...,70,6,16,187,197,van,0.384677,-0.091274,0.161243,0.545353
1,91,41,84,141,57,9,149,45,19,143,...,72,9,14,189,199,van,-0.046580,0.030922,0.251068,0.764590
2,104,50,106,209,66,10,207,32,23,158,...,73,14,9,188,196,saab,0.025550,0.225139,0.653054,0.096256
3,93,41,82,159,63,9,144,46,19,143,...,63,6,10,199,207,van,0.119273,0.164433,-0.087610,0.803904
4,85,44,70,205,103,52,149,45,19,144,...,127,9,11,180,183,bus,1.046070,-0.349315,-0.369872,0.673117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,93,39,87,183,64,8,169,40,20,134,...,72,7,25,188,195,saab,0.069613,0.363942,0.601342,-0.034897
842,89,46,84,163,66,11,159,43,20,159,...,72,1,20,186,197,van,0.171449,0.038881,0.041399,0.748271
843,106,54,101,222,67,12,222,30,25,173,...,70,3,4,187,201,saab,-0.285190,0.574622,0.586232,0.124336
844,86,36,78,146,58,7,135,50,18,124,...,66,0,25,190,195,saab,-0.007107,0.268025,0.385209,0.353873


### Hyperparameter tuning 

#### Naive prep

In [19]:
n_data.head()

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,SCALED_VARIANCE_MAJOR,SCALED_VARIANCE_MINOR,SCALED_RADIUS_OF_GYRATION,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class,class_pred
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,0,0
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,0,0
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,1,1
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,0,0
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,2,0


Check default accuracity

In [20]:
knn = KNeighborsClassifier()

#resgression knearest regression (param)

x = n_data.drop(columns=['Class', 'class_pred'])
y = n_data['class_pred']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
# random_state=10
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)

print(classification_report(y_test, y_pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84        63
           1       0.67      0.69      0.68       138
           2       0.55      0.52      0.53        77
           3       0.00      0.00      0.00         1

    accuracy                           0.68       279
   macro avg       0.51      0.52      0.51       279
weighted avg       0.67      0.68      0.67       279



The accuracy ~ 0.68

#### Naive tuning 

In [21]:
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
p=[1,2]
n_neighbors = list(range(1,50))
#add kernel with regression

In [22]:
hyperparameters = dict(algorithm=algorithm, p=p, n_neighbors=n_neighbors)
knn_naive = KNeighborsClassifier()

In [23]:
clf = GridSearchCV(knn_naive, hyperparameters, cv=10)
best_model = clf.fit(x_test,y_test)
print('P:', best_model.best_estimator_.get_params()['p'])
print('n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])
print('algorithm:', best_model.best_estimator_.get_params()['algorithm'])



P: 1
n_neighbors: 13
algorithm: auto


Checking the tunings


In [24]:
knn_naive_check = KNeighborsClassifier(n_neighbors=13, p=1, algorithm = 'auto')

knn_naive_check.fit(x_train, y_train)
y_pred = knn_naive_check.predict(x_test)

print(classification_report(y_test, y_pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.80      0.89      0.84        63
           1       0.69      0.72      0.71       138
           2       0.60      0.51      0.55        77
           3       0.00      0.00      0.00         1

    accuracy                           0.70       279
   macro avg       0.52      0.53      0.53       279
weighted avg       0.69      0.70      0.69       279



The accuracy increased to 0.70

#### OneHot prep

In [54]:
o_data.head()

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,...,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class,bus_pred,opel_pred,saab_pred,van_pred
0,95,48,83,178,72,10,162,42,20,159,...,70,6,16,187,197,van,0.0,0.0,0.0,1.0
1,91,41,84,141,57,9,149,45,19,143,...,72,9,14,189,199,van,0.0,0.0,0.0,1.0
2,104,50,106,209,66,10,207,32,23,158,...,73,14,9,188,196,saab,0.0,0.0,1.0,0.0
3,93,41,82,159,63,9,144,46,19,143,...,63,6,10,199,207,van,0.0,0.0,0.0,1.0
4,85,44,70,205,103,52,149,45,19,144,...,127,9,11,180,183,bus,1.0,0.0,0.0,1.0


In [55]:
o_data.bus_pred = abs(round(o_data.bus_pred))
o_data.opel_pred = abs(round(o_data.opel_pred))
o_data.saab_pred = abs(round(o_data.saab_pred))
o_data.van_pred = abs(round(o_data.van_pred))
o_data

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,...,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class,bus_pred,opel_pred,saab_pred,van_pred
0,95,48,83,178,72,10,162,42,20,159,...,70,6,16,187,197,van,0.0,0.0,0.0,1.0
1,91,41,84,141,57,9,149,45,19,143,...,72,9,14,189,199,van,0.0,0.0,0.0,1.0
2,104,50,106,209,66,10,207,32,23,158,...,73,14,9,188,196,saab,0.0,0.0,1.0,0.0
3,93,41,82,159,63,9,144,46,19,143,...,63,6,10,199,207,van,0.0,0.0,0.0,1.0
4,85,44,70,205,103,52,149,45,19,144,...,127,9,11,180,183,bus,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,93,39,87,183,64,8,169,40,20,134,...,72,7,25,188,195,saab,0.0,0.0,1.0,0.0
842,89,46,84,163,66,11,159,43,20,159,...,72,1,20,186,197,van,0.0,0.0,0.0,1.0
843,106,54,101,222,67,12,222,30,25,173,...,70,3,4,187,201,saab,0.0,1.0,1.0,0.0
844,86,36,78,146,58,7,135,50,18,124,...,66,0,25,190,195,saab,0.0,0.0,0.0,0.0


In [56]:
knn = KNeighborsClassifier()

x = o_data.drop(columns=['Class', 'bus_pred', 'opel_pred', 'saab_pred', 'van_pred'])
y = o_data[['bus_pred', 'opel_pred', 'saab_pred', 'van_pred']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)
#random_state=42

In [57]:
knn.fit(x_train, y_train)


KNeighborsClassifier()

In [63]:
y_pred = knn.predict(x_test)
y_test_check = y_test.values
print(classification_report(y_test, y_pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.87      0.66      0.75        41
           1       0.65      0.33      0.44        33
           2       0.70      0.55      0.62        29
           3       0.74      0.89      0.81        38

   micro avg       0.75      0.62      0.68       141
   macro avg       0.74      0.61      0.65       141
weighted avg       0.75      0.62      0.67       141
 samples avg       0.50      0.49      0.49       141



accuracy ~0.68

#### OneHot Tuning

In [59]:
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
p=[1,2]
n_neighbors = list(range(1,50))

In [60]:
hyperparameters = dict(algorithm=algorithm, p=p, n_neighbors=n_neighbors)
knn_onehot = KNeighborsClassifier()

In [61]:
clf = GridSearchCV(knn_onehot, hyperparameters, cv=10)
best_model = clf.fit(x_test,y_test)
print('P:', best_model.best_estimator_.get_params()['p'])
print('n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])
print('algorithm:', best_model.best_estimator_.get_params()['algorithm'])

P: 1
n_neighbors: 3
algorithm: ball_tree


In [62]:
knn_onehot_check = KNeighborsClassifier(n_neighbors=3, p=1, algorithm = 'ball_tree')

knn_onehot_check.fit(x_train, y_train)
y_pred = knn_onehot_check.predict(x_test)

print(classification_report(y_test, y_pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.91      0.78      0.84        41
           1       0.65      0.39      0.49        33
           2       0.60      0.62      0.61        29
           3       0.77      0.89      0.83        38

   micro avg       0.75      0.69      0.72       141
   macro avg       0.73      0.67      0.69       141
weighted avg       0.75      0.69      0.71       141
 samples avg       0.55      0.53      0.54       141



accuracy ~0.72