In [18]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [21]:
# Load the glass dataset and split it into a feature frame and an encoded target.
# The CSV location can be overridden with the GLASS_CSV environment variable so the
# notebook is not tied to a single machine; the original absolute path stays the
# default, so behaviour is unchanged when the variable is not set.
GLASS_CSV = os.environ.get(
    "GLASS_CSV",
    "C:/Users/Administrator.DAI-PC2/Desktop/ML/Day1/Glass.csv",
)
df = pd.read_csv(GLASS_CSV)

# Encode the string labels in the "Type" column as integer codes. The fitted
# encoder `le` is kept so integer predictions can be mapped back to names later.
le = LabelEncoder()
y = le.fit_transform(df["Type"])

# Every column except the target is used as a feature.
X = df.drop(columns="Type")

# Show the class names in encoded order (position i is the name for code i).
le.classes_

array(['building_windows_float_processed',
       'building_windows_non_float_processed', 'containers', 'headlamps',
       'tableware', 'vehicle_windows_float_processed'], dtype=object)

In [14]:
# Baseline: a KNeighborsClassifier with library-default settings, evaluated on a
# single 70/30 hold-out split. `knn` is also reused as the estimator in the
# tuning pipeline further down.
knn = KNeighborsClassifier()

# NOTE(review): this split is not stratified, while the tuning cell uses
# StratifiedKFold. If the classes are imbalanced, consider `stratify=y` here --
# not changed because it would alter the reported hold-out metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=24
)

knn.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = knn.predict(X_test)
# Per-class membership probabilities for the same rows; not consumed in this cell.
y_pred_prob = knn.predict_proba(X_test)

# Hold-out evaluation: confusion matrix, per-class report, overall accuracy.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[13  2  0  0  0  1]
 [ 7 10  1  0  1  0]
 [ 0  3  4  0  0  0]
 [ 1  1  1  9  0  0]
 [ 0  1  1  0  1  0]
 [ 7  1  0  0  0  0]]
              precision    recall  f1-score   support

           0       0.46      0.81      0.59        16
           1       0.56      0.53      0.54        19
           2       0.57      0.57      0.57         7
           3       1.00      0.75      0.86        12
           4       0.50      0.33      0.40         3
           5       0.00      0.00      0.00         8

    accuracy                           0.57        65
   macro avg       0.52      0.50      0.49        65
weighted avg       0.55      0.57      0.54        65

0.5692307692307692


In [19]:
# Tune the neighbour count and the optional feature scaler for the KNN pipeline
# using 5-fold stratified cross-validation, scored by negative log loss
# (scores closer to zero are better).
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Scaler candidates for the grid; note that `std_mm` is the MinMaxScaler.
std_scaler = StandardScaler()
std_mm = MinMaxScaler()

# The 'SCL' step starts empty (None); the grid substitutes each scaler
# candidate, or leaves it as None for "no scaling".
pipe_std = Pipeline(steps=[("SCL", None), ("KNN", knn)])
params = {
    "KNN__n_neighbors": list(range(1, 11)),
    "SCL": [std_scaler, std_mm, None],
}

gcv = GridSearchCV(
    estimator=pipe_std,
    param_grid=params,
    cv=kfold,
    scoring="neg_log_loss",
)
gcv.fit(X, y)

# Best mean cross-validated score and the parameter combination that achieved it.
print(gcv.best_score_)
print(gcv.best_params_)

-2.1501310079676905
{'KNN__n_neighbors': 8, 'SCL': None}


In [20]:
# Score the unlabeled samples in tst_Glass.csv with the best pipeline found by
# the grid search (GridSearchCV refits it on all of X, y by default).
# NOTE(review): the names `tst_conc` and `pred_strength` do not describe glass
# data and look carried over from a different exercise; they are kept as-is
# because a later cell reads `pred_strength`.
tst_conc = pd.read_csv(filepath_or_buffer="tst_Glass.csv")
pred_strength = gcv.best_estimator_.predict(tst_conc)

# Integer class codes, one per row of the test file.
pred_strength

array([3, 1, 2, 3, 0, 3])

In [23]:
# Translate the predicted integer class codes back into the original glass-type
# names using the encoder that was fitted on the "Type" column.
pred_type = le.inverse_transform(y=pred_strength)
pred_type

array(['headlamps', 'building_windows_non_float_processed', 'containers',
       'headlamps', 'building_windows_float_processed', 'headlamps'],
      dtype=object)