In [None]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 100)

In [None]:
# Read the raw CSV into a DataFrame (path is relative to the notebook's working directory).
df = pd.read_csv('../input/pocker-hand.csv')
# Bare last expression: the notebook displays the (rows, columns) tuple of the full frame.
df.shape

#### The dataset is big, so let's work with a random subset of it

In [None]:
# Draw a random subset of at most 1000 rows to work with.
# The fixed seed makes a fresh "Restart & Run All" pick the same rows every time;
# min() avoids the ValueError that sample() raises when asked for more rows than exist.
df = df.sample(n=min(1000, len(df)), random_state=42)

In [None]:
# Preview the first rows of the sampled frame.
df.head()

In [None]:
# Target vector: the 'Class' column; feature matrix: every other column.
y = df['Class']
X = df.drop(columns=['Class'])

### Data preparation. Use label encoding for categorical ordinal features and one-hot encoding for nominal ones

#### First of all, let's check the features for NaN values

In [None]:
# Print the name of every feature column that contains at least one missing value
# (no output means there is nothing to impute).
for col, has_nan in X.isnull().any().items():
    if has_nan:
        print(col)

In [None]:
# One-hot encode the listed columns in a single call.
# Each source column is replaced by integer indicator columns named 'bin_<col>_<value>',
# appended after the untouched columns in the order given here.
one_hot_columns = ['V1', 'V3', 'V5', 'V7', 'V9']
X = pd.get_dummies(
    X,
    columns=one_hot_columns,
    prefix=['bin_' + name for name in one_hot_columns],
    dtype=np.int64,
)

In [None]:
# Show the feature-matrix dimensions after one-hot encoding...
print(X.shape)
# ...and preview its first rows.
X.head()

#### Wow, it looks much better! Let's also normalize it

In [None]:
# Min-max scale every feature column into [0, 1].
col_min = X.min()
col_range = X.max() - col_min
# A constant column has a zero range, and dividing by it would fill the column with NaN.
# Substituting 1 for the zero range maps such a column to all zeros instead.
X = (X - col_min) / col_range.replace(0, 1)

In [None]:
# Preview the first rows of the normalized feature matrix.
X.head()

#### Finally, do some fun things with hyperparameters

In [None]:
# add our super cool c++ lib to path
import os
import sys

# Directory that contains the `core` package. It can be overridden through the
# ML_COURSE_SHARED_DIR environment variable so the notebook also runs on other machines;
# the original author's location is kept as the default.
SHARED_LIB_DIR = os.environ.get(
    'ML_COURSE_SHARED_DIR',
    '/Users/konstantin.rybkin/projects/huawei-ml-course/shared',
)

# Guard the insert so re-running this cell does not pile up duplicate sys.path entries.
if SHARED_LIB_DIR not in sys.path:
    sys.path.insert(0, SHARED_LIB_DIR)

In [None]:
# from core.metrics import F1Score
from core.knn import Dataset, NonParametricRegressor

In [None]:
# Best hyper-parameter combination seen so far; overwritten by upd_bests()
# whenever a strictly higher accuracy is found.
best_metric = best_kernel = best_window_type = ""
best_window, best_accuracy = 0, 0


def find_acc(regressor, n, metric, kernel, window_type, h):
    """Return the leave-one-out accuracy for one hyper-parameter combination.

    Parameters
    ----------
    regressor : object
        Must expose ``loo_validate(i, metric, kernel, window_type, h)`` returning
        a ``(predicted, actual)`` pair for the sample with index ``i``.
    n : int
        Number of samples to validate; indices ``0 .. n-1`` are each left out once.
    metric, kernel, window_type, h
        Passed through unchanged to ``regressor.loo_validate``.

    Returns
    -------
    float
        Fraction of samples whose predicted label equals the actual one, or
        ``0.0`` when ``n`` is not positive (nothing to validate).
    """
    if n <= 0:
        # Nothing to validate; avoids a ZeroDivisionError for an empty dataset.
        return 0.0

    correct_count = 0
    for i in range(n):  # LOO
        predicted, actual = regressor.loo_validate(i, metric, kernel, window_type, h)
        # Both values are converted with int() (truncation) before being compared as labels.
        if int(predicted) == int(actual):
            correct_count += 1
    return correct_count / n

def upd_bests(regressor, n, metric, kernel, window_type, h):
    """Evaluate one hyper-parameter combination and record it if it is a new best.

    Computes the leave-one-out accuracy through ``find_acc`` and, only when it is
    strictly greater than the module-level ``best_accuracy``, overwrites the
    ``best_*`` globals with this combination and prints the new score.
    """
    global best_metric, best_kernel, best_window_type, best_window, best_accuracy

    candidate = find_acc(regressor, n, metric, kernel, window_type, h)
    if not candidate > best_accuracy:
        # Ties and worse results leave the current best untouched.
        return

    best_metric, best_kernel, best_window_type, best_window = metric, kernel, window_type, h
    best_accuracy = candidate

    print('new best result: {:.4f}'.format(best_accuracy))

In [None]:
# Search space: every distance metric is combined with every kernel.
metrics = ["manhattan", "euclidean", "chebyshev"]
kernels = [
    "uniform", "triangular", "epanechnikov", "quartic", "triweight",
    "tricube", "gaussian", "cosine", "logistic", "sigmoid",
]

# The library takes one plain array: the normalized features with the target as the last column.
normalized_df = X.assign(y=y.values)
lib_dataset = Dataset(normalized_df.values)
regressor = NonParametricRegressor(lib_dataset)

# Window sizes tried for each window type.
fixed_widths = [0.05, 0.1, 0.5, 1, 3, 5, 10]
neighbour_counts = [1, 5, 10]

for metric in metrics:
    for kernel in kernels:
        # fixed window
        print('searching best params for {}/{}/fixed...'.format(metric, kernel))
        for h in fixed_widths:
            upd_bests(regressor, lib_dataset.n(), metric, kernel, 'fixed', h)

        # variable window
        print('searching best params for {}/{}/variable...'.format(metric, kernel))
        for k in neighbour_counts:
            upd_bests(regressor, lib_dataset.n(), metric, kernel, 'variable', k)

        print('-----------------------')

In [None]:
# Display the winning combination found by the grid search and its leave-one-out accuracy.
best_metric, best_kernel, best_window_type, best_window, best_accuracy

#### Finally, let's measure how the accuracy depends on the window size and plot it

In [None]:
import plotly.graph_objects as go

# (window size, accuracy) pairs collected for each window type.
acc_fixed = []
acc_variable = []

# fixed window
# NOTE(review): the sweep starts at h == 0 — confirm the library accepts a zero-width window.
for h in range(50):
    print('current h:', h)
    fixed_acc = find_acc(regressor, lib_dataset.n(), 'manhattan', 'triweight', 'fixed', h)
    acc_fixed.append((h, fixed_acc))

# variable window
for k in range(1, 200, 4):
    print('current k:', k)
    variable_acc = find_acc(regressor, lib_dataset.n(), 'manhattan', 'triweight', 'variable', k)
    acc_variable.append((k, variable_acc))

In [None]:
# Split the (size, accuracy) pairs into x/y series for plotting.
fixed_sizes = [point[0] for point in acc_fixed]
fixed_scores = [point[1] for point in acc_fixed]
variable_sizes = [point[0] for point in acc_variable]
variable_scores = [point[1] for point in acc_variable]

# One figure per window type: accuracy as a function of the window parameter.
fig1 = go.Figure(data=go.Scatter(x=fixed_sizes, y=fixed_scores))
fig2 = go.Figure(data=go.Scatter(x=variable_sizes, y=variable_scores))

fig1.update_layout(title='Fixed window', xaxis_title='size', yaxis_title='accuracy')
fig2.update_layout(title='Variable window', xaxis_title='neighbors number', yaxis_title='accuracy')

fig1.show()
fig2.show()