In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

import os
import sys
# Put the directory two levels above the current working directory on the
# import path. os.path.abspath('') resolves against the process's working
# directory, so this only works when the notebook is run from its own folder.
# NOTE(review): presumably that directory is the repository root holding
# the `src` package -- confirm.
sys.path.append(os.path.join(
  os.path.abspath(''), '..', '..')
)

# Project-local import; it comes after the sys.path.append call above and
# relies on that entry unless `src` is importable some other way.
from src.preprocess import Preprocess

import warnings
# No category or module filter is given, so every warning raised from here on
# is suppressed, including any emitted by pandas or scikit-learn below.
warnings.filterwarnings('ignore')

In [None]:
# --- Load raw data ----------------------------------------------------------
# The first CSV column becomes the index of each frame.
x_train: pd.DataFrame = pd.read_csv('../../data/train.csv', index_col=0)
x_test: pd.DataFrame = pd.read_csv('../../data/test.csv', index_col=0)

# Labels are taken from the raw training frame, before preprocessing.
y_train: pd.Series = x_train['Survived']

# --- Preprocess -------------------------------------------------------------
# NOTE(review): x_train still contains the 'Survived' column when it is handed
# to Preprocess; presumably Preprocess.apply removes it -- confirm, otherwise
# the target leaks into the features used for fitting.
x_train, x_test = (
  Preprocess(scaler=StandardScaler())
  .apply(x_train, x_test)
)

# --- Grid search over k -----------------------------------------------------
# Candidate neighbor counts: the odd values 1, 3, ..., 19.
params = {'n_neighbors': list(range(1, 21, 2))}

base_knn = KNeighborsClassifier()
knn = GridSearchCV(
  estimator=base_knn,
  param_grid=params,
  cv=5,
  scoring='balanced_accuracy',
  # Train scores are requested so they can be compared with the test scores.
  return_train_score=True,
  n_jobs=-1,
)
knn.fit(x_train, y_train)

# --- Export the per-candidate summary ---------------------------------------
# Keep only the columns needed for the train/test comparison.
result_columns = [
  'param_n_neighbors',
  'mean_test_score',
  'rank_test_score',
  'mean_train_score',
]
results = pd.DataFrame(knn.cv_results_)[result_columns]
results.to_csv('knn_results.csv', index=False)

In [None]:
# Draw the mean cross-validated test and train scores against the number of
# neighbors on the same axes, so the gap between the two curves is visible.
neighbor_counts = results['param_n_neighbors']
for score_column, curve_label in (
  ('mean_test_score', 'test'),
  ('mean_train_score', 'train'),
):
  plt.plot(neighbor_counts, results[score_column], label=curve_label)

plt.title('Neighbors Overfit Analysis')
plt.xlabel('Neighbors')
plt.ylabel('Balanced Accuracy')
plt.legend()

# Write the current figure to disk as SVG.
plt.savefig('neighbors.svg', format='svg')