In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.neighbors import KNeighborsClassifier # k nearest neighbours
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the training CSV and display the resulting frame.
train_csv_path = '../input/knn-01/train.csv'
data = pd.read_csv(train_csv_path)
data

In [None]:
# Read the test CSV (the rows predicted for the submission) and display it.
test_csv_path = '../input/knn-01/test.csv'
validate_data = pd.read_csv(test_csv_path)
validate_data

In [None]:
# Keep the test ids for the submission file. This has to run before
# preprocess_data, which drops the 'id' column from validate_data in place.
validate_data_ids = validate_data['id']

# Data review

In [None]:
# dtype of every column in the training frame
data.dtypes

In [None]:
# Per column: does it contain any missing values?
data.isna().any()

In [None]:
# Draw one histogram per listed column, each in its own figure.
numeric_columns = ['bone_length', 'rotting_flesh', 'hair_length', 'has_soul']
for column in numeric_columns:
    plt.hist(data[column])
    plt.title(column)
    plt.show()

In [None]:
# Correlation heatmap of the numeric columns (the 'id' column is excluded).
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts in drop(). Non-numeric columns are filtered out explicitly because
# DataFrame.corr() raises on string columns since pandas 2.0 instead of
# silently skipping them.
numeric_features = (
    data
    .drop(columns=['id'], errors='ignore')
    .select_dtypes(include='number')
)
sns.heatmap(numeric_features.corr())

In [None]:
# Number of training rows per value of 'type'.
sns.countplot(data=data, x='type')

In [None]:
# Rows per 'type', split by the 'color' column.
# NOTE(review): a list palette is presumably matched to the hue levels by
# position, not by name — confirm the order fits the values found in 'color'.
color_palette = ['yellow', 'green', 'black', 'grey', 'blue', 'red']
sns.countplot(data=data, x='type', hue='color', palette=color_palette)

In [None]:
# Pairwise plots of the training frame, coloured by 'type'.
sns.pairplot(data=data, hue='type')

# Data preprocessing

In [None]:
def preprocess_data(data, validate_data):
    """Drop the columns that are not used as model features, in place.

    Removes ``id`` and ``color`` from both frames. Labels that are already
    absent are skipped (``errors='ignore'``), so calling this again on frames
    that were already processed changes nothing.

    Args:
        data: Training DataFrame; modified in place.
        validate_data: Test DataFrame; modified in place.

    Returns:
        None. Both frames are mutated directly, so callers that ignore the
        return value keep working.
    """
    # 'color' is dropped rather than encoded; an earlier experiment that
    # mapped each colour category to a value in [0, 1] was abandoned.
    unused_columns = ['id', 'color']
    for frame in (data, validate_data):
        frame.drop(columns=unused_columns, errors='ignore', inplace=True)

In [None]:
# Strip the unused columns from both frames (they are modified in place).
preprocess_data(data=data, validate_data=validate_data)

In [None]:
# Column dtypes of the training frame after preprocessing
data.dtypes

In [None]:
# Display the training and test frames after preprocessing
data, validate_data

# Model

In [None]:
# Split the training frame into the feature matrix and the 'type' target.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer
# accepts in drop().
train_set_x, train_set_y = data.drop(columns='type'), data['type']

In [None]:
# Exhaustive search over the neighbour count (1..99) and the Minkowski
# power parameter p (1..9), scored by accuracy with 3-fold cross-validation.
knn_param_grid = {
    'n_neighbors': np.arange(1, 100),
    'p': np.arange(1, 10),
}
classifier = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_param_grid,
    scoring='accuracy',
    cv=3,
)

In [None]:
# Run the grid search on the training features and labels.
classifier.fit(X=train_set_x, y=train_set_y)

In [None]:
# Mean cross-validated score of every parameter combination, followed by
# the average and the best of those means.
cv_results = classifier.cv_results_
scores = cv_results['mean_test_score']
(scores, np.mean(scores), np.max(scores))

In [None]:
# Parameter combination selected by the grid search
classifier.best_params_

In [None]:
# Fraction of training rows predicted correctly. This is measured on the same
# rows the classifier was fitted on, so it is not a held-out estimate.
train_predictions = classifier.predict(train_set_x)
(train_predictions == train_set_y).mean()

In [None]:
# Predict 'type' for the preprocessed test frame.
submission = classifier.predict(X=validate_data)

In [None]:
# Pair each saved test id with its predicted type and write the CSV without
# the index column.
submission_frame = pd.DataFrame({'id': validate_data_ids, 'type': submission})
submission_frame.to_csv('submission.csv', index=False)