In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# https://scikit-learn.org/stable/modules/multiclass.html
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import VotingClassifier

import matplotlib.pyplot as plt

# Fixed seed; passed to the Optuna RandomSampler further down so the search is repeatable
random_seed = 256

In [None]:
# Load the training data, using row_id as the index rather than as a column
train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-feb-2022/train.csv',
    index_col='row_id',
)
train.head()

In [None]:
# Check for missing values: per-column NaN counts summed into one total for the whole frame
train.isna().sum().sum()

In [None]:
# Check for duplicated rows: count the rows that DataFrame.duplicated() flags as repeats
train.duplicated().sum()

In [None]:
# Remove duplicated rows (the first occurrence is kept) and report the shape before and after
print('Initial train data shape:', train.shape)
train = train.drop_duplicates(keep='first')
print('Cleaned train data shape:', train.shape)

In [None]:
# Feature matrix: every column except the label, cast to float32
X_train = train.drop(columns="target").astype(np.float32)

# Integer-encode the labels; the fitted encoder is kept to decode predictions later
target_encoder = LabelEncoder()
# Reuse train's index so y_train stays aligned with X_train. After the duplicate
# rows are dropped the row_id index may have gaps, and a default RangeIndex would
# silently misalign any pandas operation that matches the two objects on index.
y_train = pd.Series(target_encoder.fit_transform(train["target"]), index=train.index)

In [None]:
# Check target categories: the label values the encoder learned from train["target"]
target_encoder.classes_

## Model Creation
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
# Hand-tuned baseline: 1-nearest-neighbour, every other hyperparameter left unset
params = dict(n_neighbors=1)

model = KNeighborsClassifier(n_jobs=-1, **params)

# scores (mean): 0.9551510765637913
# Public score: 0.98288

## Cross Validation
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html<br>
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html<br>
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter<br>

In [None]:
#cv = StratifiedKFold(n_splits=5)
#scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
#scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)

#scores.mean()

## Optuna optimization

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated accuracy of a KNN classifier.

    Tuned per trial:
        n_neighbors -- integer in [1, 2]
        leaf_size   -- integer in [2, 60]
    Held fixed: weights="distance", metric="manhattan".

    Reads the notebook-level X_train / y_train.

    Returns:
        The mean of the per-fold accuracy scores for the sampled parameters.

    NOTE(review): per the scikit-learn docs, leaf_size influences tree
    construction/query speed and memory; confirm whether tuning it can change
    the score at all.
    """
    # Sample in a fixed order (n_neighbors, then leaf_size) so a seeded sampler
    # produces the same sequence of trials.
    k = trial.suggest_int('n_neighbors', 1, 2)
    leaf = trial.suggest_int('leaf_size', 2, 60)

    knn = KNeighborsClassifier(
        n_neighbors=k,
        weights="distance",
        leaf_size=leaf,
        metric="manhattan",
        n_jobs=-1,
    )

    folds = StratifiedKFold(n_splits=5)
    fold_accuracy = cross_val_score(
        knn, X_train, y_train, cv=folds, scoring='accuracy', n_jobs=-1
    )
    return fold_accuracy.mean()

In [None]:
# Random-search study that maximizes the objective; the sampler is seeded for repeatability
sampler = optuna.samplers.RandomSampler(seed=random_seed)
study = optuna.create_study(direction="maximize", sampler=sampler)

# Keep running trials until the 10-hour time budget is used up
# (for a quick smoke run, replace the timeout with n_trials=5)
study.optimize(objective, timeout=10 * 3600)

In [None]:
# Report the best objective value found and the parameters that produced it
best_trial = study.best_trial
print('Best trial score:', best_trial.value)
best_trial.params

In [None]:
# Optimization history of the study's trials
optuna.visualization.plot_optimization_history(study).show()

In [None]:
# Importance of each tuned parameter, as estimated by Optuna from the study
optuna.visualization.plot_param_importances(study).show()

In [None]:
# Slice plot of the study's tuned parameters
optuna.visualization.plot_slice(study).show()

In [None]:
# Rebuild the classifier from the best trial: tuned values for n_neighbors and
# leaf_size, plus the same fixed weights/metric that the objective used
tuned = study.best_trial.params
params = dict(
    n_neighbors=tuned['n_neighbors'],
    weights="distance",
    leaf_size=tuned['leaf_size'],
    metric="manhattan",
)

best_model = KNeighborsClassifier(n_jobs=-1, **params)

In [None]:
# Show the parameters of the configured estimator as reported by get_params()
best_model.get_params()

## Train best model

In [None]:
# Train best model with all train data (the de-duplicated X_train / y_train);
# the fitted model is used for the test predictions in the submission section
best_model.fit(X_train, y_train)

## Submission

In [None]:
# Load the test set; row_id is kept aside for the submission and excluded from
# the features (in the training data it was the index, not a column)
test = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/test.csv')
row_id = test['row_id'].values
X_test = test.drop(columns='row_id').astype(np.float32)

# Predict encoded classes, then map them back to the original labels
target = target_encoder.inverse_transform(best_model.predict(X_test).squeeze())

submission = pd.DataFrame({'row_id' : row_id, 'target' : target})

submission.head()

In [None]:
# Write the submission frame to submission.csv in the current working directory,
# without the DataFrame index
submission.to_csv('submission.csv', index=False)