# Imports

In [26]:
import csv
import random
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer

In [33]:
random.seed(3927)

# Input

In [14]:
# Train
X_train, y_train = [], []
with open("input/train.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        sample = [f if f != '' else np.nan for f in row[1:-1]]
        X_train.append(sample)
        y_train.append(row[-1])
X_train = np.array(X_train, dtype='float')
y_train = np.array(y_train, dtype='int')

In [15]:
# Test
X_test = []
with open("input/test.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        sample = [f if f != '' else np.nan for f in row[1:]]
        X_test.append(sample)
X_test = np.array(X_test, dtype='float')

In [65]:
print(X_train[0], len(X_train[0]), y_train[0])

[  -4.18782175  -13.97884594 -246.34682666  143.77128452           nan
           nan  229.36080373   42.7291799    68.08876577  -14.35422206
  379.53492634   41.19237186  -89.71246617   91.03940423           nan
 -248.25915654   43.75840381  -40.15463248   -6.52880772 -176.74953079
   61.8727004   -89.54816069   32.41485189 -140.23182148  101.05476121
  -66.80018827   -5.89607964  -18.33249994  148.83052828 -112.41496457
   54.72238505  -66.32006108  157.46149586 -442.09197832   43.22645042
  -34.06027759   81.55610722 -229.44176268  234.65191099  -22.79971451
 -104.71116142   53.57580542  -46.05744368  -90.01054161 -155.88275903
  -99.35183224  -17.22170544   29.77794043  -52.29519064   86.38233725
    5.38248447  -47.87198572   -8.33688219    2.3098171    48.31420518
  101.67060799  -36.93306478  -89.90533254    9.08032462 -164.99263115
   -9.79146639   -1.72402732  -70.50777472 -274.41528558  -21.89281345
  110.34424102  -95.92101794   58.71223592  -51.03686555   31.53084503
  119.

In [66]:
print(X_test[0], len(X_test[0]))

[          nan  -53.72676379           nan  -38.5621528  -138.35115709
           nan   -0.94232273   41.17679205  -68.50197728   75.3381127
  151.28076053           nan   86.35470457   61.98910919  -82.17675297
  347.94990132  -20.65991763   50.67431442 -169.08435797  -21.5484204
   36.7010437    17.36768281   71.37129652 -114.22083116 -120.35301608
   85.43642352  -49.59181472  -86.53730854   23.82818363   19.59957773
   13.15223138   58.21813062  -87.6321296   156.20635593   -7.99511485
   63.58110861 -263.75929575 -144.56802354 -141.14883643   -6.81317708
 -107.76701598  -32.11995905   48.27804174 -102.16205616 -324.3721731
 -105.39509451   17.81982518   27.92826896  -81.89390493  -23.0515522
   44.60344144   -5.88051479   97.00822487   29.37705413  -48.88012564
   44.36204444  -87.01140815 -473.60999537  -71.46052784  188.72237556
   25.96248948 -544.76717004  251.32125113  -49.53800268  119.05681115
  -57.65147648  150.21279358 -233.41991507  191.3249957   -36.45332242
   28.7115

# Impute missing values

In [17]:
# Setup imputer
imp = KNNImputer(weights='distance')

In [18]:
# Train
X_train_imp = imp.fit_transform(X_train)

In [19]:
print(X_train_imp[0], len(X_train_imp[0]), y_train[0])

[  -4.18782175  -13.97884594 -246.34682666  143.77128452   -1.01831486
   57.27210684  229.36080373   42.7291799    68.08876577  -14.35422206
  379.53492634   41.19237186  -89.71246617   91.03940423   -8.35974451
 -248.25915654   43.75840381  -40.15463248   -6.52880772 -176.74953079
   61.8727004   -89.54816069   32.41485189 -140.23182148  101.05476121
  -66.80018827   -5.89607964  -18.33249994  148.83052828 -112.41496457
   54.72238505  -66.32006108  157.46149586 -442.09197832   43.22645042
  -34.06027759   81.55610722 -229.44176268  234.65191099  -22.79971451
 -104.71116142   53.57580542  -46.05744368  -90.01054161 -155.88275903
  -99.35183224  -17.22170544   29.77794043  -52.29519064   86.38233725
    5.38248447  -47.87198572   -8.33688219    2.3098171    48.31420518
  101.67060799  -36.93306478  -89.90533254    9.08032462 -164.99263115
   -9.79146639   -1.72402732  -70.50777472 -274.41528558  -21.89281345
  110.34424102  -95.92101794   58.71223592  -51.03686555   31.53084503
  119.

In [20]:
# Test
X_test_imp = imp.fit_transform(X_test)

In [21]:
print(X_test_imp[0], len(X_test_imp[0]))

[  51.18136768  -53.72676379  130.12401065  -38.5621528  -138.35115709
  153.03350057   -0.94232273   41.17679205  -68.50197728   75.3381127
  151.28076053   35.84814657   86.35470457   61.98910919  -82.17675297
  347.94990132  -20.65991763   50.67431442 -169.08435797  -21.5484204
   36.7010437    17.36768281   71.37129652 -114.22083116 -120.35301608
   85.43642352  -49.59181472  -86.53730854   23.82818363   19.59957773
   13.15223138   58.21813062  -87.6321296   156.20635593   -7.99511485
   63.58110861 -263.75929575 -144.56802354 -141.14883643   -6.81317708
 -107.76701598  -32.11995905   48.27804174 -102.16205616 -324.3721731
 -105.39509451   17.81982518   27.92826896  -81.89390493  -23.0515522
   44.60344144   -5.88051479   97.00822487   29.37705413  -48.88012564
   44.36204444  -87.01140815 -473.60999537  -71.46052784  188.72237556
   25.96248948 -544.76717004  251.32125113  -49.53800268  119.05681115
  -57.65147648  150.21279358 -233.41991507  191.3249957   -36.45332242
   28.7115

# Perform cross-validation for KNN

In [36]:
# Setup classifier
classifier = KNeighborsClassifier()

## Attempt \#1

In [37]:
# Setup params search
params_grid = { 'n_neighbors': list(range(1, 10, 2)), \
                'weights': ['distance'], \
                'algorithm': ['kd_tree', 'ball_tree'] }
res = GridSearchCV(classifier, params_grid, cv=5, refit=True)

In [38]:
# Fit
count = len(X_train_imp)
samples = random.sample(range(count), count // 10)
res.fit(X_train_imp[samples], y_train[samples])

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['kd_tree', 'ball_tree'],
                         'n_neighbors': [1, 3, 5, 7, 9],
                         'weights': ['distance']})

In [39]:
print(res.best_estimator_.get_params())

{'algorithm': 'kd_tree', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 9, 'p': 2, 'weights': 'distance'}


## Attempt \#2

In [41]:
# Setup params search
params_grid = { 'n_neighbors': list(range(1, 21)), \
                'weights': ['distance'], \
                'algorithm': ['kd_tree', 'ball_tree'] }
res = GridSearchCV(classifier, params_grid, cv=5, refit=True)

In [42]:
# Fit
count = len(X_train_imp)
samples = random.sample(range(count), count // 10)
res.fit(X_train_imp[samples], y_train[samples])

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['kd_tree', 'ball_tree'],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20],
                         'weights': ['distance']})

In [43]:
print(res.best_estimator_.get_params())

{'algorithm': 'kd_tree', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 13, 'p': 2, 'weights': 'distance'}


# Fit

In [60]:
classifier = KNeighborsClassifier(**res.best_estimator_.get_params())
classifier.fit(X_train_imp, y_train)

KNeighborsClassifier(algorithm='kd_tree', n_neighbors=13, weights='distance')

# Predict

In [62]:
y_pred = classifier.predict(X_test_imp)

In [63]:
print(y_pred[:100])

[4 7 3 4 9 5 5 5 1 2 7 8 6 5 4 1 7 0 7 1 3 8 7 1 1 9 4 6 3 9 3 6 3 4 5 6 7
 6 8 8 1 5 1 5 3 6 1 6 1 6 2 1 3 9 7 6 8 0 2 3 6 3 0 9 9 0 8 9 2 3 6 3 6 1
 2 1 2 2 7 9 4 3 5 8 1 9 6 1 7 9 9 8 7 4 4 6 6 9 2 4]


# Output

In [64]:
with open("output/pred.csv", 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(('id', 'label'))
    writer.writerows(zip(range(len(y_pred)), y_pred))