# GLC2022 - KNN Baseline

I was wondering what we can learn from coordinates alone. Before moving on to much more complex models, I thought a k-nearest-neighbours (KNN) model might be a good baseline.


In [None]:
%pylab inline --no-import-all

import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

# Root folder of the competition data; edit it to match your download location
DATA_PATH = Path("../input/geolifeclef-2022-lifeclef-2022-fgvc9")

# Folder that receives the submission files (created if it does not exist yet)
SUBMISSION_PATH = Path("submissions")
SUBMISSION_PATH.mkdir(parents=True, exist_ok=True)

# Clone the GitHub repository
!rm -rf GLC
!git clone https://github.com/maximiliense/GLC
    
#from GLC.metrics import top_30_error_rate, top_k_error_rate_from_sets, predict_top_30_set, generate_submission_file

from GLC.metrics import top_30_error_rate, top_k_error_rate_from_sets, predict_top_30_set
from GLC.submission import generate_submission_file

# Data Loading

In [None]:
# Labelled observations: the French and US files are read with the same options
# and stacked into a single frame indexed by observation_id.
df_obs_fr = pd.read_csv(DATA_PATH / "observations" / "observations_fr_train.csv", sep=";", index_col="observation_id")
df_obs_us = pd.read_csv(DATA_PATH / "observations" / "observations_us_train.csv", sep=";", index_col="observation_id")
df_obs = pd.concat((df_obs_fr, df_obs_us))

# Test observations, loaded the same way.
df_obs_fr_test = pd.read_csv(DATA_PATH / "observations" / "observations_fr_test.csv", sep=";", index_col="observation_id")
df_obs_us_test = pd.read_csv(DATA_PATH / "observations" / "observations_us_test.csv", sep=";", index_col="observation_id")
df_obs_test = pd.concat((df_obs_fr_test, df_obs_us_test))

# The "subset" column assigns each labelled observation to the train or val split.
obs_id_train = df_obs.index[df_obs["subset"] == "train"].values
obs_id_val = df_obs.index[df_obs["subset"] == "val"].values
obs_id_test = df_obs_test.index.values

# Select rows and column in a single .loc call instead of copying every column
# of the split first and then picking one.
y_train = df_obs.loc[obs_id_train, "species_id"].values
y_val = df_obs.loc[obs_id_val, "species_id"].values

n_obs = len(df_obs)
n_train = len(obs_id_train)
n_val = len(obs_id_val)
n_test = len(obs_id_test)

print(f'Data set size: {n_obs} observations')
print(f'Train set size: {n_train} - {n_train / n_obs :.2%} of observations')
print(f'Validation set size: {n_val} - {n_val / n_obs :.2%} of observations')
# The test observations are not part of df_obs, so this ratio is relative to the
# train split and is labelled as such.
print(f'Test set size: {n_test} - {n_test / n_train:.2%} of train set size')

In [None]:
%%time

# Number of nearest training observations retrieved for every query point
n_neighbors = 1200

# Coordinate matrices (one latitude/longitude row per observation), taken as
# independent copies so later changes cannot write back into the data frames.
X_train = df_obs.loc[obs_id_train, ['latitude', 'longitude']].values.copy()
X_val = df_obs.loc[obs_id_val, ['latitude', 'longitude']].values.copy()
X_test = df_obs_test.loc[:, ['latitude', 'longitude']].values.copy()

In [None]:
def pred_line(line, y, top_k=30):
    """Return the most frequent labels among the neighbours of one query point.

    Parameters
    ----------
    line : array of int
        Positions in `y` of the neighbours of a single query point.
    y : np.ndarray
        Label array that `line` indexes into.
    top_k : int, default 30
        Maximum number of labels to return.

    Returns
    -------
    np.ndarray
        Distinct labels ordered by decreasing number of occurrences, cut to
        `top_k` entries. Labels with the same count are ordered by ascending
        label value. The result is shorter than `top_k` when fewer distinct
        labels are present.
    """
    labels, counts = np.unique(y[line], return_counts=True)
    # np.unique returns the labels sorted, and a stable sort keeps that order
    # among equal counts, so ties are broken deterministically.
    order = np.argsort(-counts, kind="stable")
    return labels[order][:top_k]


def predict_NN(model, X_loc, y, top_k=30):
    """Predict, for every query point, the most frequent labels among its neighbours.

    Parameters
    ----------
    model : fitted neighbours model
        Object exposing `kneighbors(X, return_distance=False)` that returns one
        row of neighbour positions per query point.
    X_loc : array-like
        Query points, passed as-is to `model.kneighbors`.
    y : np.ndarray
        Labels of the points the model was fitted on; the neighbour positions
        index into this array.
    top_k : int, default 30
        Number of labels kept per query point (forwarded to `pred_line`).

    Returns
    -------
    np.ndarray
        One row of labels per query point, as returned by `pred_line`.

    Notes
    -----
    NOTE(review): if some neighbourhood holds fewer than `top_k` distinct
    labels the rows have unequal lengths and cannot be stacked into a regular
    2-D array; that case is not handled here.
    """
    # Only the neighbour positions are used, so the distances are not requested.
    indices = model.kneighbors(X_loc, return_distance=False)
    return np.array([pred_line(row, y, top_k) for row in indices])


# Working copies of the coordinate matrices used by the neighbour search
X_train_loc, X_val_loc, X_test_loc = (coords.copy() for coords in (X_train, X_val, X_test))

# Ball-tree index over the training coordinates; each query returns n_neighbors points
nbrs = NearestNeighbors(
    algorithm='ball_tree',
    n_neighbors=n_neighbors,
).fit(X_train_loc)

# Score the validation split against the training labels
s_pred = predict_NN(nbrs, X_val_loc, y_train)
score = top_k_error_rate_from_sets(y_val, s_pred)
print(f'KNN {n_neighbors} - Top-30 error rate: {score:.2%}')

# Post-processing

We only ever predict 5925 species. What if we remove the species we never predict from the training set?

In [None]:
# Every species that appears at least once in the top-40 validation predictions
unique_preds = np.unique(predict_NN(nbrs, X_val_loc, y_train, top_k=40))

# Boolean mask: True for training observations whose species is among them
filter_keep = np.isin(y_train, unique_preds)

# Filtered training set (coordinates and labels stay aligned through the same mask)
X_train_loc_bis = X_train_loc[filter_keep]
y_train_bis = y_train[filter_keep]

print(f' propotion of target kept: {np.unique(y_train_bis).size / np.unique(y_train).size:.2%}')
print(f' propotion of instance kept: {y_train_bis.size / y_train.size:.2%}')

In [None]:
# Rebuild the ball-tree index on the filtered training coordinates only
nbrs = NearestNeighbors(
    algorithm='ball_tree',
    n_neighbors=n_neighbors,
).fit(X_train_loc_bis)

# Validation score of the filtered model; labels must come from the same filtered set
s_pred_bis = predict_NN(nbrs, X_val_loc, y_train_bis)
score = top_k_error_rate_from_sets(y_val, s_pred_bis)
print(f'KNN {n_neighbors} - Top-30 error rate: {score:.2%}')

# Predict on the test set (retraining on the whole data set is currently disabled)

In [None]:
# No refit on the whole data set is done here: `nbrs` is still the model fitted
# on the filtered training coordinates, so the neighbour positions it returns
# line up with `y_train_bis`. Refitting on other rows would also require the
# matching label array, otherwise the positions would point at the wrong labels.

# Compute baseline on the test set
s_pred = predict_NN(nbrs, X_test_loc, y_train_bis)

# Generate the submission file; the name follows n_neighbors so it cannot go
# stale when that setting is changed.
generate_submission_file(SUBMISSION_PATH / f"lat_long_knn_{n_neighbors}.csv", df_obs_test.index, s_pred)