This notebook uses machine learning (kNN) to predict the area type of each collection (Open-Sky/Highway, Open-Sky/Tree-lined street, or Downtown, as described in [this discussion](https://www.kaggle.com/c/google-smartphone-decimeter-challenge/discussion/245160)), using hand-labeled training data.
We believe that hand-labeling the training data is allowed, and that there is no problem with using labels predicted by a model trained on those hand labels, but what do you think, [@sohier](https://www.kaggle.com/sohier)?

## Method
1. Hand-label the train collections ( [KML files for Train, Test, and Ground Truth](https://www.kaggle.com/c/google-smartphone-decimeter-challenge/discussion/245160) )
1. Aggregate latDeg and lngDeg per collectionName (standard deviation for the downtown model, minimum for the highway/tree model) and predict the area type with KNeighborsClassifier

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from glob import glob
from sklearn.neighbors import KNeighborsClassifier


###READ DATA###
BASE_DIR = Path('../input/google-smartphone-decimeter-challenge')

# Both baseline tables are ordered by collection, then phone, then timestamp,
# and re-indexed from zero afterwards.
_ORDER_BY = ["collectionName", "phoneName", "millisSinceGpsEpoch"]

train_base = (
    pd.read_csv(BASE_DIR / 'baseline_locations_train.csv')
    .sort_values(_ORDER_BY)
    .reset_index(drop=True)
)
test_base = (
    pd.read_csv(BASE_DIR / 'baseline_locations_test.csv')
    .sort_values(_ORDER_BY)
    .reset_index(drop=True)
)


###ADD AREA###
def _area_token(collection_name):
    """Return the fifth '-'-separated field of a collection name."""
    return collection_name.split('-')[4]


train_base['area'] = train_base['collectionName'].map(_area_token)
test_base['area'] = test_base['collectionName'].map(_area_token)

###TRAIN AREA TARGET!!!###
# Hand labels are expressed as 1-based positions in the sorted listing of the
# train directory (hence the "- 1" when indexing). The labels are therefore
# only valid for the exact directory contents they were written against.
# area_target codes: 0 = highway, 1 = tree-lined street, 2 = downtown;
# -1 marks collections that appear in none of the hand-labelled lists.
# Path(...).name takes the last path component independent of the separator.
train_name = np.array(
    sorted(Path(path).name for path in glob(f'{BASE_DIR}/train/*')))
train_highway = train_name[np.arange(1, 22) - 1]
train_tree = train_name[np.array([22, 23, 25, 26, 28]) - 1]
train_downtown = train_name[np.array([24, 27, 29]) - 1]

train_base['area_target'] = -1
train_base.loc[train_base['collectionName'].isin(train_highway), 'area_target'] = 0
train_base.loc[train_base['collectionName'].isin(train_tree), 'area_target'] = 1
train_base.loc[train_base['collectionName'].isin(train_downtown), 'area_target'] = 2

###TEST AREA TARGET!!!###
# Same scheme as for train: 1-based positions in the sorted test directory.
test_name = np.array(
    sorted(Path(path).name for path in glob(f'{BASE_DIR}/test/*')))
test_highway = test_name[np.arange(1, 10) - 1]
test_tree = test_name[np.array([10, 11, 12, 13, 14, 16, 17, 18]) - 1]
test_downtown = test_name[np.array([15, 19]) - 1]

# Initialise the column exactly like the train table so that it is an integer
# column and any unlabelled collection gets -1 instead of NaN.
test_base['area_target'] = -1
test_base.loc[test_base['collectionName'].isin(test_highway), 'area_target'] = 0
test_base.loc[test_base['collectionName'].isin(test_tree), 'area_target'] = 1
test_base.loc[test_base['collectionName'].isin(test_downtown), 'area_target'] = 2


###PREDICT DOWNTOWN###
def processing_downtown(input_df: pd.DataFrame):
    """Build one feature row per collection for the downtown classifier.

    The returned frame is indexed by ``collectionName`` and holds, in order:
    the standard deviation of ``latDeg`` and ``lngDeg`` within the collection,
    the first ``area_target`` and ``area`` value of the collection, and the
    list of distinct ``phoneName`` values seen in it.
    """
    per_collection = input_df.groupby('collectionName')

    spread = per_collection[['latDeg', 'lngDeg']].std()
    target = per_collection[['area_target']].first()
    area = per_collection['area'].first()
    phones = per_collection['phoneName'].unique().apply(list)

    # Every piece comes from the same grouping, so they share one index and
    # can be aligned on it directly.
    return spread.join(target).join(area).join(phones)

# Per-collection feature tables for the downtown-vs-rest model.
train = processing_downtown(train_base)
test = processing_downtown(test_base)

# Binary target: 1 where the hand label is downtown (area_target == 2), else 0.
train['downtown_target'] = train['area_target'].eq(2).astype(int)

_downtown_features = ['latDeg', 'lngDeg']

# 1-nearest-neighbour classifier on the per-collection lat/lng features.
model_knn = KNeighborsClassifier(n_neighbors=1)
model_knn.fit(train[_downtown_features], train['downtown_target'])

downtown_pred = model_knn.predict(test[_downtown_features])


###PREDICT HIGHWAY & TREE###
def processing_highway_tree(input_df: pd.DataFrame):
    """Build one feature row per collection for the highway/tree classifier.

    The returned frame is indexed by ``collectionName`` and holds, in order:
    the minimum ``latDeg`` and ``lngDeg`` of the collection, the first
    ``area_target`` and ``area`` value of the collection, and the list of
    distinct ``phoneName`` values seen in it.
    """
    by_collection = input_df.groupby('collectionName')

    extras = (
        by_collection[['area_target']].first(),
        by_collection['area'].first(),
        by_collection['phoneName'].unique().apply(list),
    )

    output_df = by_collection[['latDeg', 'lngDeg']].min()
    for extra in extras:
        output_df = output_df.merge(extra, on='collectionName')
    return output_df

# Per-collection feature tables for the highway-vs-tree model.
train = processing_highway_tree(train_base)
test = processing_highway_tree(test_base)

# Collections already predicted as downtown keep label 2; the rest stay
# missing in 'area_pred' until the second model fills them in.
test.loc[downtown_pred==1, 'area_pred'] = 2

_coord_cols = ['latDeg', 'lngDeg']

# Fit only on train collections whose hand label is not downtown.
_train_rest = train['area_target'] != 2
model_knn = KNeighborsClassifier(n_neighbors=1)
model_knn.fit(
    train.loc[_train_rest, _coord_cols],
    train.loc[_train_rest, 'area_target'])

# Predict highway/tree for every test collection still without a label.
_unlabelled = test['area_pred'].isnull()
test.loc[_unlabelled, 'area_pred'] = model_knn.predict(
    test.loc[_unlabelled, _coord_cols])

_label_cols = ['area_pred', 'area_target']
test[_label_cols] = test[_label_cols].astype(int)
test = test[_coord_cols + ['area', 'phoneName'] + _label_cols]
test # 100% !!!