# Generating wifi features

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import collections
import csv
import glob
import multiprocessing
import os
from multiprocessing import Pool
from pathlib import Path
from typing import List, Tuple, Any

In [None]:
def input_dir() -> Path:
    """Return the root directory of the competition's input data."""
    competition_root = Path('/kaggle/input/indoor-location-navigation/')
    return competition_root

In [None]:
def extract_wps_wifis(file: Path) -> Tuple[List[List[Any]], List[List[Any]]]:
    """Read a tab-separated trace file and return its waypoint and wifi rows.

    Only rows whose second field is ``TYPE_WAYPOINT`` or ``TYPE_WIFI`` are
    kept; every other row (headers, other sensors, blank lines) is ignored.

    Args:
        file: Path of the trace file to parse.

    Returns:
        A ``(wps, wifis)`` tuple. Each element is a list of rows (lists of
        fields) sorted by the numeric value of the timestamp in field 0.
        In waypoint rows fields 2 and 3 (x, y) are converted to ``float``;
        in wifi rows field 4 (signal value) is converted to ``int``. All
        other fields stay as strings.

    Raises:
        ValueError: If a timestamp, coordinate or signal value is not numeric.
        IndexError: If a waypoint / wifi row has fewer fields than expected.
    """
    wps: List[List[Any]] = []
    wifis: List[List[Any]] = []
    # newline="" lets the csv module do its own line-ending handling, as the
    # csv documentation recommends.
    with open(file, newline="") as f:
        for row in csv.reader(f, delimiter="\t", doublequote=True):
            # Blank lines come back as [] from csv.reader; skip anything that
            # has no record-type field instead of crashing on row[1].
            if len(row) < 2:
                continue
            record_type = row[1]
            if record_type == "TYPE_WAYPOINT":
                # x
                row[2] = float(row[2])  # type: ignore
                # y
                row[3] = float(row[3])  # type: ignore
                wps.append(row)
            elif record_type == "TYPE_WIFI":
                # wifi signal value
                row[4] = int(row[4])  # type: ignore
                wifis.append(row)

    # Timestamps are kept as strings in the rows, so compare them numerically;
    # a plain string sort would put "1000" before "999".
    def by_timestamp(record: List[Any]) -> int:
        return int(record[0])

    wps.sort(key=by_timestamp)
    wifis.sort(key=by_timestamp)
    return wps, wifis


def top_bssids(bssids: List[str], n: int) -> List[str]:
    """Return the BSSIDs that occur more than ``n`` times, sorted ascending.

    Args:
        bssids: BSSID observations, one entry per observation (duplicates
            expected).
        n: Exclusive occurrence threshold; a BSSID is kept only when its
            count is strictly greater than ``n``.

    Returns:
        Sorted list of the distinct BSSIDs whose count exceeds ``n``. An
        empty input yields an empty list.
    """
    # Counter handles the empty case naturally and avoids building a
    # DataFrame just to count values.
    counts = collections.Counter(bssids)
    return sorted(bssid for bssid, count in counts.items() if count > n)


def top_bssids_for_building(input_dir: Path, building: str, n: int) -> List[str]:
    """Collect the frequently seen BSSIDs across one building's training traces.

    Every ``*.txt`` file found one level below ``<input_dir>/train/<building>``
    is parsed with ``extract_wps_wifis``; field 3 (the BSSID) of each wifi row
    is gathered and the pooled list is filtered by ``top_bssids``.

    Args:
        input_dir: Root directory that contains the ``train`` folder.
        building: Name of the building sub-directory.
        n: Occurrence threshold forwarded to ``top_bssids``.

    Returns:
        Whatever ``top_bssids`` returns for the pooled BSSID list.
    """
    floor_pattern = os.path.join(input_dir, 'train/' + building+'/*')
    seen_bssids = []
    for floor_dir in sorted(glob.glob(floor_pattern)):
        for trace in glob.glob(os.path.join(floor_dir, "*.txt")):
            wifi_rows = extract_wps_wifis(Path(trace))[1]
            seen_bssids += [wifi_row[3] for wifi_row in wifi_rows]

    return top_bssids(seen_bssids, n)


def nearest_waypoint(timestamp: int, wps: List[List[str]]) -> List[str]:
    """Return the waypoint row whose timestamp is closest to ``timestamp``.

    Args:
        timestamp: Reference timestamp.
        wps: Waypoint rows; field 0 of each row is its timestamp.

    Returns:
        The row from ``wps`` with the smallest absolute timestamp difference.
        When several rows tie, the one that comes first in ``wps`` wins.

    Raises:
        ValueError: If ``wps`` is empty.
    """
    def time_gap(wp: List[str]) -> int:
        # timestamp delta
        return abs(timestamp - int(wp[0]))

    return min(wps, key=time_gap)


def generate_train_for_building(building_path: Path, bssids: List[str]) -> pd.DataFrame:
    """Build the training feature table for one building.

    Every ``*.txt`` trace found one level below ``building_path`` is turned
    into feature rows by ``generate_train_for_path`` and all rows are
    concatenated into a single frame with a fresh 0..N-1 index.

    Note: the result can contain exactly identical rows, because two wifi
    groups with the same content can both be nearest to the same waypoint.

    Args:
        building_path: Directory of the building; its immediate children are
            the folders holding the trace files.
        bssids: BSSIDs used as feature columns; these columns are cast to
            ``int`` in the result.

    Returns:
        The concatenated frame of all per-timestamp rows.

    Raises:
        ValueError: From ``pd.concat`` when no rows were produced at all.
    """
    dfs = []
    for folder in sorted(building_path.glob('*')):
        # Path.glob yields files in arbitrary order; sort so the row order of
        # the output does not depend on the file system.
        for file in sorted(folder.glob("*.txt")):
            dfs.extend(generate_train_for_path(file, bssids))
    building_df = pd.concat(dfs, ignore_index=True)
    type_map = {column: int for column in bssids}
    return building_df.astype(type_map)  # type: ignore


def generate_train_for_path(path_file: Path, bssids: List[str]) -> List[Any]:
    """Create one feature row per wifi timestamp found in a single trace file.

    The floor label is the name of the file's parent directory and the path
    id is the file name without its extension.

    Args:
        path_file: Trace file to process.
        bssids: BSSIDs used as feature columns.

    Returns:
        The rows returned by ``generate_train_for_timestamp``, one for each
        distinct wifi timestamp, in the order ``groupby('timestamp')`` yields
        them.
    """
    floor = str(path_file.parent.name)
    path = path_file.stem
    wps, wifis = extract_wps_wifis(path_file)
    wifi_columns = ['timestamp', 'type', 'ssid', 'bssid', 'value',
                    'channel', 'last_timestamp']
    wifis_df = pd.DataFrame(wifis, columns=wifi_columns)
    return [
        generate_train_for_timestamp(
            int(scan_time), scan_rows, wps, floor, path, bssids)
        for scan_time, scan_rows in wifis_df.groupby('timestamp')
    ]


def generate_train_for_timestamp(timestamp: int, wifi_group: pd.DataFrame, wps: List[Any], floor: str, path: str, bssids: List[str]) -> pd.DataFrame:
    """Build the single-row feature frame for one wifi scan.

    Args:
        timestamp: Timestamp of the wifi scan.
        wifi_group: Wifi rows of that scan; positional columns 3 and 4 must be
            ``bssid`` and ``value``.
        wps: Waypoint rows of the same trace (timestamp in field 0, x in
            field 2, y in field 3).
        floor: Floor label such as ``"B1"``, ``"F3"`` or ``"3F"``.
        path: Identifier of the trace, stored in the ``path`` column.
        bssids: BSSIDs used as feature columns, in output order.

    Returns:
        A one-row frame with one column per entry of ``bssids`` (signal value,
        or -999 when that BSSID was not seen in the scan) followed by the
        columns ``x``, ``y``, ``f`` and ``path``. ``x`` / ``y`` come from the
        waypoint nearest in time and ``f`` is the numeric floor.

    Raises:
        KeyError: If ``floor`` is not one of the known floor labels.
    """
    # Basements are negative; "F<k>" and "<k>F" both map to k - 1 for k = 1..9.
    floor_map = {"B2": -2, "B1": -1}
    for level in range(1, 10):
        floor_map[f"F{level}"] = level - 1
        floor_map[f"{level}F"] = level - 1

    waypoint = nearest_waypoint(timestamp, wps)

    # Keep the first reading of every BSSID, then lay the readings out along
    # the requested BSSID columns.
    first_readings = wifi_group.drop_duplicates(subset='bssid')
    signal_by_bssid = first_readings.iloc[:, 3:5].set_index('bssid')  # bssid and value
    row = signal_by_bssid.reindex(bssids).replace(np.nan, -999).T

    targets = (
        ("x", float(waypoint[2])),
        ("y", float(waypoint[3])),
        ("f", floor_map[floor]),
        ("path", path),
    )
    for column, value in targets:
        row[column] = value
    return row


def generate_target_buildings() -> List[str]:
    """Return the sorted, distinct site ids found in the sample submission.

    The site id is the part of ``site_path_timestamp`` that precedes the
    first underscore.
    """
    ssubm = pd.read_csv(
        '/kaggle/input/indoor-location-navigation/sample_submission.csv')
    sites = {row_id.split("_")[0] for row_id in ssubm["site_path_timestamp"]}
    return sorted(sites)


def generate_one(building: str):
    """Generate and save the training feature CSV of a single building.

    The feature columns are the BSSIDs seen more than 1000 times in the
    building's training traces. The table is written to
    ``<building>_train.csv`` in the current working directory, and start /
    end markers are printed.

    Args:
        building: Name of the building directory under ``train``.
    """
    print(f"start:{building}")
    data_root = input_dir()
    bssids = top_bssids_for_building(data_root, building, 1000)
    train_df = generate_train_for_building(
        data_root / 'train' / building, bssids)
    train_df.to_csv(f'{building}_train.csv', index=False)
    print(f"end:{building}")


def generate_train():
    """Generate the training feature CSVs of all target buildings in parallel.

    One worker process per CPU core is started and ``generate_one`` is mapped
    over the buildings returned by ``generate_target_buildings``.
    """
    num_cores = multiprocessing.cpu_count()
    print(f"num_cores={num_cores}")
    # The context manager shuts the worker processes down once map() has
    # returned; without it the pool's processes are never released.
    with Pool(num_cores) as pool:
        pool.map(generate_one, generate_target_buildings())


def generate_test_one(building_df: pd.DataFrame):
    """Generate and save the test feature CSV of a single building.

    For every requested (path, timestamp) pair the wifi scan nearest in time
    within that path's test trace is looked up and laid out along the
    building's BSSID columns (-999 for BSSIDs missing from the scan). The
    table is written to ``<building>_test.csv`` in the current working
    directory.

    Args:
        building_df: Rows of the sample submission for one building. Column 0
            must hold the site id, column 2 the timestamp, and a column named
            ``path`` the trace id.
    """
    building = building_df.iloc[0, 0]
    print(f"start: {building}")
    bssids = top_bssids_for_building(input_dir(), building, 1000)  # type: ignore
    feats = []
    # group by path
    for path, path_df in building_df.groupby('path'):
        _, wifis = extract_wps_wifis(input_dir() / 'test' / f'{path}.txt')

        wifi_df = pd.DataFrame(wifis)
        # A trace without any wifi record gives an empty frame with no
        # columns; grouping on column 0 would then raise KeyError.
        has_wifi = not wifi_df.empty
        if has_wifi:
            # Distinct scan timestamps (kept as the original strings for the
            # equality filter below) plus a numeric copy, converted once per
            # path instead of once per requested timepoint.
            scan_labels = sorted(wifi_df[0].unique())
            scan_times = np.array([int(label) for label in scan_labels],
                                  dtype=np.int64)
        # Several timepoints often map to the same scan; build each scan's
        # feature vector only once.
        scan_features = {}

        for timepoint in path_df.iloc[:, 2].tolist():
            if has_wifi:
                nearest_idx = int(np.abs(scan_times - int(timepoint)).argmin())
                wifi_block_timestamp = scan_labels[nearest_idx]
                if wifi_block_timestamp not in scan_features:
                    wifi_block = wifi_df[wifi_df[0] ==
                                         wifi_block_timestamp].drop_duplicates(subset=3)
                    scan_features[wifi_block_timestamp] = (
                        wifi_block.set_index(3)[4].reindex(bssids).fillna(-999))
                # Copy: the id is added in place below.
                feat = scan_features[wifi_block_timestamp].copy()
            else:
                # No scan available: encode every BSSID as "not seen".
                feat = pd.Series(-999.0, index=bssids)

            feat['site_path_timestamp'] = f'{building}_{path}_{timepoint}'
            feats.append(feat)
    feature_df = pd.concat(feats, axis=1).T
    feature_df.to_csv(f"{building}_test.csv", index=False)
    print(f'end: {building}')


def generate_test():
    """Generate the test feature CSVs of all buildings in parallel.

    The sample submission ids are split on ``_`` into ``site``, ``path`` and
    ``timestamp``; the rows are grouped by site and ``generate_test_one`` is
    mapped over the per-site frames with one worker process per CPU core.
    """
    sub_df = pd.read_csv(
        '/kaggle/input/indoor-location-navigation/sample_submission.csv')
    sub_df = sub_df["site_path_timestamp"].apply(
        lambda x: pd.Series(x.split("_")))
    sub_df.columns = ['site', 'path', 'timestamp']

    building_dfs = [building_df for _, building_df in sub_df.groupby('site')]

    num_cores = multiprocessing.cpu_count()
    print(f"num_cores={num_cores}")
    # The context manager shuts the worker processes down once map() has
    # returned; without it the pool's processes are never released.
    with Pool(num_cores) as pool:
        pool.map(generate_test_one, building_dfs)

In [None]:
# Write one <building>_train.csv per target building into the working directory.
generate_train()

In [None]:
# Write one <building>_test.csv per building of the sample submission.
generate_test()

# LightGBM as regressor

In [None]:
# ------------------------------------------------------------------------------
# Import libraries
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd
import scipy.stats as stats
from pathlib import Path
import glob

from sklearn.model_selection import KFold
import lightgbm as lgb

import psutil
import random
import os
import time
import sys
import math
from contextlib import contextmanager

# ------------------------------------------------------------------------------
# Fixed values
# ------------------------------------------------------------------------------
N_SPLITS = 10  # number of KFold splits used for every building
SEED = 100  # seed for set_seed, KFold and the LightGBM parameters

# ------------------------------------------------------------------------------
# File path definition
# ------------------------------------------------------------------------------
LOG_PATH = Path("./log/")  # directory that receives log_score.csv
LOG_PATH.mkdir(parents=True, exist_ok=True)  # created up front; no error if it already exists


# ------------------------------------------------------------------------------
# Utilities
# ------------------------------------------------------------------------------
@contextmanager
def timer(name: str):
    """Context manager that reports elapsed time and memory change of a block.

    On exit (also when the block raises) one line is printed to stderr with
    the process memory after the block in GB, the signed change since entry,
    the elapsed wall-clock seconds and ``name``.

    Args:
        name: Label appended to the report line.
    """
    started = time.time()
    bytes_per_gb = 2. ** 30
    process = psutil.Process(os.getpid())
    mem_before = process.memory_info()[0] / bytes_per_gb
    try:
        yield
    finally:
        mem_after = process.memory_info()[0] / bytes_per_gb
        change = mem_after - mem_before
        if change >= 0:
            sign = '+'
        else:
            sign = '-'
        # The sign is printed separately, so only the magnitude is formatted.
        magnitude = math.fabs(change)
        print(f"[{mem_after:.1f}GB({sign}{magnitude:.1f}GB): {time.time() - started:.3f}sec] {name}", file=sys.stderr)


def set_seed(seed=1234):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED``.

    Args:
        seed: Seed value applied to all three.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

    
def comp_metric(xhat, yhat, fhat, x, y, f):
    """Return the mean position error of a set of predictions.

    Each sample contributes the Euclidean distance between the predicted and
    true (x, y) plus 15 times the absolute floor difference; the sum is
    divided by the number of predictions (``xhat.shape[0]``).

    Args:
        xhat, yhat, fhat: Predicted x, y and floor arrays.
        x, y, f: True x, y and floor arrays.

    Returns:
        The averaged error.
    """
    planar_error = np.sqrt(np.power(xhat-x, 2) + np.power(yhat-y, 2))
    floor_penalty = 15 * np.abs(fhat-f)
    per_sample = planar_error + floor_penalty
    return per_sample.sum()/xhat.shape[0]


def score_log(df: pd.DataFrame, num_files: int, nam_file: str, data_shape: tuple, n_fold: int, seed: int, mpe: float):
    """Append one score record to the log frame and rewrite the log CSV.

    Args:
        df: Log frame accumulated so far (may be empty).
        num_files: Index of the file being processed.
        nam_file: Name of the file being processed.
        data_shape: Shape of the training data.
        n_fold: Fold number of the record.
        seed: Seed used for the run.
        mpe: Score (mean position error) to record.

    Returns:
        A new frame holding ``df`` plus the new record. The same frame is
        written to ``LOG_PATH / "log_score.csv"`` without its index.
    """
    record = dict(n_files=num_files, file_name=nam_file, shape=data_shape,
                  fold=n_fold, seed=seed, score=mpe)
    # noinspection PyTypeChecker
    updated = pd.concat([df, pd.DataFrame([record])])
    updated.to_csv(LOG_PATH / "log_score.csv", index=False)
    return updated




# ------------------------------------------------------------------------------
# Set seed
# ------------------------------------------------------------------------------
set_seed(SEED)

# ------------------------------------------------------------------------------
# Read data
# ------------------------------------------------------------------------------
# Per-building feature files; both lists are sorted because the training loop
# pairs train and test files by their position in these lists.
feature_dir = "../input/indoor-navigation-and-location-wifi-features"
train_files = sorted(glob.glob(os.path.join(feature_dir, '*_train.csv')))
test_files = sorted(glob.glob(os.path.join(feature_dir, '*_test.csv')))
# Sample submission, indexed by its first column; supplies the output column
# names and the final row order of the submission.
subm = pd.read_csv('../input/indoor-location-navigation/sample_submission.csv', index_col=0)

# ------------------------------------------------------------------------------
# Define parameters for models
# ------------------------------------------------------------------------------
# Settings of the x / y coordinate regressors.
lgb_params = {
    'objective': 'root_mean_squared_error',
    'boosting_type': 'gbdt',
    'n_estimators': 50000,
    'learning_rate': 0.1,
    'num_leaves': 90,
    'colsample_bytree': 0.4,
    'subsample': 0.6,
    'subsample_freq': 2,
    'bagging_seed': SEED,
    'reg_alpha': 8,
    'reg_lambda': 2,
    'random_state': SEED,
    'n_jobs': -1,
}

# Settings of the floor classifier: identical to the regressor settings except
# for the objective and the L1 regularisation strength. Overriding existing
# keys keeps the key order of lgb_params.
lgb_f_params = {**lgb_params, 'objective': 'multiclass', 'reg_alpha': 10}

# ------------------------------------------------------------------------------
# Training and inference
# ------------------------------------------------------------------------------
# For every building: K-fold train one regressor for x, one for y and one
# classifier for the floor, collect out-of-fold scores, and build the test
# predictions (x / y averaged over folds, floor by majority vote over folds).
score_df = pd.DataFrame()
oof = list()  # whole-file out-of-fold score of each building
predictions = list()  # one prediction frame per building
for n_files, file in enumerate(train_files):
    # The test file is taken from the same position of the sorted list, so
    # train_files and test_files must line up one-to-one.
    # index_col=0 turns the first CSV column into the index.
    data = pd.read_csv(file, index_col=0)
    test_data = pd.read_csv(test_files[n_files], index_col=0)

    oof_x, oof_y, oof_f = np.zeros(data.shape[0]), np.zeros(data.shape[0]), np.zeros(data.shape[0])
    preds_x, preds_y = 0, 0
    # One floor prediction per test row and fold; reduced by mode after the folds.
    preds_f_arr = np.zeros((test_data.shape[0], N_SPLITS))

    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    # Everything except the last four columns is used as features; columns
    # -4, -3 and -2 are the x, y and floor targets and the last column is unused.
    # NOTE(review): presumably the last four columns are x, y, f, path - verify
    # against the feature dataset.
    for fold, (trn_idx, val_idx) in enumerate(kf.split(data.iloc[:, :-4])):
        X_train = data.iloc[trn_idx, :-4]
        y_trainx = data.iloc[trn_idx, -4]
        y_trainy = data.iloc[trn_idx, -3]
        y_trainf = data.iloc[trn_idx, -2]

        X_valid = data.iloc[val_idx, :-4]
        y_validx = data.iloc[val_idx, -4]
        y_validy = data.iloc[val_idx, -3]
        y_validf = data.iloc[val_idx, -2]

        # NOTE(review): the `verbose` and `early_stopping_rounds` keyword
        # arguments of fit() are reported to have been removed in LightGBM 4.0
        # in favour of callbacks - confirm against the installed version.
        modelx = lgb.LGBMRegressor(**lgb_params)
        with timer("fit X"):
            modelx.fit(X_train, y_trainx,
                       eval_set=[(X_valid, y_validx)],
                       eval_metric='rmse',
                       verbose=False,
                       early_stopping_rounds=20
                       )

        modely = lgb.LGBMRegressor(**lgb_params)
        with timer("fit Y"):
            modely.fit(X_train, y_trainy,
                       eval_set=[(X_valid, y_validy)],
                       eval_metric='rmse',
                       verbose=False,
                       early_stopping_rounds=20
                       )

        modelf = lgb.LGBMClassifier(**lgb_f_params)
        with timer("fit F"):
            modelf.fit(X_train, y_trainf,
                       eval_set=[(X_valid, y_validf)],
                       eval_metric='multi_logloss',
                       verbose=False,
                       early_stopping_rounds=20
                       )

        # Out-of-fold predictions for the validation rows of this fold.
        oof_x[val_idx] = modelx.predict(X_valid)
        oof_y[val_idx] = modely.predict(X_valid)
        oof_f[val_idx] = modelf.predict(X_valid).astype(int)

        # Test features are all columns but the last one; x / y accumulate the
        # fold average, the floor is stored per fold.
        preds_x += modelx.predict(test_data.iloc[:, :-1]) / N_SPLITS
        preds_y += modely.predict(test_data.iloc[:, :-1]) / N_SPLITS
        preds_f_arr[:, fold] = modelf.predict(test_data.iloc[:, :-1]).astype(int)

        score = comp_metric(oof_x[val_idx], oof_y[val_idx], oof_f[val_idx],
                            y_validx.to_numpy(), y_validy.to_numpy(), y_validf.to_numpy())
        print(f"fold {fold}: mean position error {score}")
        score_df = score_log(score_df, n_files, os.path.basename(file), data.shape, fold, SEED, score)

    print("*+"*40)
    print(f"file #{n_files}, shape={data.shape}, name={os.path.basename(file)}")
    # Score over all out-of-fold predictions of this building.
    score = comp_metric(oof_x, oof_y, oof_f,
                        data.iloc[:, -4].to_numpy(), data.iloc[:, -3].to_numpy(), data.iloc[:, -2].to_numpy())
    oof.append(score)
    print(f"mean position error {score}")
    print("*+"*40)
    # fold=999 marks the whole-file record in the score log.
    score_df = score_log(score_df, n_files, os.path.basename(file), data.shape, 999, SEED, score)

    # Majority vote of the per-fold floor predictions.
    preds_f_mode = stats.mode(preds_f_arr, axis=1)
    preds_f = preds_f_mode[0].astype(int).reshape(-1)
    # Stacked in the order floor, x, y and labelled with the sample
    # submission's column names.
    # NOTE(review): assumes subm.columns is ordered floor, x, y - confirm.
    test_preds = pd.DataFrame(np.stack((preds_f, preds_x, preds_y))).T
    test_preds.columns = subm.columns
    test_preds.index = test_data["site_path_timestamp"]
    test_preds["floor"] = test_preds["floor"].astype(int)
    predictions.append(test_preds)

# ------------------------------------------------------------------------------
# Submit the result
# ------------------------------------------------------------------------------
# Stack the per-building predictions, put the rows in the sample submission's
# order and write the submission file.
all_preds = pd.concat(predictions)
all_preds = all_preds.reindex(subm.index)
all_preds.to_csv('submission.csv')

In [None]:
# Show a link to the written submission file in the notebook output.
from IPython.display import FileLink
FileLink('submission.csv')