In [17]:
import os
import json
import glob
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from PIL import Image, ImageOps
from skimage import io
from skimage.color import rgba2rgb, rgb2xyz
from tqdm import tqdm
from dataclasses import dataclass
from math import floor, ceil
import random

# Train data generation
import collections
import csv
from pathlib import Path
from typing import List, Tuple, Any

import time
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import pickle

pd.set_option("display.max_columns", 100)

In [None]:
SEED = 42

In [18]:
dir_path = "../input/indoorpkl/"
train_file_name = dir_path + "indoor_train.pkl"
test_file_name = dir_path + "indoor_test.pkl"

# Try loading it back in
with open(train_file_name, "rb") as file:
    df_train = pickle.load(file)

with open(test_file_name, "rb") as file:
    df_test = pickle.load(file)

In [19]:
display(df_train.head())
display(df_test.head())

Unnamed: 0,site_id,file_id,floor_converted,floor,ts,x,y,start_ts,diff_start_ts,acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,magn_x,magn_y,magn_z,magn_acc,wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,diff_acce_ts_start_ts,diff_ahrs_ts_start_ts,diff_magn_ts_start_ts,diff_wifi_ts_start_ts,site_id_le,file_id_le,floor_converted_le,wifi_ssid_le,wifi_bssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
0,5cd56c0ce2acfd2d33b6ab27,5d09a625bd54340008acddb9,-1,B1,1560913000000.0,14.283729,20.392578,1560913000000.0,0.0,1560913000000.0,-0.210693,-0.304062,9.943115,,1560913000000.0,-0.012902,0.008711,-0.427844,,1560913000000.0,-21.72,17.76,-36.12,,1560913000000.0,bd56240b1064c9e8e62ec3b8b1825d1104c16dcc,51e058eb65d3e5b3838e8dba0f3006028d5fd864,-90.0,,1560913000000.0,186.0,186.0,186.0,530.0,0,3,0,22,19,2019-06-19 03:02:49.585999872,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:02:00,2019-06-19 03:02:43.913999872,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:02:00
1,5cd56c0ce2acfd2d33b6ab27,5d09a625bd54340008acddb9,-1,B1,1560913000000.0,21.157534,30.024122,1560913000000.0,9886.0,1560913000000.0,-0.718262,-0.418991,10.347733,,1560913000000.0,-0.006788,0.039279,-0.327645,,1560913000000.0,-19.26,18.9,-30.179998,,1560913000000.0,bd56240b1064c9e8e62ec3b8b1825d1104c16dcc,51e058eb65d3e5b3838e8dba0f3006028d5fd864,-84.0,,1560913000000.0,9895.0,9895.0,9895.0,9998.0,0,3,0,22,19,2019-06-19 03:02:59.472000000,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:02:00,2019-06-19 03:02:52.752999936,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:02:00
0,5cd56c0ce2acfd2d33b6ab27,5d09a625bd54340008acddb7,-1,B1,1560913000000.0,10.19571,21.657787,1560913000000.0,0.0,1560913000000.0,0.043091,0.27533,9.42836,,1560913000000.0,0.000129,0.000146,-0.75751,,1560913000000.0,-24.18,-5.16,-26.64,,1560913000000.0,bd56240b1064c9e8e62ec3b8b1825d1104c16dcc,51e058eb65d3e5b3838e8dba0f3006028d5fd864,-87.0,,1560913000000.0,192.0,192.0,192.0,502.0,0,2,0,22,19,2019-06-19 03:00:25.966000128,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:00:00,2019-06-19 03:00:22.655000064,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:00:00
1,5cd56c0ce2acfd2d33b6ab27,5d09a625bd54340008acddb7,-1,B1,1560913000000.0,14.283729,20.392578,1560913000000.0,5292.0,1560913000000.0,-0.541092,1.374268,10.240006,,1560913000000.0,0.078497,-0.037847,-0.717117,,1560913000000.0,-23.82,-6.9,-26.88,,1560913000000.0,bd56240b1064c9e8e62ec3b8b1825d1104c16dcc,51e058eb65d3e5b3838e8dba0f3006028d5fd864,-76.0,,1560913000000.0,5284.0,5284.0,5284.0,5531.0,0,2,0,22,19,2019-06-19 03:00:31.257999872,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:00:00,2019-06-19 03:00:31.092000000,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:00:00
2,5cd56c0ce2acfd2d33b6ab27,5d09a625bd54340008acddb7,-1,B1,1560913000000.0,21.089481,19.001072,1560913000000.0,13045.0,1560913000000.0,-0.131683,0.172379,10.309433,,1560913000000.0,0.017186,-0.002997,-0.696129,,1560913000000.0,-26.699999,1.86,-10.62,,1560913000000.0,bd56240b1064c9e8e62ec3b8b1825d1104c16dcc,51e058eb65d3e5b3838e8dba0f3006028d5fd864,-79.0,,1560913000000.0,13051.0,13051.0,13051.0,13216.0,0,2,0,22,19,2019-06-19 03:00:39.011000064,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:00:00,2019-06-19 03:00:33.140999936,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:00:00


Unnamed: 0,site_id,file_id,floor_converted,floor,ts,x,y,start_ts,diff_start_ts,acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,magn_x,magn_y,magn_z,magn_acc,wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,diff_acce_ts_start_ts,diff_ahrs_ts_start_ts,diff_magn_ts_start_ts,diff_wifi_ts_start_ts,site_id_le,file_id_le,floor_converted_le,wifi_ssid_le,wifi_bssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
0,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,9.0,,,0.0,9.0,136.0,0.798813,4.30072,7.810059,,136.0,0.247101,0.104201,0.474897,,136.0,30.561829,-1.228333,-38.301086,,2340.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45.0,,1578475000000.0,136.0,136.0,136.0,2340.0,0,0,0,108,264,1970-01-01 00:00:00.009,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:04.726000128,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
1,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,9017.0,,,0.0,9017.0,9012.0,-1.106979,4.056503,9.795456,,9012.0,0.162119,0.185954,0.561409,,9012.0,29.867554,-6.085205,-26.150513,,9508.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,-43.0,,1578475000000.0,9012.0,9012.0,9012.0,9508.0,0,0,0,108,35,1970-01-01 00:00:09.017,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:30.052000000,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
2,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,15326.0,,,0.0,15326.0,15326.0,-0.572464,3.981689,7.08223,,15326.0,0.22507,0.022647,-0.200452,,15326.0,-6.207275,14.727783,-39.649963,,14714.0,b6ffe5619e02871fcd04f61c9bb4b5c53a3f46b7,b26914599f6d9ba16b43975394e1eeb9d82f4bab,-41.0,,1578475000000.0,15326.0,15326.0,15326.0,14714.0,0,0,0,79,208,1970-01-01 00:00:15.326,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:38.027000064,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
3,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,18763.0,,,0.0,18763.0,18755.0,-0.751434,4.546112,10.231201,,18755.0,0.225055,0.044806,-0.119175,,18755.0,-1.350403,9.870911,-41.67633,,19587.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,de53ffe7e3c71c9ed5c845fa50e0521efa5f3685,-41.0,,1578475000000.0,18755.0,18755.0,18755.0,19587.0,0,0,0,108,249,1970-01-01 00:00:18.763,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:42.716999936,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
4,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,22328.0,,,0.0,22328.0,22326.0,-2.089798,4.224701,12.037628,,22326.0,0.242105,0.053464,-0.008162,,22326.0,-1.350403,5.014038,-30.87616,,22074.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,-42.0,,1578475000000.0,22326.0,22326.0,22326.0,22074.0,0,0,0,108,35,1970-01-01 00:00:22.328,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:34.860999936,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00


In [118]:
# metric
def comp_metric(xhat, yhat, fhat, x, y, f):
    intermediate = np.sqrt(np.power(xhat - x,2) + np.power(yhat-y,2)) + 15 * np.abs(fhat-f)
    return intermediate.sum() / xhat.shape[0]

In [135]:
# column settings
drop_cols = ["floor_converted", "floor", "x", "y", "floor_converted_le"]
categorical_features = ["site_id", "file_id", "wifi_ssid", "wifi_bssid"]
datetime_features = ["ts_date", "ts_day", "ts_hour", "ts_minute", \
                     "wifi_last_seen_ts_date", "wifi_last_seen_ts_day", \
                     "wifi_last_seen_ts_hour", "wifi_last_seen_ts_minute"]

# convert to category from object dtype
for col in categorical_features:
    df_train[col] = df_train[col].astype("category")
    
# convert to int from datetime64 dtype
for col in datetime_features:
    df_train[col] = df_train[col].astype(int)

# set features and labels
# print(df_train.info())

features = df_train.drop(columns=drop_cols)
target_x = df_train.iloc[:, 5]
target_y = df_train.iloc[:, 6]
target_f = df_train.iloc[:, 2]

targets = ["x", "y", "f"]
target_data = [target_x, target_y, target_f]

d = {}
for tgt, tgt_data in zip(targets, target_data):
    feat_train, feat_val, target_train, target_val = \
    train_test_split(features, tgt_data, test_size = 0.2, random_state = 42)
    d["feat_train_{}".format(tgt)] = feat_train
    d["feat_val_{}".format(tgt)] = feat_val
    d["target_train_{}".format(tgt)] = target_train
    d["target_val_{}".format(tgt)] = target_val
    d["train_{}".format(tgt)] = lgb.Dataset(data=feat_train, label=target_train, categorical_feature=categorical_features, free_raw_data=False).construct()
    d["val_{}".format(tgt)] = lgb.Dataset(data=feat_val, label=target_val, categorical_feature=categorical_features, free_raw_data=False).construct()
    print(d["feat_train_{}".format(tgt)].shape)
    print(d["feat_val_{}".format(tgt)].shape)
    print(d["target_train_{}".format(tgt)].shape)
    print(d["target_val_{}".format(tgt)].shape)
    print(d["train_{}".format(tgt)].get_data().shape)
    print(d["train_{}".format(tgt)].get_label().shape)
    print(d["val_{}".format(tgt)].get_data().shape)
    print(d["val_{}".format(tgt)].get_label().shape)

(428, 42)
(108, 42)
(428,)
(108,)
(428, 42)
(428,)
(108, 42)
(108,)
(428, 42)
(108, 42)
(428,)
(108,)
(428, 42)
(428,)
(108, 42)
(108,)
(428, 42)
(108, 42)
(428,)
(108,)
(428, 42)
(428,)
(108, 42)
(108,)


In [136]:
# lgb patams
lgb_params = {'objective': 'root_mean_squared_error',
              'boosting_type': 'gbdt',
              'n_estimators': 50000, # example had 50000
              'learning_rate': 0.1,
              'num_leaves': 90,
              'colsample_bytree': 0.4,
              'subsample': 0.6,
              'subsample_freq': 2,
              'bagging_seed': SEED,
              'reg_alpha': 8,
              'reg_lambda': 2,
              'random_state': SEED,
              'n_jobs': -1
              }

# train models
for tgt in targets:
    model = lgb.train(params=lgb_params,
                      train_set=d["train_{}".format(tgt)],
                      early_stopping_rounds=20,
                      valid_sets=d["val_{}".format(tgt)])
    d["model_{}".format(tgt)] = model
    d["pred_target_{}".format(tgt)] = model.predict(d["feat_val_{}".format(tgt)], 
                                                    num_iteration=model.best_iteration)

mse_x = mean_squared_error(d["target_val_x"], d["pred_target_x"])
mse_y = mean_squared_error(d["target_val_y"], d["pred_target_y"])
mse_f = mean_squared_error(d["target_val_f"], d["pred_target_f"])
rmse_x = np.sqrt(mse_x)
rmse_y = np.sqrt(mse_y)
rmse_f = np.sqrt(mse_f)
print("rmse_x:", rmse_x, "rmse_y:",rmse_y, "rmse_f:",rmse_f)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3706
[LightGBM] [Info] Number of data points in the train set: 428, number of used features: 38
[LightGBM] [Info] Start training from score 116.858280
[1]	valid_0's rmse: 55.4332
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 51.3066
[3]	valid_0's rmse: 47.4879
[4]	valid_0's rmse: 44.21
[5]	valid_0's rmse: 41.5416
[6]	valid_0's rmse: 39.372
[7]	valid_0's rmse: 37.061
[8]	valid_0's rmse: 35.1604
[9]	valid_0's rmse: 33.4974
[10]	valid_0's rmse: 32.1497
[11]	valid_0's rmse: 30.7701
[12]	valid_0's rmse: 29.6286
[13]	valid_0's rmse: 28.8998
[14]	valid_0's rmse: 28.406
[15]	valid_0's rmse: 27.74
[16]	valid_0's rmse: 27.3904
[17]	valid_0's rmse: 26.6512
[18]	valid_0's rmse: 26.1761
[19]	valid_0's rmse: 25.7124
[20]	valid_0's rmse: 25.4159
[21]	valid_0's rmse: 25.0268
[22]	valid_0's rmse: 24.577
[23]	valid_0's rmse: 24.3886
[24]	valid_0's rmse: 24.2069
[25]	valid_0's rmse: 2

[19]	valid_0's rmse: 17.6955
[20]	valid_0's rmse: 17.6128
[21]	valid_0's rmse: 17.3792
[22]	valid_0's rmse: 17.1698
[23]	valid_0's rmse: 17.0619
[24]	valid_0's rmse: 16.8977
[25]	valid_0's rmse: 16.8563
[26]	valid_0's rmse: 16.6203
[27]	valid_0's rmse: 16.5772
[28]	valid_0's rmse: 16.5505
[29]	valid_0's rmse: 16.4758
[30]	valid_0's rmse: 16.3397
[31]	valid_0's rmse: 16.1617
[32]	valid_0's rmse: 16.0312
[33]	valid_0's rmse: 15.967
[34]	valid_0's rmse: 15.9516
[35]	valid_0's rmse: 15.8725
[36]	valid_0's rmse: 15.8218
[37]	valid_0's rmse: 15.7578
[38]	valid_0's rmse: 15.6678
[39]	valid_0's rmse: 15.6346
[40]	valid_0's rmse: 15.5744
[41]	valid_0's rmse: 15.5816
[42]	valid_0's rmse: 15.5231
[43]	valid_0's rmse: 15.4635
[44]	valid_0's rmse: 15.3987
[45]	valid_0's rmse: 15.3713
[46]	valid_0's rmse: 15.276
[47]	valid_0's rmse: 15.2642
[48]	valid_0's rmse: 15.2774
[49]	valid_0's rmse: 15.2368
[50]	valid_0's rmse: 15.1918
[51]	valid_0's rmse: 15.1782
[52]	valid_0's rmse: 15.1193
[53]	valid_0's r

[214]	valid_0's rmse: 13.6566
[215]	valid_0's rmse: 13.6453
[216]	valid_0's rmse: 13.6311
[217]	valid_0's rmse: 13.6137
[218]	valid_0's rmse: 13.6295
[219]	valid_0's rmse: 13.6243
[220]	valid_0's rmse: 13.6529
[221]	valid_0's rmse: 13.6581
[222]	valid_0's rmse: 13.6661
[223]	valid_0's rmse: 13.6401
[224]	valid_0's rmse: 13.6376
Early stopping, best iteration is:
[204]	valid_0's rmse: 13.5874
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3706
[LightGBM] [Info] Number of data points in the train set: 428, number of used features: 38
[LightGBM] [Info] Start training from score 1.535047
[1]	valid_0's rmse: 1.45375
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 1.39517
[3]	valid_0's rmse: 1.34149
[4]	valid_0's rmse: 1.29484
[5]	valid_0's rmse: 1.24601
[6]	valid_0's rmse: 1.21846
[7]	valid_0's rmse: 1.16552
[8]	valid_0's rmse: 1.11904
[9]	valid_0's rmse: 1.0963
[10]	valid_0's rmse: 1.08354
[11]	valid_0's rmse: 1.04778

In [132]:
print(d.keys())

dict_keys(['feat_train_x', 'feat_val_x', 'target_train_x', 'target_val_x', 'train_x', 'val_x', 'feat_train_y', 'feat_val_y', 'target_train_y', 'target_val_y', 'train_y', 'val_y', 'feat_train_f', 'feat_val_f', 'target_train_f', 'target_val_f', 'train_f', 'val_f', 'model_x', 'pred_target_x', 'model_y', 'pred_target_y', 'model_f', 'pred_target_f'])


In [139]:
# print(len(d["target_val_x"]))
# print(d["target_val_x"].to_numpy())
# print(type(d["target_val_x"].to_numpy()))

# print(len(d["pred_target_x"]))
# print(d["pred_target_x"])
# print(type(d["pred_target_x"]))

# print(np.power(d["pred_target_x"] - d["target_val_x"].to_numpy(), 2))
# print(np.power(d["pred_target_y"] - d["target_val_y"].to_numpy(), 2))
a = np.sqrt(np.power(d["pred_target_x"] - d["target_val_x"].to_numpy(), 2) + np.power(d["pred_target_y"] - d["target_val_y"].to_numpy(), 2)) + 15 * np.abs(d["pred_target_f"] - d["target_val_f"].to_numpy())
print(a.sum() / d["pred_target_x"].shape[0])
print(d["pred_target_x"].shape[0])

comp_metric = comp_metric(d["pred_target_x"], d["pred_target_y"], d["pred_target_f"], 
                          d["target_val_x"].to_numpy(), d["target_val_y"].to_numpy(), d["target_val_f"].to_numpy())
print("comp metric: ", comp_metric)
# def comp_metric(xhat, yhat, fhat, x, y, f):
#     intermediate = np.sqrt(np.power(xhat - x,2) + np.power(yhat-y,2)) + 15 * np.abs(fhat-f)
#     return intermediate.sum()/xhat.shape[0]

24.78382305654363
108


TypeError: 'numpy.float64' object is not callable

In [None]:
# # Load test data
# df_test_pred = df_test.drop(columns=drop_cols)

# # Add "site_path_timestamp" column to df_test
# # convert ts to unix and then convert it to string, and put it in a new column["ts_unix"]
# # join ["site_id", "file_id", "ts_unix"] and put it into a new column ["site_path_timestamp"]
# # or just add it when generating the data -> less steps -> less likely to make a mistake

# test_preds_x = model_x.predict(df_test_pred)
# test_preds_y = model_y.predict(df_test_pred)
# test_preds_f = model_f.predict(df_test_pred)

# # stack the results up with the pairing site_path_timestamp
# # load ssubm
# # test_preds = pd.concat([test_preds_x, test_preds_y, test_preds_f], axis=1)
# test_preds.columns = ssubm.columns
# test_preds.index = df_test_pred["site_path_timestamp"]
# test_preds["floor"] = test_preds["floor"].astype(int)
# predictions.append(test_preds)

In [None]:
# generate prediction file 
# pd.concat(predictions)
# reindex the prediction df
# convert to csv for submission