In [61]:
import os
import json
import glob
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go

from PIL import Image, ImageOps
from skimage import io
from skimage.color import rgba2rgb, rgb2xyz
from tqdm import tqdm
from dataclasses import dataclass
from math import floor, ceil
import random

# Train data generation
import collections
import csv
from pathlib import Path
from typing import List, Tuple, Any

import time
import re
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import pickle

# Let wide frames render up to 100 columns before pandas truncates them.
pd.options.display.max_columns = 100

In [2]:
# Locations of the pre-built train/test pickles.
dir_path = "../input/indoorpkl/"
train_file_name = f"{dir_path}indoor_train.pkl"
test_file_name = f"{dir_path}indoor_test.pkl"

# Deserialize both objects; each handle is closed as soon as its content is read.
# NOTE: unpickling executes code embedded in the file, so only load trusted pickles.
with open(train_file_name, "rb") as train_fh:
    df_train = pickle.load(train_fh)
with open(test_file_name, "rb") as test_fh:
    df_test = pickle.load(test_fh)

In [3]:
# Preview the first rows of the train frame, then of the test frame.
display(df_train.head(), df_test.head())

Unnamed: 0,site_id,file_id,floor_converted,floor,ts,x,y,start_ts,diff_start_ts,acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,magn_x,magn_y,magn_z,magn_acc,wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,diff_acce_ts_start_ts,diff_ahrs_ts_start_ts,diff_magn_ts_start_ts,diff_wifi_ts_start_ts,site_id_le,file_id_le,floor_converted_le,wifi_ssid_le,wifi_bssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
0,5cd56c0ce2acfd2d33b6ab27,5d09a625bd54340008acddb9,-1,B1,1560913000000.0,14.283729,20.392578,1560913000000.0,0.0,1560913000000.0,-0.210693,-0.304062,9.943115,,1560913000000.0,-0.012902,0.008711,-0.427844,,1560913000000.0,-21.72,17.76,-36.12,,1560913000000.0,bd56240b1064c9e8e62ec3b8b1825d1104c16dcc,51e058eb65d3e5b3838e8dba0f3006028d5fd864,-90.0,,1560913000000.0,186.0,186.0,186.0,530.0,0,3,0,22,19,2019-06-19 03:02:49.585999872,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:02:00,2019-06-19 03:02:43.913999872,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:02:00
1,5cd56c0ce2acfd2d33b6ab27,5d09a625bd54340008acddb9,-1,B1,1560913000000.0,21.157534,30.024122,1560913000000.0,9886.0,1560913000000.0,-0.718262,-0.418991,10.347733,,1560913000000.0,-0.006788,0.039279,-0.327645,,1560913000000.0,-19.26,18.9,-30.179998,,1560913000000.0,bd56240b1064c9e8e62ec3b8b1825d1104c16dcc,51e058eb65d3e5b3838e8dba0f3006028d5fd864,-84.0,,1560913000000.0,9895.0,9895.0,9895.0,9998.0,0,3,0,22,19,2019-06-19 03:02:59.472000000,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:02:00,2019-06-19 03:02:52.752999936,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:02:00
0,5cd56c0ce2acfd2d33b6ab27,5d09a625bd54340008acddb7,-1,B1,1560913000000.0,10.19571,21.657787,1560913000000.0,0.0,1560913000000.0,0.043091,0.27533,9.42836,,1560913000000.0,0.000129,0.000146,-0.75751,,1560913000000.0,-24.18,-5.16,-26.64,,1560913000000.0,bd56240b1064c9e8e62ec3b8b1825d1104c16dcc,51e058eb65d3e5b3838e8dba0f3006028d5fd864,-87.0,,1560913000000.0,192.0,192.0,192.0,502.0,0,2,0,22,19,2019-06-19 03:00:25.966000128,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:00:00,2019-06-19 03:00:22.655000064,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:00:00
1,5cd56c0ce2acfd2d33b6ab27,5d09a625bd54340008acddb7,-1,B1,1560913000000.0,14.283729,20.392578,1560913000000.0,5292.0,1560913000000.0,-0.541092,1.374268,10.240006,,1560913000000.0,0.078497,-0.037847,-0.717117,,1560913000000.0,-23.82,-6.9,-26.88,,1560913000000.0,bd56240b1064c9e8e62ec3b8b1825d1104c16dcc,51e058eb65d3e5b3838e8dba0f3006028d5fd864,-76.0,,1560913000000.0,5284.0,5284.0,5284.0,5531.0,0,2,0,22,19,2019-06-19 03:00:31.257999872,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:00:00,2019-06-19 03:00:31.092000000,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:00:00
2,5cd56c0ce2acfd2d33b6ab27,5d09a625bd54340008acddb7,-1,B1,1560913000000.0,21.089481,19.001072,1560913000000.0,13045.0,1560913000000.0,-0.131683,0.172379,10.309433,,1560913000000.0,0.017186,-0.002997,-0.696129,,1560913000000.0,-26.699999,1.86,-10.62,,1560913000000.0,bd56240b1064c9e8e62ec3b8b1825d1104c16dcc,51e058eb65d3e5b3838e8dba0f3006028d5fd864,-79.0,,1560913000000.0,13051.0,13051.0,13051.0,13216.0,0,2,0,22,19,2019-06-19 03:00:39.011000064,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:00:00,2019-06-19 03:00:33.140999936,2019-06-19,2019-06-19 03:00:00,2019-06-19 03:00:00


Unnamed: 0,site_id,file_id,floor_converted,floor,ts,x,y,start_ts,diff_start_ts,acce_ts,acce_x,acce_y,acce_z,acce_acc,ahrs_ts,ahrs_x,ahrs_y,ahrs_z,ahrs_acc,magn_ts,magn_x,magn_y,magn_z,magn_acc,wifi_ts,wifi_ssid,wifi_bssid,wifi_rssi,wifi_freq,wifi_last_seen_ts,diff_acce_ts_start_ts,diff_ahrs_ts_start_ts,diff_magn_ts_start_ts,diff_wifi_ts_start_ts,site_id_le,file_id_le,floor_converted_le,wifi_ssid_le,wifi_bssid_le,ts_date,ts_day,ts_hour,ts_minute,wifi_last_seen_ts_date,wifi_last_seen_ts_day,wifi_last_seen_ts_hour,wifi_last_seen_ts_minute
0,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,9.0,,,0.0,9.0,136.0,0.798813,4.30072,7.810059,,136.0,0.247101,0.104201,0.474897,,136.0,30.561829,-1.228333,-38.301086,,2340.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45.0,,1578475000000.0,136.0,136.0,136.0,2340.0,0,0,0,108,264,1970-01-01 00:00:00.009,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:04.726000128,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
1,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,9017.0,,,0.0,9017.0,9012.0,-1.106979,4.056503,9.795456,,9012.0,0.162119,0.185954,0.561409,,9012.0,29.867554,-6.085205,-26.150513,,9508.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,-43.0,,1578475000000.0,9012.0,9012.0,9012.0,9508.0,0,0,0,108,35,1970-01-01 00:00:09.017,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:30.052000000,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
2,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,15326.0,,,0.0,15326.0,15326.0,-0.572464,3.981689,7.08223,,15326.0,0.22507,0.022647,-0.200452,,15326.0,-6.207275,14.727783,-39.649963,,14714.0,b6ffe5619e02871fcd04f61c9bb4b5c53a3f46b7,b26914599f6d9ba16b43975394e1eeb9d82f4bab,-41.0,,1578475000000.0,15326.0,15326.0,15326.0,14714.0,0,0,0,79,208,1970-01-01 00:00:15.326,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:38.027000064,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
3,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,18763.0,,,0.0,18763.0,18755.0,-0.751434,4.546112,10.231201,,18755.0,0.225055,0.044806,-0.119175,,18755.0,-1.350403,9.870911,-41.67633,,19587.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,de53ffe7e3c71c9ed5c845fa50e0521efa5f3685,-41.0,,1578475000000.0,18755.0,18755.0,18755.0,19587.0,0,0,0,108,249,1970-01-01 00:00:18.763,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:42.716999936,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00
4,5a0546857ecc773753327266,046cfa46be49fc10834815c6,,,22328.0,,,0.0,22328.0,22326.0,-2.089798,4.224701,12.037628,,22326.0,0.242105,0.053464,-0.008162,,22326.0,-1.350403,5.014038,-30.87616,,22074.0,da39a3ee5e6b4b0d3255bfef95601890afd80709,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,-42.0,,1578475000000.0,22326.0,22326.0,22326.0,22074.0,0,0,0,108,35,1970-01-01 00:00:22.328,1970-01-01,1970-01-01,1970-01-01,2020-01-08 09:09:34.860999936,2020-01-08,2020-01-08 09:00:00,2020-01-08 09:09:00


In [4]:
# metric
def comp_metric(xhat, yhat, fhat, x, y, f):
    """Return the mean per-sample positioning error.

    For every sample the error is the Euclidean distance between the predicted
    and true (x, y) position plus a penalty of 15 for each unit of absolute
    floor difference. The per-sample errors are summed and divided by the
    number of predictions (length of the first axis of ``xhat``).

    Args:
        xhat, yhat, fhat: Predicted x, y and floor values (array-like or scalar).
        x, y, f: Ground-truth x, y and floor values, same length/order as the
            predictions.

    Returns:
        The mean error as a float. Empty inputs yield ``nan`` (0 / 0).

    Notes:
        All inputs are converted to float ndarrays first, so plain lists and
        scalars are accepted, and pandas Series are compared positionally
        instead of being aligned on their index labels (label alignment would
        inject NaNs when predictions and targets carry different indexes).
    """
    xhat, yhat, fhat, x, y, f = (
        np.atleast_1d(np.asarray(values, dtype=float))
        for values in (xhat, yhat, fhat, x, y, f)
    )
    horizontal_error = np.sqrt(np.power(xhat - x, 2) + np.power(yhat - y, 2))
    floor_penalty = 15 * np.abs(fhat - f)
    intermediate = horizontal_error + floor_penalty
    return intermediate.sum() / xhat.shape[0]

In [59]:
# column settings
# Target/label columns that must not be fed to the model as features.
drop_cols = ["floor_converted", "floor", "x", "y", "floor_converted_le"]
categorical_features = ["site_id", "file_id", "wifi_ssid", "wifi_bssid"]
datetime_features = [
    "ts_date", "ts_day", "ts_hour", "ts_minute",
    "wifi_last_seen_ts_date", "wifi_last_seen_ts_day",
    "wifi_last_seen_ts_hour", "wifi_last_seen_ts_minute",
]

# convert to category from object dtype
for col in categorical_features:
    df_train[col] = df_train[col].astype("category")

# convert to int from datetime64 dtype
# "int64" is spelled out because plain `int` can resolve to a 32-bit integer on
# some platforms, which cannot hold the converted timestamp values.
for col in datetime_features:
    df_train[col] = df_train[col].astype("int64")

# set features and labels
features = df_train.drop(columns=drop_cols)
# Select the target by label instead of by position so that adding or
# reordering columns upstream cannot silently change what is being predicted.
target_x = df_train["x"]

# NOTE(review): this is a random row-level split; rows sharing a file_id can land
# in both train and validation. Consider a grouped split if that matters — TODO confirm.
feat_train, feat_val, target_train_x, target_val_x = train_test_split(
    features, target_x, test_size=0.3, random_state=42
)

# Prep Dataset
train_x = lgb.Dataset(
    data=feat_train,
    label=target_train_x,
    categorical_feature=categorical_features,
    free_raw_data=False,
).construct()
# `reference=train_x` makes the validation set reuse the training set's feature
# binning, as LightGBM expects for validation data.
val_x = lgb.Dataset(
    data=feat_val,
    label=target_val_x,
    reference=train_x,
    categorical_feature=categorical_features,
    free_raw_data=False,
).construct()


# deal with later: y and f
# target_y = df_train.iloc[:, 6]
# target_f = df_train.iloc[:, 2]
# train_y = lgb.Dataset(data=features, label=target_y, 
#                       categorical_feature=categorical_features, free_raw_data=False)
# train_f = lgb.Dataset(data=features, label=target_f, 
#                       categorical_feature=categorical_features, free_raw_data=False)

# Check data
# train_x.get_data()
# train_x.get_label()
# val_x.get_data()
# val_x.get_label()

In [67]:
# print(len(feat))
# print(len(target_x))
# print(len(target_y))
# print(len(target_f))
# display(feat.head())
# display(target_x.head())
# display(target_y.head())
# display(target_f.head())

# lgbm_params = {
#     'objective': 'regression',
#     'metric': 'rmse',
# }
SEED = 42

# Hyper-parameters for the x-coordinate regressor. Several keys use the
# scikit-learn style aliases (n_estimators, subsample, colsample_bytree, ...),
# which LightGBM accepts as aliases of its native parameter names.
lgb_params = {'objective': 'root_mean_squared_error',
              'boosting_type': 'gbdt',
              'n_estimators': 50000, # example had 50000
              'learning_rate': 0.1,
              'num_leaves': 90,
              'colsample_bytree': 0.4,
              'subsample': 0.6,
              'subsample_freq': 2,
              'bagging_seed': SEED,
              'reg_alpha': 8,
              'reg_lambda': 2,
              'random_state': SEED,
              'n_jobs': -1
              }

# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of `lgb.train` was removed in LightGBM 4.0,
# while the callback works on both older and newer releases.
# Training stops once the validation metric has not improved for 20 rounds.
model_x = lgb.train(
    params=lgb_params,
    train_set=train_x,
    valid_sets=[val_x],
    callbacks=[lgb.early_stopping(stopping_rounds=20)],
)

# Score the hold-out split using the best iteration found by early stopping.
pred_target_x = model_x.predict(feat_val, num_iteration=model_x.best_iteration)
mse = mean_squared_error(target_val_x, pred_target_x)
rmse = np.sqrt(mse)
print(rmse)
# model_x = lgb.train(lgb_params, train_set=train_data)

# model_y = lgb.LGBMRegressor(n_estimators=125, num_leaves=90)
# model_y.fit(feat, target_y)

# model_f = lgb.LGBMRegressor(n_estimators=125, num_leaves=90)
# model_f.fit(feat, target_f)

# print(model_x)
# print(model_y)
# print(model_f)

# # Load test data
# df_test_pred = df_test.drop(columns=drop_cols)

# # Add "site_path_timestamp" column to df_test
# # convert ts to unix and then convert it to string, and put it in a new column["ts_unix"]
# # join ["site_id", "file_id", "ts_unix"] and put it into a new column ["site_path_timestamp"]
# # or just add it when generating the data -> less steps -> less likely to make a mistake

# test_preds_x = model_x.predict(df_test_pred)
# test_preds_y = model_y.predict(df_test_pred)
# test_preds_f = model_f.predict(df_test_pred)

# # stack the results up with the pairing site_path_timestamp
# # load ssubm
# # test_preds = pd.concat([test_preds_x, test_preds_y, test_preds_f], axis=1)
# test_preds.columns = ssubm.columns
# test_preds.index = df_test_pred["site_path_timestamp"]
# test_preds["floor"] = test_preds["floor"].astype(int)
# predictions.append(test_preds)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3284
[LightGBM] [Info] Number of data points in the train set: 375, number of used features: 38
[LightGBM] [Info] Start training from score 115.865571
[1]	valid_0's rmse: 57.5231
Training until validation scores don't improve for 20 rounds
[2]	valid_0's rmse: 54.2823
[3]	valid_0's rmse: 50.7398
[4]	valid_0's rmse: 47.6527
[5]	valid_0's rmse: 44.9611
[6]	valid_0's rmse: 42.7604
[7]	valid_0's rmse: 40.6633
[8]	valid_0's rmse: 38.7437
[9]	valid_0's rmse: 37.1224
[10]	valid_0's rmse: 35.8056
[11]	valid_0's rmse: 34.32
[12]	valid_0's rmse: 33.0653
[13]	valid_0's rmse: 32.3622
[14]	valid_0's rmse: 31.5925
[15]	valid_0's rmse: 30.5521
[16]	valid_0's rmse: 29.9643
[17]	valid_0's rmse: 29.2757
[18]	valid_0's rmse: 28.8648
[19]	valid_0's rmse: 28.2882
[20]	valid_0's rmse: 28.0716
[21]	valid_0's rmse: 27.5938
[22]	valid_0's rmse: 27.3172
[23]	valid_0's rmse: 27.1964
[24]	valid_0's rmse: 26.6339
[25]	valid_0's r

[195]	valid_0's rmse: 21.129
[196]	valid_0's rmse: 21.1159
[197]	valid_0's rmse: 21.1222
[198]	valid_0's rmse: 21.0756
[199]	valid_0's rmse: 21.1114
[200]	valid_0's rmse: 21.1403
[201]	valid_0's rmse: 21.1455
[202]	valid_0's rmse: 21.132
[203]	valid_0's rmse: 21.0735
[204]	valid_0's rmse: 21.0204
[205]	valid_0's rmse: 20.9976
[206]	valid_0's rmse: 20.987
[207]	valid_0's rmse: 20.951
[208]	valid_0's rmse: 20.9369
[209]	valid_0's rmse: 20.9639
[210]	valid_0's rmse: 20.9982
[211]	valid_0's rmse: 20.9632
[212]	valid_0's rmse: 20.9551
[213]	valid_0's rmse: 20.9645
[214]	valid_0's rmse: 20.9743
[215]	valid_0's rmse: 20.9411
[216]	valid_0's rmse: 20.9422
[217]	valid_0's rmse: 20.8909
[218]	valid_0's rmse: 20.8631
[219]	valid_0's rmse: 20.8647
[220]	valid_0's rmse: 20.8773
[221]	valid_0's rmse: 20.8712
[222]	valid_0's rmse: 20.8335
[223]	valid_0's rmse: 20.8232
[224]	valid_0's rmse: 20.8081
[225]	valid_0's rmse: 20.8104
[226]	valid_0's rmse: 20.7978
[227]	valid_0's rmse: 20.8011
[228]	valid_0'

In [None]:
# generate prediction file 
# pd.concat(predictions)
# reindex the prediction df
# convert to csv for submission