# LSTM baseline

from kuto

In [1]:
import os
import sys
import glob
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as stats
from pathlib import Path

from timm.models.densenet import densenet121
from functools import partial


sys.path.append('../../')
import src.utils as utils
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import pytorch_lightning as pl
# from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping

import wandb
from pytorch_lightning.loggers import WandbLogger


In [2]:
DATA_DIR = Path("/home/knikaido/work/Indoor-Location-Navigation/data/")
WIFI_DIR = DATA_DIR / 'indoorunifiedwifids'
MLFLOW_DIR = DATA_DIR / 'mlflow/mlruns'
OUTPUT_DIR = Path('./output/')
MLFLOW_DIR = DATA_DIR / 'mlflow/mlruns'
IMG_DIR = DATA_DIR / f"indoor-location-navigation-img/metadata/"

## config

In [3]:
configs = {
    'loss':{
        'name': 'MSELoss',
        'params':{}
    },
    'optimizer':{
        'name': 'Adam',
        'params':{
            'lr': 0.001,
        }
    },

    'scheduler':{
        'name': 'ReduceLROnPlateau',
        'params':{
            'factor': 0.1,
            'patience': 3,
        }
    },

    'loader':{
        'train':{
            'batch_size': 16,
            'shuffle': True,
            'num_workers': 1,
        },
        'valid':{
            'batch_size': 4,
            'shuffle': False,
            'num_workers': 1,
        },
        'test':{
            'batch_size': 8,
            'shuffle': False,
            'num_workers': 1,
        }
    }
}

In [4]:
# config
config = configs

# globals variable
SEED = 777
MAX_EPOCHS = 200
N_SPLITS = 5
DEBUG = False
# EXP_MESSAGE = config['globals']['exp_message']

EXP_NAME = 17
IS_SAVE = True

utils.set_seed(SEED)

In [5]:
!wandb login e8aaf98060af90035c3c28a83b34452780aeec20

/bin/sh: 1: wandb: not found


## read data

In [6]:
train_df = pd.read_csv(WIFI_DIR / 'train_all.csv')
test_df = pd.read_csv(WIFI_DIR / 'test_all.csv')

simple_accurate_99 = pd.read_csv('../01/submission.csv')
test_df['floor'] = simple_accurate_99['floor'].values

In [7]:
sub = pd.read_csv(DATA_DIR/'indoor-location-navigation/sample_submission.csv', index_col=0)

BSSIDとRSSIは100ずつ存在しているけど全てが必要なわけではないみたい  
ここでは20だけ取り出している。

In [8]:
test_df

Unnamed: 0,bssid_0,bssid_1,bssid_2,bssid_3,bssid_4,bssid_5,bssid_6,bssid_7,bssid_8,bssid_9,...,rssi_93,rssi_94,rssi_95,rssi_96,rssi_97,rssi_98,rssi_99,site_path_timestamp,site_id,floor
0,eebf5db207eec2f3e041f92153d789270f346821,323607d8444900d64151ee06d164738ac727bbce,7805f319f3f591986effe78c5b41143180278f2d,02a1be3a5dab38320f879489d8a1e0f2a72768b3,b26914599f6d9ba16b43975394e1eeb9d82f4bab,6bc91b3951089c3a225396608b138ca178479924,d84cce12fbfba61bf930123050f61a11e2a29310,5b225e187d0dec3110683a74d0c9a5a4cb2022f5,b2546cae6e588d38618eacc557dd0385812197cf,8464ea586ee5479e1250f938d7c01e9bc68cefe8,...,-60,-61,-61,-61,-61,-61,-61,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,0
1,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,bd9bc0a2092c040bfe6ba12f8aafac24e83b312a,d771612396c3e2e557e986fafd9fc2c56a99d3cd,13b7aeaf441f2161481481fe67eace721cff07ab,c48db7f3ed1858bb4fc191230e3d79d5eb178604,b4dbb0b30caa1d0f21b7b4185ba061556cada67a,b2546cae6e588d38618eacc557dd0385812197cf,d84cce12fbfba61bf930123050f61a11e2a29310,5b225e187d0dec3110683a74d0c9a5a4cb2022f5,6bc91b3951089c3a225396608b138ca178479924,...,-61,-61,-61,-61,-61,-61,-62,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,0
2,6bc91b3951089c3a225396608b138ca178479924,b26914599f6d9ba16b43975394e1eeb9d82f4bab,b2546cae6e588d38618eacc557dd0385812197cf,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,d84cce12fbfba61bf930123050f61a11e2a29310,5b225e187d0dec3110683a74d0c9a5a4cb2022f5,7805f319f3f591986effe78c5b41143180278f2d,b4dbb0b30caa1d0f21b7b4185ba061556cada67a,bd9bc0a2092c040bfe6ba12f8aafac24e83b312a,d771612396c3e2e557e986fafd9fc2c56a99d3cd,...,-61,-61,-61,-61,-62,-62,-62,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,0
3,de53ffe7e3c71c9ed5c845fa50e0521efa5f3685,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,bccd6a9054f8649ad43fe96b766687fb769b064f,f64c13fd10a07bca1bf2b7bd7a80630632ce62c9,7590cf109f6ff3277fd18d10f4727a8777d675ce,7129f110688db020946105b359cae2e59338135b,15d53b7189ffbd7c6010c388a9ccea417d4f28ee,6915ad24a2edf8047f749233e19e9853f5dc17fd,12911a64fecf13f2e9fb0aaed554621e3b0bacde,a929157f3cc32a433b02ad7d7876e9a1678d3944,...,-60,-60,-60,-61,-61,-61,-61,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,0
4,1d1d62dcf72481cc9580fed3b724f0d27015aaf1,a929157f3cc32a433b02ad7d7876e9a1678d3944,6bc91b3951089c3a225396608b138ca178479924,000840e5c600de293cea57f13326f273c86c3988,662791f44cd61d0426634cf093bf0ff1bfd88c2c,c729e2e4f5a2888583cfebcd98b3178023f58b8e,15d53b7189ffbd7c6010c388a9ccea417d4f28ee,f64c13fd10a07bca1bf2b7bd7a80630632ce62c9,d5dad1fcdae9e773ede884b3b4d781d5ee1ec90e,6915ad24a2edf8047f749233e19e9853f5dc17fd,...,-63,-63,-63,-63,-63,-63,-64,5a0546857ecc773753327266_046cfa46be49fc1083481...,5a0546857ecc773753327266,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10128,621fbeab0ad7fa0465f0b82c3b32361a3a848a5d,1a766e046f186e7a1dfb26e43df7be10237a1bdb,25f148dc754c40e6ca5022831c2e2eda132e2e39,3df5a390b1357c32f1c24fdef1c00848ecfdb966,a3a775d6fac3ee115b82a5e7bd443676e1270a4b,979bfb4594c7f3943bfb5e8a41fe5912feb882f4,04ffe806c310e9d0ff298524f2d77e7731957f07,bf9a90571459e1d511b9bec5544852ce238e9bc6,f46c9fb38056b81582c026df6cc2237ae8080cb6,1496306fe32a577a77e92d56995dea87bcea724a,...,-83,-83,-83,-84,-84,-84,-84,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,5
10129,3df5a390b1357c32f1c24fdef1c00848ecfdb966,621fbeab0ad7fa0465f0b82c3b32361a3a848a5d,1a766e046f186e7a1dfb26e43df7be10237a1bdb,25f148dc754c40e6ca5022831c2e2eda132e2e39,a3a775d6fac3ee115b82a5e7bd443676e1270a4b,979bfb4594c7f3943bfb5e8a41fe5912feb882f4,1496306fe32a577a77e92d56995dea87bcea724a,6c63e704ac41957efd2b0216959ad5f61604433a,1633d5d4624672046db8521a732dfc8c4496f24d,fc25bb6349545b6873b02564154f7a494d26ae64,...,-85,-85,-85,-85,-85,-85,-85,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,5
10130,3df5a390b1357c32f1c24fdef1c00848ecfdb966,195eb3cc3f2b34a578f0df6082e204b8c333537b,fc25bb6349545b6873b02564154f7a494d26ae64,bf9a90571459e1d511b9bec5544852ce238e9bc6,a0036e6cdc360539e1ab615a186d33de31081e30,f46c9fb38056b81582c026df6cc2237ae8080cb6,94f82b58643e3dfcf699031634e78ba32fedb1da,5f8e96db3b02980da615fb031ff1e2170f71d8cf,1496306fe32a577a77e92d56995dea87bcea724a,979bfb4594c7f3943bfb5e8a41fe5912feb882f4,...,-83,-83,-84,-84,-84,-84,-84,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,5
10131,195eb3cc3f2b34a578f0df6082e204b8c333537b,3df5a390b1357c32f1c24fdef1c00848ecfdb966,913b8d51f071b4958d4240299e516af3273a8c20,baf415ae85f3997ffb2ad0797952dbbb4832f378,5f8e96db3b02980da615fb031ff1e2170f71d8cf,fc25bb6349545b6873b02564154f7a494d26ae64,bf9a90571459e1d511b9bec5544852ce238e9bc6,f46c9fb38056b81582c026df6cc2237ae8080cb6,a3a775d6fac3ee115b82a5e7bd443676e1270a4b,688ac5d49a12eaa598b582d40302bc6758835b43,...,-80,-81,-81,-81,-81,-82,-82,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5dc8cea7659e181adb076a3f,5


In [9]:
# training target features
NUM_FEATS = 80
BSSID_FEATS = [f'bssid_{i}' for i in range(NUM_FEATS)]
RSSI_FEATS  = [f'rssi_{i}' for i in range(NUM_FEATS)]

In [10]:
train_df.iloc[:, 100:110]

Unnamed: 0,rssi_0,rssi_1,rssi_2,rssi_3,rssi_4,rssi_5,rssi_6,rssi_7,rssi_8,rssi_9
0,-32,-39,-47,-48,-48,-49,-51,-52,-54,-56
1,-29,-34,-47,-48,-48,-49,-52,-52,-52,-53
2,-33,-39,-48,-48,-49,-52,-54,-55,-55,-55
3,-46,-48,-49,-50,-51,-52,-54,-56,-57,-57
4,-42,-49,-51,-51,-52,-53,-54,-55,-55,-55
...,...,...,...,...,...,...,...,...,...,...
258120,-53,-63,-64,-66,-68,-68,-68,-68,-70,-71
258121,-58,-64,-66,-67,-68,-68,-69,-70,-71,-71
258122,-57,-58,-60,-64,-66,-67,-68,-69,-71,-73
258123,-58,-64,-66,-66,-68,-69,-69,-71,-71,-72


bssid_NはN個目のBSSIDを示しておりRSSI値が大きい順に番号が振られている。
100個しかない


In [11]:
# get numbers of bssids to embed them in a layer

# train
wifi_bssids = []
# bssidを列ごとにリストに入れていく
for i in range(100):
    wifi_bssids.extend(train_df.iloc[:,i].values.tolist())
wifi_bssids = list(set(wifi_bssids))

train_wifi_bssids_size = len(wifi_bssids)
print(f'BSSID TYPES(train): {train_wifi_bssids_size}')

# test
wifi_bssids_test = []
for i in range(100):
    wifi_bssids_test.extend(test_df.iloc[:,i].values.tolist())
wifi_bssids_test = list(set(wifi_bssids_test))

test_wifi_bssids_size = len(wifi_bssids_test)
print(f'BSSID TYPES(test): {test_wifi_bssids_size}')


wifi_bssids.extend(wifi_bssids_test)
wifi_bssids_size = len(wifi_bssids)
print(f'BSSID TYPES(all): {wifi_bssids_size}')


BSSID TYPES(train): 61206
BSSID TYPES(test): 33042
BSSID TYPES(all): 94248


## preprocessing

In [12]:
# preprocess

le = LabelEncoder()
le.fit(wifi_bssids)
le_site = LabelEncoder()
le_site.fit(train_df['site_id'])

ss = StandardScaler()
ss.fit(train_df.loc[:,RSSI_FEATS])


def preprocess(input_df, le=le, le_site=le_site, ss=ss):
    output_df = input_df.copy()
    # RSSIの正規化
    output_df.loc[:,RSSI_FEATS] = ss.transform(input_df.loc[:,RSSI_FEATS])

    # BSSIDのLE(1からふる)
    for i in BSSID_FEATS:
        output_df.loc[:,i] = le.transform(input_df.loc[:,i])
#         output_df.loc[:,i] = output_df.loc[:,i] + 1  # 0からではなく1から番号を振りたいため なぜ？

    # site_idのLE
    output_df.loc[:, 'site_id'] = le_site.transform(input_df.loc[:, 'site_id'])
    output_df['site_id_str'] = input_df['site_id']

    # なぜ２重でやる？
#     output_df.loc[:,RSSI_FEATS] = ss.transform(output_df.loc[:,RSSI_FEATS])
    return output_df

train = preprocess(train_df)
test = preprocess(test_df)

train  

  return self.partial_fit(X, y)
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


Unnamed: 0,bssid_0,bssid_1,bssid_2,bssid_3,bssid_4,bssid_5,bssid_6,bssid_7,bssid_8,bssid_9,...,rssi_96,rssi_97,rssi_98,rssi_99,x,y,floor,path,site_id,site_id_str
0,52392,35870,2764,34897,52709,35259,42719,33509,23416,15248,...,-79,-79,-79,-79,107.85044,161.892620,-1,5e1580adf4c3420006d520d4,0,5a0546857ecc773753327266
1,35870,52392,7486,34897,52709,35259,21970,15248,17024,5350,...,-79,-79,-80,-80,107.85044,161.892620,-1,5e1580adf4c3420006d520d4,0,5a0546857ecc773753327266
2,35870,52392,52709,34897,35259,23416,49407,6672,7486,48500,...,-78,-78,-78,-78,98.33065,163.343340,-1,5e1580adf4c3420006d520d4,0,5a0546857ecc773753327266
3,23416,34897,35259,52392,35870,3706,49407,15612,10166,4977,...,-76,-76,-77,-77,98.33065,163.343340,-1,5e1580adf4c3420006d520d4,0,5a0546857ecc773753327266
4,35870,35259,23416,19472,52392,3706,49407,18305,21409,52794,...,-76,-76,-77,-77,98.33065,163.343340,-1,5e1580adf4c3420006d520d4,0,5a0546857ecc773753327266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258120,35065,14545,16494,21326,50465,22830,943,30429,43802,32684,...,-85,-85,-85,-85,122.68994,124.028015,6,5dcd5c88a4dbe7000630b084,23,5dc8cea7659e181adb076a3f
258121,35065,16494,21326,943,50465,22830,30059,33363,59581,14545,...,-85,-85,-85,-85,127.17589,123.677780,6,5dcd5c88a4dbe7000630b084,23,5dc8cea7659e181adb076a3f
258122,943,35065,14545,16494,21326,22830,50465,30059,48476,59581,...,-84,-85,-85,-85,127.17589,123.677780,6,5dcd5c88a4dbe7000630b084,23,5dc8cea7659e181adb076a3f
258123,14545,16494,35065,21326,50465,943,30059,48476,58803,22830,...,-85,-85,-85,-85,127.17589,123.677780,6,5dcd5c88a4dbe7000630b084,23,5dc8cea7659e181adb076a3f


In [13]:
site_count = len(train['site_id'].unique())
site_count

24

## PyTorch model
- embedding layerが重要  

In [14]:
# dataset
from torch.utils.data import Dataset, DataLoader
class IndoorDataset(Dataset):
    def __init__(self, df, phase='train'):
        self.df = df
        self.phase = phase
        self.bssid_feats = df[BSSID_FEATS].values.astype(int)
        self.rssi_feats = df[RSSI_FEATS].values.astype(np.float32)
        self.site_id = df['site_id'].values.astype(int)
        self.site_id_str = df['site_id_str'].values

        if phase in ['train', 'valid']:
            self.xy = df[['x', 'y']].values.astype(np.float32)
        self.floor = df['floor'].values.astype(np.float32)
        
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        
        img = np.load(IMG_DIR / f'{self.site_id_str[idx]}/{self.floor[idx]}/floor_image.png.npy') / 255.0
        img = img.transpose(2, 0, 1).astype(np.float32)
        
        feature = {
            'BSSID_FEATS':self.bssid_feats[idx],
            'RSSI_FEATS':self.rssi_feats[idx],
            'site_id':self.site_id[idx],
            'img': img
        }
        if self.phase in ['train', 'valid']:
            target = {
                'xy':self.xy[idx],
                'floor':self.floor[idx]
            }
        else:
            target = {}
        return feature, target

In [15]:
import torch
from torch import nn

class LSTMModel(nn.Module):
    def __init__(self, bssid_size=94248, site_size=24, embedding_dim=64):
        super(LSTMModel, self).__init__()
        
        # bssid
        # ->64次元に圧縮後sequence化にする
        # wifi_bssids_sizeが辞書の数を表す
        self.bssid_embedding = nn.Embedding(bssid_size, 64, max_norm=True)
        # site
        # ->2次元に圧縮後sequence化する
        # site_countが辞書の数を表す       
        self.site_embedding = nn.Embedding(site_size, 64, max_norm=True)

        # rssi
        # 次元を64倍に線形変換
        self.rssi = nn.Sequential(
            nn.BatchNorm1d(NUM_FEATS),
            nn.Linear(NUM_FEATS, NUM_FEATS * 64)
        )
        
        self.res = torch.hub.load('huawei-noah/ghostnet', 'ghostnet_1x', pretrained=True)
        
        concat_size = 64 + (NUM_FEATS * 64) + (NUM_FEATS * 64) + 1000
        self.linear_layer2 = nn.Sequential(
            nn.BatchNorm1d(concat_size),
            nn.Dropout(0.3),
            nn.Linear(concat_size, 256),
            nn.ReLU()
        )
        self.bn1 = nn.BatchNorm1d(concat_size)

        self.flatten = nn.Flatten()

        self.dropout1 = nn.Dropout(0.3)
        self.linear1 = nn.Linear(in_features=concat_size, out_features=256)#, bias=False)
        self.bn2 = nn.BatchNorm1d(256)

        self.batch_norm1 = nn.BatchNorm1d(1)
        self.lstm1 = nn.LSTM(input_size=256,hidden_size=128,dropout=0.3, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=128,hidden_size=16,dropout=0.1, batch_first=True)

        self.fc_xy = nn.Linear(16, 2)
        # self.fc_x = nn.Linear(16, 1)
        # self.fc_y = nn.Linear(16, 1)
        self.fc_floor = nn.Linear(16, 1)

    
    def forward(self, x):
        # input embedding
        batch_size = x["site_id"].shape[0]
        x_bssid = self.bssid_embedding(x['BSSID_FEATS'])
        x_bssid = self.flatten(x_bssid)
        
        x_site_id = self.site_embedding(x['site_id'])
        x_site_id = self.flatten(x_site_id)

        x_rssi = self.rssi(x['RSSI_FEATS'])
        
        x_img = self.res(x['img'])

        x = torch.cat([x_bssid, x_site_id, x_rssi, x_img], dim=1)
        x = self.linear_layer2(x)

        # lstm layer
        x = x.view(batch_size, 1, -1)  # [batch, 1]->[batch, 1, 1]
        x = self.batch_norm1(x)
        x, _ = self.lstm1(x)
        x = torch.relu(x)
        x, _ = self.lstm2(x)
        x = torch.relu(x)

        # output [batch, 1, 1] -> [batch]
        # x_ = self.fc_x(x).view(-1)
        # y_ = self.fc_y(x).view(-1)
        xy = self.fc_xy(x).squeeze(1)
        floor = torch.relu(self.fc_floor(x)).view(-1)
        # return {"x":x_, "y":y_, "floor":floor} 
        return {"xy": xy, "floor": floor}

In [16]:
def mean_position_error(xhat, yhat, fhat, x, y, f):
    intermediate = np.sqrt(np.power(xhat-x, 2) + np.power(yhat-y, 2)) + 15 * np.abs(fhat-f)
    return intermediate.sum()/xhat.shape[0]

def to_np(input):
    return input.detach().cpu().numpy()

In [17]:
def get_optimizer(model: nn.Module, config: dict):
    optimizer_config = config["optimizer"]
    optimizer_name = optimizer_config.get("name")
    base_optimizer_name = optimizer_config.get("base_name")
    optimizer_params = optimizer_config['params']

    if hasattr(optim, optimizer_name):
        optimizer = optim.__getattribute__(optimizer_name)(model.parameters(), **optimizer_params)
        return optimizer
    else:
        base_optimizer = optim.__getattribute__(base_optimizer_name)
        optimizer = globals().get(optimizer_name)(
            model.parameters(), 
            base_optimizer,
            **optimizer_config["params"])
        return  optimizer

def get_scheduler(optimizer, config: dict):
    scheduler_config = config["scheduler"]
    scheduler_name = scheduler_config.get("name")

    if scheduler_name is None:
        return
    else:
        return optim.lr_scheduler.__getattribute__(scheduler_name)(
            optimizer, **scheduler_config["params"])


def get_criterion(config: dict):
    loss_config = config["loss"]
    loss_name = loss_config["name"]
    loss_params = {} if loss_config.get("params") is None else loss_config.get("params")
    if hasattr(nn, loss_name):
        criterion = nn.__getattribute__(loss_name)(**loss_params)
    else:
        criterion = globals().get(loss_name)(**loss_params)

    return criterion

def worker_init_fn(worker_id):                                                          
    np.random.seed(np.random.get_state()[1][0] + worker_id)

In [18]:
# Learner class(pytorch-lighting)
class Learner(pl.LightningModule):
    def __init__(self, model, config):
        super().__init__()
        self.model = model
        self.config = config
        self.xy_criterion = get_criterion(config)
        self.f_criterion = get_criterion(config)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        output = self.model(x)
        loss = self.xy_criterion(output["xy"], y["xy"])
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        output = self.model(x)
        xy_loss = self.xy_criterion(output["xy"], y["xy"])
        f_loss = self.f_criterion(output["floor"], y["floor"])
        loss = xy_loss  # + f_loss
        mpe = mean_position_error(
            to_np(output['xy'][:, 0]), to_np(output['xy'][:, 1]), 0, 
            to_np(y['xy'][:, 0]), to_np(y['xy'][:, 1]), 0)
        
        # floor lossは現状は無視して良い
        self.log(f'Loss/val', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log(f'Loss/xy', xy_loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log(f'Loss/floor', f_loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log(f'MPE/val', mpe, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        return mpe
    
    def validation_epoch_end(self, outputs):
        avg_loss = np.mean(outputs)
        print(f'epoch = {self.current_epoch}, mpe_loss = {avg_loss}')

    def configure_optimizers(self):
        optimizer = get_optimizer(self.model, self.config)
        scheduler = get_scheduler(optimizer, self.config)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "Loss/val"}

In [19]:
# oof
def evaluate(model, loaders, phase):
    x_list = []
    y_list = []
    f_list = []
    with torch.no_grad():
        for batch in loaders[phase]:
            x, y = batch
            output = model(x)
            x_list.append(to_np(output['xy'][:, 0]))
            y_list.append(to_np(output['xy'][:, 1]))
            f_list.append(to_np(output['floor']))

    x_list = np.concatenate(x_list)
    y_list = np.concatenate(y_list)
    f_list = np.concatenate(f_list)
    return x_list, y_list, f_list

## train

In [None]:
oofs = []  # 全てのoofをdfで格納する
predictions = []  # 全ての予測値をdfで格納する
val_scores = []
# skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
gkf = GroupKFold(n_splits=N_SPLITS)
# for fold, (trn_idx, val_idx) in enumerate(skf.split(train.loc[:, 'path'], train.loc[:, 'path'])):
for fold, (trn_idx, val_idx) in enumerate(gkf.split(train.loc[:, 'path'], groups=train.loc[:, 'path'])):

    # 指定したfoldのみループを回す

    print('=' * 20)
    print(f'Fold {fold}')
    print('=' * 20)

    # train/valid data
    trn_df = train.loc[trn_idx, BSSID_FEATS + RSSI_FEATS + ['site_id', 'site_id_str', 'x','y','floor']].reset_index(drop=True)
    val_df = train.loc[val_idx, BSSID_FEATS + RSSI_FEATS + ['site_id', 'site_id_str', 'x','y','floor']].reset_index(drop=True)

    # data loader
    loaders = {}
    loader_config = config["loader"]
    loaders["train"] = DataLoader(IndoorDataset(trn_df, phase="train"), **loader_config["train"], worker_init_fn=worker_init_fn) 
    loaders["valid"] = DataLoader(IndoorDataset(val_df, phase="valid"), **loader_config["valid"], worker_init_fn=worker_init_fn)
    loaders["test"] = DataLoader(IndoorDataset(test, phase="test"), **loader_config["test"], worker_init_fn=worker_init_fn)
    
    # model
    model = LSTMModel(wifi_bssids_size, site_count)
    model_name = model.__class__.__name__
    
    # loggers
    RUN_NAME = f'exp{str(EXP_NAME)}'
    wandb.init(project='Indoor_Location_Navigation', entity='sqrt4kaido', group=RUN_NAME, job_type=RUN_NAME + f'-fold-{fold}')
    wandb.run.name = RUN_NAME + f'-fold-{fold}'
    wandb_config = wandb.config
    wandb_config.model_name = model_name
    wandb.watch(model)
    
    
    loggers = []
    loggers.append(WandbLogger())

    learner = Learner(model, config)
    
    # callbacks
    callbacks = []
    checkpoint_callback = ModelCheckpoint(
        monitor=f'Loss/val',
        mode='min',
        dirpath=OUTPUT_DIR,
        verbose=False,
        filename=f'{model_name}-{learner.current_epoch}-{fold}')
    callbacks.append(checkpoint_callback)

    early_stop_callback = EarlyStopping(
        monitor='Loss/val',
        min_delta=0.00,
        patience=3,
        verbose=True,
        mode='min')
    callbacks.append(early_stop_callback)
    
    trainer = pl.Trainer(
        logger=loggers,
        checkpoint_callback=callbacks,
        max_epochs=MAX_EPOCHS,
        default_root_dir=OUTPUT_DIR,
        gpus=1,
        fast_dev_run=DEBUG,
        deterministic=True,
        benchmark=True,
#         precision=16,
#         progress_bar_refresh_rate=0  # vscodeの時progress barの動作が遅いので表示しない
        )


    trainer.fit(learner, train_dataloader=loaders['train'], val_dataloaders=loaders['valid'])

    #############
    # validation (to make oof)
    #############
    model.eval()
    oof_x, oof_y, oof_f = evaluate(model, loaders, phase="valid")
    val_df["oof_x"] = oof_x
    val_df["oof_y"] = oof_y
    val_df["oof_floor"] = oof_f
    oofs.append(val_df)
    
    val_score = mean_position_error(
        val_df["oof_x"].values, val_df["oof_y"].values, 0,
        val_df['x'].values, val_df['y'].values, 0)
    val_scores.append(val_score)
    print(f"fold {fold}: mean position error {val_score}")

    #############
    # inference
    #############
    preds_x, preds_y, preds_f = evaluate(model, loaders, phase="test")
    test_preds = pd.DataFrame(np.stack((preds_f, preds_x, preds_y))).T
    test_preds.columns = sub.columns
    test_preds["site_path_timestamp"] = test["site_path_timestamp"]
    test_preds["floor"] = test_preds["floor"].astype(int)
    predictions.append(test_preds)
    

Fold 0


Downloading: "https://github.com/huawei-noah/ghostnet/archive/master.zip" to /home/user/.cache/torch/hub/master.zip
Downloading: "https://github.com/huawei-noah/ghostnet/raw/master/ghostnet_pytorch/models/state_dict_73.98.pth" to /home/user/.cache/torch/hub/checkpoints/state_dict_73.98.pth


  0%|          | 0.00/20.0M [00:00<?, ?B/s]

  "num_layers={}".format(dropout, num_layers))
  "num_layers={}".format(dropout, num_layers))
[34m[1mwandb[0m: Currently logged in as: [33msqrt4kaido[0m (use `wandb login --relogin` to force relogin)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type      | Params
-------------------------------------------
0 | model        | LSTMModel | 17.7 M
1 | xy_criterion | MSELoss   | 0     
2 | f_criterion  | MSELoss   | 0     
-------------------------------------------
17.7 M    Trainable params
0         Non-trainable params
17.7 M    Total params
70.687    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

epoch = 0, mpe_loss = 255.50562286376953




Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

epoch = 0, mpe_loss = 68.37470374402359


Validating: 0it [00:00, ?it/s]

epoch = 1, mpe_loss = 43.53293735694948


Validating: 0it [00:00, ?it/s]

epoch = 2, mpe_loss = 24.33255698994071


Validating: 0it [00:00, ?it/s]

epoch = 3, mpe_loss = 15.124572207254028


Validating: 0it [00:00, ?it/s]

epoch = 4, mpe_loss = 11.321989697352326


Validating: 0it [00:00, ?it/s]

epoch = 5, mpe_loss = 10.245951904725574


Validating: 0it [00:00, ?it/s]

epoch = 6, mpe_loss = 9.918328496611274


Validating: 0it [00:00, ?it/s]

epoch = 7, mpe_loss = 9.52846039195854


Validating: 0it [00:00, ?it/s]

epoch = 8, mpe_loss = 9.696075142496662


Validating: 0it [00:00, ?it/s]

epoch = 9, mpe_loss = 9.009116074828698


Validating: 0it [00:00, ?it/s]

epoch = 10, mpe_loss = 9.054118016839924


Validating: 0it [00:00, ?it/s]

epoch = 11, mpe_loss = 9.175784165907988


Validating: 0it [00:00, ?it/s]

epoch = 12, mpe_loss = 9.265333167573045


Validating: 0it [00:00, ?it/s]

epoch = 13, mpe_loss = 8.949767671012301


Validating: 0it [00:00, ?it/s]

epoch = 14, mpe_loss = 8.44184388108208


Validating: 0it [00:00, ?it/s]

epoch = 15, mpe_loss = 8.562407498717299


Validating: 0it [00:00, ?it/s]

epoch = 16, mpe_loss = 8.29394718273831


Validating: 0it [00:00, ?it/s]

epoch = 17, mpe_loss = 8.322580222355883


Validating: 0it [00:00, ?it/s]

epoch = 18, mpe_loss = 8.249615017529322


Validating: 0it [00:00, ?it/s]

epoch = 19, mpe_loss = 8.289468254479264


Validating: 0it [00:00, ?it/s]

epoch = 20, mpe_loss = 8.160675477527155


Validating: 0it [00:00, ?it/s]

epoch = 21, mpe_loss = 8.171629241229194


Validating: 0it [00:00, ?it/s]

epoch = 22, mpe_loss = 8.356245694277204


Validating: 0it [00:00, ?it/s]

epoch = 23, mpe_loss = 8.201802162563643




In [None]:
if len(oofs) > 1:
    oofs_df = pd.concat(oofs)
else:
    oofs_df = oofs[0]
oofs_df.to_csv(str(OUTPUT_DIR) + f"/oof{EXP_NAME}.csv", index=False)
oofs_df

In [21]:
    # foldの結果を平均した後、reindexでsubmission fileにindexを合わせる
all_preds = pd.concat(predictions).groupby('site_path_timestamp').mean().reindex(sub.index)

all_preds

Unnamed: 0_level_0,floor,x,y
site_path_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,0,87.820877,103.378113
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000009017,0,84.934212,102.221764
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000015326,0,84.637535,106.694267
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000018763,0,88.344582,108.472931
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000022328,0,88.364525,107.649292
...,...,...,...
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000082589,0,216.584732,91.119125
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000085758,0,210.694809,98.997589
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000090895,0,207.790909,106.390236
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000096899,0,202.718658,113.294189


In [22]:
# floorの数値を置換
simple_accurate_99 = pd.read_csv('../01/submission.csv')
all_preds['floor'] = simple_accurate_99['floor'].values
all_preds

Unnamed: 0_level_0,floor,x,y
site_path_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,0,87.820877,103.378113
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000009017,0,84.934212,102.221764
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000015326,0,84.637535,106.694267
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000018763,0,88.344582,108.472931
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000022328,0,88.364525,107.649292
...,...,...,...
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000082589,5,216.584732,91.119125
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000085758,5,210.694809,98.997589
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000090895,5,207.790909,106.390236
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000096899,5,202.718658,113.294189


In [23]:
all_preds.to_csv(str(OUTPUT_DIR) + f"/sub{EXP_NAME}.csv")

In [24]:
print(f"CV:{np.mean(val_scores)}")

CV:8.134229954895378


In [25]:
wandb.init(project='Indoor_Location_Navigation', entity='sqrt4kaido', group=RUN_NAME, job_type='summary')
wandb.run.name = 'summary'
wandb.log({'CV_score': np.mean(val_scores)})
wandb.save(utils.get_notebook_path())
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Loss/val,55.20833
Loss/xy,55.20833
Loss/floor,4.40498
MPE/val,8.26543
epoch,199.0
trainer/global_step,80799.0
_runtime,957.0
_timestamp,1616836981.0
_step,199.0


0,1
Loss/val,█▇▅▅▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Loss/xy,█▇▅▅▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Loss/floor,▆▆▆▆▆▆▅█▇▆▅▄▃▂▂▂▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
MPE/val,█▇▆▆▅▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███




VBox(children=(Label(value=' 0.00MB of 0.49MB uploaded (0.00MB deduped)\r'), FloatProgress(value=0.00130087052…

0,1
CV_score,8.13423
_runtime,2.0
_timestamp,1616837013.0
_step,0.0


0,1
CV_score,▁
_runtime,▁
_timestamp,▁
_step,▁
