# Import

In [1]:
from pathlib import Path
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score

# Setting

In [2]:
# Root folder of the competition data; every other location is resolved against it.
DATA_DIR = Path("/home/knikaido/work/Indoor-Location-Navigation/data/")
# Folder holding the wifi files read by this notebook (train/ and test/ subfolders).
WIFI_DIR = DATA_DIR.joinpath('wifi_lbl_encode')

In [3]:
# Seed handed to the KFold splitter used in the single-site walkthrough.
SEED = 777
# Experiment number; it becomes part of the submission file name.
EXP_NUM = 1

# Floor label -> integer level. Both spellings that occur in the data
# ("F1" and "1F") map to the same level; basements are negative.
floor_dict = {
    'B1': -1, 'F1': 0, 'F2': 1, 'F3': 2, 'F4': 3, 'F5': 4, 'F6': 5, 'F7': 6,
    'B2': -2, '1F': 0, '2F': 1, '3F': 2, '4F': 3, '5F': 4, '6F': 5, '7F': 6,
    '8F': 7, '9F': 8, 'F8': 7,
}

In [4]:
def calc_floor_loss(pred, true):
    """Return the mean floor penalty: 15 per level of absolute floor error.

    Args:
        pred: iterable of predicted floor labels (keys of ``floor_dict``).
        true: iterable of actual floor labels (keys of ``floor_dict``),
            same length as ``pred``.

    Returns:
        ``sum(15 * |true_level - pred_level|) / len(true)``.

    Raises:
        KeyError: if a label is not a key of ``floor_dict``.
    """
    actual_levels = np.array([floor_dict[label] for label in true])
    guessed_levels = np.array([floor_dict[label] for label in pred])
    level_gaps = np.abs(actual_levels - guessed_levels)
    return np.sum(15 * level_gaps) / len(actual_levels)

In [5]:
# Every training wifi file sits three directory levels below train/; collect them sorted by path.
train_glob_pattern = str(WIFI_DIR / 'train' / '*' / '*' / '*.txt')
train_csv_path = sorted(glob.glob(train_glob_pattern))
len(train_csv_path)

24106

In [6]:
# One row per training file: the full file name plus the path components that
# encode dataset / split / site / floor / path file name.
train_files = pd.DataFrame({'filename': train_csv_path})
# Take the LAST five components instead of a fixed offset from the filesystem
# root, so the table stays correct when DATA_DIR lives at a different depth.
train_files_element = train_files['filename'].str.split('/', expand=True).iloc[:, -5:]
train_files_element.columns = ['dataset', 'train_test', 'site', 'floor', 'path']
train_files = pd.concat([train_files, train_files_element], axis=1)
train_files

Unnamed: 0,filename,dataset,train_test,site,floor,path
0,/home/knikaido/work/Indoor-Location-Navigation...,wifi_lbl_encode,train,5a0546857ecc773753327266,B1,5e15730aa280850006f3d005.txt
1,/home/knikaido/work/Indoor-Location-Navigation...,wifi_lbl_encode,train,5a0546857ecc773753327266,B1,5e15730b1506f2000638fc29.txt
2,/home/knikaido/work/Indoor-Location-Navigation...,wifi_lbl_encode,train,5a0546857ecc773753327266,B1,5e15730ca280850006f3d007.txt
3,/home/knikaido/work/Indoor-Location-Navigation...,wifi_lbl_encode,train,5a0546857ecc773753327266,B1,5e15730e1506f2000638fc2b.txt
4,/home/knikaido/work/Indoor-Location-Navigation...,wifi_lbl_encode,train,5a0546857ecc773753327266,B1,5e15730f1506f2000638fc2d.txt
...,...,...,...,...,...,...
24101,/home/knikaido/work/Indoor-Location-Navigation...,wifi_lbl_encode,train,5dc8cea7659e181adb076a3f,F7,5dd0d806878f3300066c74f7.txt
24102,/home/knikaido/work/Indoor-Location-Navigation...,wifi_lbl_encode,train,5dc8cea7659e181adb076a3f,F7,5dd0d80794e4900006125dd0.txt
24103,/home/knikaido/work/Indoor-Location-Navigation...,wifi_lbl_encode,train,5dc8cea7659e181adb076a3f,F7,5dd0d809878f3300066c74f9.txt
24104,/home/knikaido/work/Indoor-Location-Navigation...,wifi_lbl_encode,train,5dc8cea7659e181adb076a3f,F7,5dd0d97c94e4900006125dd9.txt


In [7]:
# The submission ids look like "<site>_<path>_<timestamp>"; the distinct first
# fields are the sites that have to be predicted.
sample_sub = pd.read_csv(DATA_DIR / 'indoor-location-navigation/sample_submission.csv')
submission_id_parts = sample_sub['site_path_timestamp'].str.split('_', expand=True)
sites = submission_id_parts.iloc[:, 0].unique()
sites

array(['5a0546857ecc773753327266', '5c3c44b80379370013e0fd2b',
       '5d27075f03f801723c2e360f', '5d27096c03f801723c31e5e0',
       '5d27097f03f801723c320d97', '5d27099f03f801723c32511d',
       '5d2709a003f801723c3251bf', '5d2709b303f801723c327472',
       '5d2709bb03f801723c32852c', '5d2709c303f801723c3299ee',
       '5d2709d403f801723c32bd39', '5d2709e003f801723c32d896',
       '5da138274db8ce0c98bbd3d2', '5da1382d4db8ce0c98bbe92e',
       '5da138314db8ce0c98bbf3a0', '5da138364db8ce0c98bc00f1',
       '5da1383b4db8ce0c98bc11ab', '5da138754db8ce0c98bca82f',
       '5da138764db8ce0c98bcaa46', '5da1389e4db8ce0c98bd0547',
       '5da138b74db8ce0c98bd4774', '5da958dd46f8266d0737457b',
       '5dbc1d84c1eb61796cf7c010', '5dc8cea7659e181adb076a3f'],
      dtype=object)

In [8]:
# Walk through the method on the first site only.
test_site = sites[0]
belongs_to_test_site = train_files['site'] == test_site
site_paths = train_files.loc[belongs_to_test_site, 'path'].unique()

In [9]:
# Accumulator for the per-path wifi tables built by the loop in the next cell.
all_wifi = []

bssidごとのmaxのrssiを計算

In [10]:
# For every path of `test_site`, compute the strongest RSSI seen per BSSID.
# The accumulator is (re)created here so that re-running this cell does not
# append the same paths a second time.
all_wifi = []

# Resolve file name and floor within the selected site only. Looking them up by
# bare file name over the whole table could pick a file from another site and
# rescans every row for every path.
site_files = (
    train_files[train_files['site'] == test_site]
    .drop_duplicates(subset='path')
    .set_index('path')
)

for iter_path in site_paths:
    path_info = site_files.loc[iter_path]
    this_path_wifi = pd.read_csv(path_info['filename'])

    # Keep readings whose last_timestamp is no more than 5000 before the first
    # timestamp in the file and whose signal is at least -75.
    min_timestamp = this_path_wifi['timestamp'].min()
    keep = (this_path_wifi['last_timestamp'] >= min_timestamp - 5000) & (this_path_wifi['rssi'] >= -75)
    this_path_wifi = this_path_wifi[keep]

    # Aggregate only the rssi column; the other columns are not needed.
    this_path_wifi_groupby = this_path_wifi.groupby('bssid')['rssi'].max().reset_index()

    # Floor and path are constant for the whole file, so attach them directly
    # (resulting columns: bssid, rssi, floor, path).
    all_wifi.append(this_path_wifi_groupby.assign(floor=path_info['floor'], path=iter_path))

In [11]:
# Stack the per-path tables into one frame with a fresh 0..n-1 index.
all_wifi = pd.concat(all_wifi, ignore_index=True)

In [12]:
# Show the combined table: one row per (path, bssid) with that path's strongest rssi.
all_wifi

Unnamed: 0,bssid,rssi,floor,path
0,3298,-73,B1,5e15730aa280850006f3d005.txt
1,4320,-70,B1,5e15730aa280850006f3d005.txt
2,7179,-70,B1,5e15730aa280850006f3d005.txt
3,7886,-73,B1,5e15730aa280850006f3d005.txt
4,23823,-69,B1,5e15730aa280850006f3d005.txt
...,...,...,...,...
99357,207498,-57,F4,5dc1579ebce0a000068ae8eb.txt
99358,208488,-68,F4,5dc1579ebce0a000068ae8eb.txt
99359,209225,-64,F4,5dc1579ebce0a000068ae8eb.txt
99360,213783,-68,F4,5dc1579ebce0a000068ae8eb.txt


In [13]:
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
# Exhaust the splitter and keep only the indices of the final fold for the walkthrough below.
*_, (train_index, test_index) = kf.split(site_paths)

In [14]:
# Start the training table from the full per-path table; it is narrowed to the training fold below.
train_wifi = all_wifi

各フロアにおいて，bssidと一番高いrssiを計算

In [15]:
# For each floor, the strongest rssi observed per bssid over the training-fold paths.
# Built from `all_wifi` rather than from `train_wifi` itself, so the cell can be
# re-run without failing on the already-dropped `path` column.
train_wifi = (
    all_wifi[all_wifi['path'].isin(site_paths[train_index])]
    .drop(columns='path')
    .groupby(['floor', 'bssid'])
    .max()
    .reset_index()
    .rename(columns={'floor': 'floor_pred'})
)
train_wifi

Unnamed: 0,floor_pred,bssid,rssi
0,B1,669,-51
1,B1,787,-72
2,B1,788,-48
3,B1,1106,-49
4,B1,1357,-46
...,...,...,...
6496,F4,215201,-64
6497,F4,215762,-55
6498,F4,215897,-66
6499,F4,215906,-73


In [16]:
# Start the held-out table from the full per-path table; it is narrowed to the test fold below.
test_wifi = all_wifi

In [17]:
# Keep only the held-out paths; their floor is the ground truth to compare against.
in_test_fold = test_wifi['path'].isin(site_paths[test_index])
test_wifi = (
    test_wifi[in_test_fold]
    .reset_index(drop=True)
    .rename(columns={'floor': 'floor_actual'})
)
test_wifi

Unnamed: 0,bssid,rssi,floor_actual,path
0,3298,-73,B1,5e15730aa280850006f3d005.txt
1,4320,-70,B1,5e15730aa280850006f3d005.txt
2,7179,-70,B1,5e15730aa280850006f3d005.txt
3,7886,-73,B1,5e15730aa280850006f3d005.txt
4,23823,-69,B1,5e15730aa280850006f3d005.txt
...,...,...,...,...
19475,207498,-57,F4,5dc1579ebce0a000068ae8eb.txt
19476,208488,-68,F4,5dc1579ebce0a000068ae8eb.txt
19477,209225,-64,F4,5dc1579ebce0a000068ae8eb.txt
19478,213783,-68,F4,5dc1579ebce0a000068ae8eb.txt


In [18]:
# Pair every held-out (path, bssid) reading with each floor on which the same
# bssid was seen in training, then keep the pairs where the training maximum
# is at least as strong as the held-out reading.
wifi_merge = (
    test_wifi
    .merge(train_wifi, how='inner', on='bssid', suffixes=('_test', '_train'))
    .groupby(['path', 'bssid', 'floor_pred'])
    .max()
    .reset_index()
)
train_at_least_as_strong = wifi_merge['rssi_train'] >= wifi_merge['rssi_test']
wifi_merge = wifi_merge[train_at_least_as_strong]
wifi_merge

Unnamed: 0,path,bssid,floor_pred,rssi_test,floor_actual,rssi_train
0,5d10a167f9037900086afc39.txt,3571,F1,-65,F3,-51
1,5d10a167f9037900086afc39.txt,3571,F2,-65,F3,-41
2,5d10a167f9037900086afc39.txt,3571,F3,-65,F3,-59
3,5d10a167f9037900086afc39.txt,3571,F4,-65,F3,-59
5,5d10a167f9037900086afc39.txt,3852,F2,-62,F3,-38
...,...,...,...,...,...,...
63254,5e15bf941506f2000638fec5.txt,205363,B1,-65,B1,-37
63255,5e15bf941506f2000638fec5.txt,207196,B1,-71,B1,-50
63257,5e15bf941506f2000638fec5.txt,209005,B1,-67,B1,-45
63258,5e15bf941506f2000638fec5.txt,209952,B1,-73,B1,-49


こんなかんじで，rssi_train >= rssi_test をみたす信号の数をカウントして，一番多いfloorを予測値とする

In [19]:
# Rows: held-out paths; columns: candidate floors; cells: number of surviving bssid matches.
wifi_merge.pivot_table(
    index='path',
    columns='floor_pred',
    values='rssi_test',
    aggfunc=len,
)

floor_pred,B1,F1,F2,F3,F4
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5d10a167f9037900086afc39.txt,,59.0,97.0,113.0,100.0
5d10a16b9c50c70008fe897d.txt,18.0,135.0,162.0,205.0,131.0
5d10a16bf9037900086afc3d.txt,18.0,150.0,163.0,216.0,143.0
5d119424ffe23f0008604e2c.txt,,43.0,82.0,93.0,77.0
5d11942b9c50c70008fe898f.txt,,49.0,83.0,102.0,82.0
...,...,...,...,...,...
5e15b3a81506f2000638fe91.txt,36.0,140.0,114.0,94.0,95.0
5e15b70f1506f2000638fea8.txt,3.0,264.0,159.0,93.0,85.0
5e15b7131506f2000638feac.txt,7.0,280.0,184.0,122.0,103.0
5e15bda91506f2000638feb7.txt,41.0,,,,


In [20]:
# The floor with the most surviving matches becomes the prediction for each path.
floor_votes = wifi_merge.pivot_table(index='path', columns='floor_pred', values='rssi_test', aggfunc=len)
wifi_merge_pred = (
    floor_votes
    .idxmax(axis=1)
    .reset_index()
    .rename(columns={0: 'floor_pred'})
)

# Attach the actual floor and collapse back to a single row per path.
path_floor_pairs = wifi_merge[['path', 'floor_actual']]
wifi_pred_result = wifi_merge_pred.merge(path_floor_pairs, how='left', on='path')
first_row_of_path = ~wifi_pred_result.duplicated(subset='path')
wifi_pred_result = wifi_pred_result[first_row_of_path].reset_index(drop=True)
wifi_pred_result

Unnamed: 0,path,floor_pred,floor_actual
0,5d10a167f9037900086afc39.txt,F3,F3
1,5d10a16b9c50c70008fe897d.txt,F3,F3
2,5d10a16bf9037900086afc3d.txt,F3,F3
3,5d119424ffe23f0008604e2c.txt,F3,F3
4,5d11942b9c50c70008fe898f.txt,F3,F3
...,...,...,...
97,5e15b3a81506f2000638fe91.txt,F1,F1
98,5e15b70f1506f2000638fea8.txt,F1,F1
99,5e15b7131506f2000638feac.txt,F1,F1
100,5e15bda91506f2000638feb7.txt,B1,B1


In [21]:
# Share of held-out paths whose predicted floor label equals the actual one.
predicted_labels = wifi_pred_result['floor_pred']
actual_labels = wifi_pred_result['floor_actual']
accuracy = accuracy_score(predicted_labels, actual_labels)
print(f'accuracy = {accuracy}')

accuracy = 0.9901960784313726


In [22]:
# Mean floor penalty (15 per level of error) on the held-out paths.
predicted_labels = wifi_pred_result['floor_pred']
actual_labels = wifi_pred_result['floor_actual']
floor_loss = calc_floor_loss(predicted_labels, actual_labels)
print(f'floor_loss = {floor_loss}')

floor_loss = 0.29411764705882354


# Calc all site

とりあえずk-fold（GroupKFoldで各pathを1グループとして5分割）．trainと共通のbssidがない，または rssi_train >= rssi_test を満たす信号が1つもないpathは予測できず，結果から除外されることに注意．

In [23]:
def _load_path_wifi(filename, floor, path):
    """Return the strongest RSSI per BSSID for one training path file.

    Readings are kept when ``last_timestamp`` is no more than 5000 before the
    first ``timestamp`` in the file and ``rssi`` is at least -75.

    Returns a frame with columns ``bssid``, ``rssi``, ``floor``, ``path``.
    """
    wifi = pd.read_csv(filename)
    min_timestamp = wifi['timestamp'].min()
    keep = (wifi['last_timestamp'] >= min_timestamp - 5000) & (wifi['rssi'] >= -75)
    # Aggregate only the rssi column; the other columns are not needed.
    strongest = wifi[keep].groupby('bssid')['rssi'].max().reset_index()
    # Floor and path are constant for the file, so they are attached directly.
    return strongest.assign(floor=floor, path=path)


def _predict_fold_floors(train_rows, test_rows):
    """Predict one floor per held-out path by counting matching BSSIDs.

    ``train_rows`` / ``test_rows`` are slices of the per-path table (columns
    ``bssid``, ``rssi``, ``floor``, ``path``). A held-out reading votes for a
    floor when the strongest training RSSI of the same BSSID on that floor is
    at least as strong; the floor with the most votes wins.

    Returns a frame with columns ``path``, ``floor_pred``, ``floor_actual``.
    Paths without a single vote do not appear in the result.
    """
    train_wifi = (
        train_rows
        .drop(columns='path')
        .groupby(['floor', 'bssid'])
        .max()
        .reset_index()
        .rename(columns={'floor': 'floor_pred'})
    )
    test_wifi = test_rows.reset_index(drop=True).rename(columns={'floor': 'floor_actual'})

    # Inner join: rows without a training match have no floor_pred and were
    # dropped by the groupby anyway, so they are not created in the first place.
    wifi_merge = test_wifi.merge(train_wifi, how='inner', on='bssid', suffixes=('_test', '_train'))
    wifi_merge = wifi_merge.groupby(['path', 'bssid', 'floor_pred']).max().reset_index()
    wifi_merge = wifi_merge[wifi_merge['rssi_train'] >= wifi_merge['rssi_test']]

    floor_votes = wifi_merge.pivot_table(index='path', columns='floor_pred', values='rssi_test', aggfunc=len)
    wifi_merge_pred = floor_votes.idxmax(axis=1).reset_index().rename(columns={0: 'floor_pred'})

    # Reduce to one actual floor per path BEFORE merging, instead of merging
    # every vote row and de-duplicating afterwards.
    actual_floor = wifi_merge[['path', 'floor_actual']].drop_duplicates(subset='path')
    return wifi_merge_pred.merge(actual_floor, how='left', on='path').reset_index(drop=True)


flooor_scores = []
oof_preds = []
for test_site in tqdm(sites):
    # Resolve file name and floor within this site only; a lookup by bare file
    # name over the whole table could pick a file from another site.
    site_files = train_files[train_files['site'] == test_site].drop_duplicates(subset='path')
    site_paths = site_files['path'].unique()

    all_wifi = pd.concat(
        [_load_path_wifi(row.filename, row.floor, row.path) for row in site_files.itertuples(index=False)],
        ignore_index=True,
    )

    y_floor = []
    pred_floor = []
    # Every path is its own group, so a path never lands in both train and test.
    gkf = GroupKFold(n_splits=5)

    for train_index, test_index in gkf.split(site_paths, groups=site_paths):
        train_rows = all_wifi[all_wifi['path'].isin(site_paths[train_index])]
        test_rows = all_wifi[all_wifi['path'].isin(site_paths[test_index])]
        wifi_pred_result = _predict_fold_floors(train_rows, test_rows)

        oof_preds.append(wifi_pred_result)

        y_floor.extend(wifi_pred_result['floor_actual'].values)
        pred_floor.extend(wifi_pred_result['floor_pred'].values)

    accuracy = accuracy_score(y_floor, pred_floor)
    floor_loss = calc_floor_loss(pred_floor, y_floor)

    # NOTE: the last entry counts ALL paths of the site, including paths that
    # received no prediction and are therefore not part of accuracy/floor_loss.
    flooor_scores.append([test_site, accuracy, floor_loss, len(site_paths)])

100%|██████████| 24/24 [08:43<00:00, 21.80s/it]


In [24]:
# Stack the per-fold prediction tables of all sites into one frame.
# The index is not reset, so each fold's own row numbers are kept.
oof_preds = pd.concat(oof_preds)
oof_preds

Unnamed: 0,path,floor_pred,floor_actual
0,5d10a1699c50c70008fe8979.txt,F3,F3
1,5d10a16cf9037900086afc3f.txt,F3,F3
2,5d11942cffe23f0008604e2e.txt,F3,F3
3,5d119434ffe23f0008604e34.txt,F3,F3
4,5d11943dffe23f0008604e3a.txt,F3,F3
...,...,...,...
144,5dd7af4b9191710006b56823.txt,F2,F2
145,5dd7b2a8c5b77e0006b16ae4.txt,F1,F1
146,5dd7b4bbc5b77e0006b16afb.txt,B1,B1
147,5dd7bea99191710006b568a0.txt,B1,B1


In [35]:
# Persist the out-of-fold floor predictions (path, floor_pred, floor_actual) without the index column.
oof_preds.to_csv('./oof_preds_floor.csv', index=False)

In [39]:
# One row per site: accuracy, floor loss and number of paths of that site.
score_columns = ['site', 'accuracy', 'floor_loss', 'path_num']
accuracy_df = pd.DataFrame(flooor_scores, columns=score_columns)
accuracy_df

Unnamed: 0,site,accuracy,floor_loss,path_num
0,5a0546857ecc773753327266,0.984436,0.291829,514
1,5c3c44b80379370013e0fd2b,0.963636,0.857143,385
2,5d27075f03f801723c2e360f,0.985977,0.302366,1141
3,5d27096c03f801723c31e5e0,0.994302,0.128205,351
4,5d27097f03f801723c320d97,0.981865,0.310881,404
5,5d27099f03f801723c32511d,0.992366,0.114504,131
6,5d2709a003f801723c3251bf,1.0,0.0,310
7,5d2709b303f801723c327472,0.97327,0.660377,639
8,5d2709bb03f801723c32852c,0.977273,0.397727,264
9,5d2709c303f801723c3299ee,0.996988,0.045181,664


In [26]:
# Overall accuracy: per-site accuracy weighted by the site's path count.
path_counts = accuracy_df['path_num']
weighted_accuracy_sum = np.sum(accuracy_df['accuracy'] * path_counts)
accuracy = weighted_accuracy_sum / np.sum(path_counts)
print(f'accuracy = {accuracy}')

accuracy = 0.9768221672244871


In [27]:
# Overall floor loss: per-site floor loss weighted by the site's path count.
path_counts = accuracy_df['path_num']
weighted_loss_sum = np.sum(accuracy_df['floor_loss'] * path_counts)
floor_loss = weighted_loss_sum / np.sum(path_counts)
print(f'floor_loss = {floor_loss}')

floor_loss = 0.49164915671769865


In [40]:
# Accuracy within each group of paths that share the same PREDICTED floor label.
for floor_label, floor_rows in oof_preds.groupby('floor_pred'):
    score = accuracy_score(floor_rows['floor_pred'], floor_rows['floor_actual'])
    print(f'{floor_label}, num={len(floor_rows)}, accuracy={score}')

1F, num=573, accuracy=0.9755671902268761
2F, num=413, accuracy=0.9854721549636803
3F, num=267, accuracy=0.9925093632958801
4F, num=278, accuracy=0.9964028776978417
5F, num=79, accuracy=1.0
6F, num=56, accuracy=1.0
7F, num=50, accuracy=1.0
8F, num=13, accuracy=1.0
9F, num=26, accuracy=0.9615384615384616
B1, num=1646, accuracy=0.9933171324422844
B2, num=166, accuracy=0.9518072289156626
F1, num=1705, accuracy=0.9718475073313783
F2, num=1832, accuracy=0.9661572052401747
F3, num=1504, accuracy=0.9674202127659575
F4, num=1113, accuracy=0.977538185085355
F5, num=616, accuracy=0.9675324675324676
F6, num=255, accuracy=1.0
F7, num=152, accuracy=0.9736842105263158
F8, num=87, accuracy=1.0


In [55]:
# Print one line per site. Rows are taken with iterrows(); looping over the
# `.iloc` indexer object itself only works through Python's legacy
# __getitem__ iteration fallback and is not a documented way to walk rows.
for _, ele in accuracy_df.iterrows():
    s_ = ele['site']
    p_ = ele['path_num']
    a_ = ele['accuracy']
    print(f'{s_}, num={p_}, accuracy={a_}')

5a0546857ecc773753327266, num=514, accuracy=0.9844357976653697
5c3c44b80379370013e0fd2b, num=385, accuracy=0.9636363636363636
5d27075f03f801723c2e360f, num=1141, accuracy=0.985977212971078
5d27096c03f801723c31e5e0, num=351, accuracy=0.9943019943019943
5d27097f03f801723c320d97, num=404, accuracy=0.9818652849740933
5d27099f03f801723c32511d, num=131, accuracy=0.9923664122137404
5d2709a003f801723c3251bf, num=310, accuracy=1.0
5d2709b303f801723c327472, num=639, accuracy=0.9732704402515723
5d2709bb03f801723c32852c, num=264, accuracy=0.9772727272727273
5d2709c303f801723c3299ee, num=664, accuracy=0.9969879518072289
5d2709d403f801723c32bd39, num=361, accuracy=0.9861495844875346
5d2709e003f801723c32d896, num=343, accuracy=0.9591836734693877
5da138274db8ce0c98bbd3d2, num=213, accuracy=0.9624413145539906
5da1382d4db8ce0c98bbe92e, num=338, accuracy=0.9556213017751479
5da138314db8ce0c98bbf3a0, num=428, accuracy=0.9906542056074766
5da138364db8ce0c98bc00f1, num=82, accuracy=1.0
5da1383b4db8ce0c98bc11a

In [57]:
# Total number of training paths over all evaluated sites.
accuracy_df['path_num'].sum()

10877

# Test Prediction

In [27]:
# Load the sample submission; its `floor` column is overwritten with the predictions further down.
sample_submission_path = DATA_DIR / 'indoor-location-navigation' / 'sample_submission.csv'
sub = pd.read_csv(sample_submission_path)
sub

Unnamed: 0,site_path_timestamp,floor,x,y
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
...,...,...,...,...
10128,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,0,75.0,75.0
10129,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,0,75.0,75.0
10130,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,0,75.0,75.0
10131,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,0,75.0,75.0


In [28]:
# Break "<site>_<path>_<timestamp>" into its three parts, one column each.
sub_split = (
    sub['site_path_timestamp']
    .str.split('_', expand=True)
    .set_axis(['site', 'path', 'timestamp'], axis=1)
)
sub_split

Unnamed: 0,site,path,timestamp
0,5a0546857ecc773753327266,046cfa46be49fc10834815c6,0000000000009
1,5a0546857ecc773753327266,046cfa46be49fc10834815c6,0000000009017
2,5a0546857ecc773753327266,046cfa46be49fc10834815c6,0000000015326
3,5a0546857ecc773753327266,046cfa46be49fc10834815c6,0000000018763
4,5a0546857ecc773753327266,046cfa46be49fc10834815c6,0000000022328
...,...,...,...
10128,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,0000000082589
10129,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,0000000085758
10130,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,0000000090895
10131,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,0000000096899


In [29]:
all_floor_preds = []

for test_site in tqdm(sites):

    # --- Training side: strongest rssi per (floor, bssid) over all paths of the site ---
    # File name and floor are resolved within this site only; a lookup by bare
    # file name over the whole table could pick a file from another site.
    site_files = train_files[train_files['site'] == test_site].drop_duplicates(subset='path')

    all_wifi = []
    for train_row in site_files.itertuples(index=False):
        this_path_wifi = pd.read_csv(train_row.filename)

        # Keep readings whose last_timestamp is no more than 5000 before the
        # first timestamp in the file and whose signal is at least -75.
        min_timestamp = this_path_wifi['timestamp'].min()
        keep = (this_path_wifi['last_timestamp'] >= min_timestamp - 5000) & (this_path_wifi['rssi'] >= -75)

        # Reduce to the per-bssid maximum before collecting; the per-floor
        # maximum taken below is unchanged by this and far fewer rows are kept.
        this_path_wifi_groupby = this_path_wifi[keep].groupby('bssid')['rssi'].max().reset_index()
        all_wifi.append(this_path_wifi_groupby.assign(floor_pred=train_row.floor))

    all_wifi = pd.concat(all_wifi, ignore_index=True)
    all_wifi = all_wifi.groupby(['floor_pred', 'bssid']).max().reset_index()

    # --- Test side: strongest rssi per (path, bssid) ---
    all_sub_wifi = []
    site_paths = sub_split[sub_split['site'] == test_site]['path'].unique()
    for iter_path in site_paths:
        this_path_wifi = pd.read_csv(WIFI_DIR / 'test' / f'{iter_path}.txt')

        min_timestamp = this_path_wifi['timestamp'].min()
        keep = (this_path_wifi['last_timestamp'] >= min_timestamp - 5000) & (this_path_wifi['rssi'] >= -75)

        # Merge only the per-bssid maximum: the groupby after the merge takes
        # this maximum anyway, and raw rows would be multiplied by every floor.
        this_path_wifi_groupby = this_path_wifi[keep].groupby('bssid')['rssi'].max().reset_index()
        all_sub_wifi.append(this_path_wifi_groupby.assign(path=iter_path))

    all_sub_wifi = pd.concat(all_sub_wifi, ignore_index=True)

    # --- Vote: a test reading counts for a floor when the training maximum of
    # the same bssid on that floor is at least as strong ---
    wifi_merge = all_sub_wifi.merge(all_wifi, how='inner', on='bssid', suffixes=('_test', '_train'))
    wifi_merge = wifi_merge.groupby(['path', 'bssid', 'floor_pred']).max().reset_index()
    wifi_merge = wifi_merge[wifi_merge['rssi_train'] >= wifi_merge['rssi_test']]

    wifi_merge_pred = wifi_merge.pivot_table(index='path', columns='floor_pred', values='rssi_test', aggfunc=len).idxmax(axis=1)
    wifi_merge_pred = wifi_merge_pred.reset_index()
    wifi_merge_pred = wifi_merge_pred.rename(columns={0:'floor_pred'})

    all_floor_preds.append(wifi_merge_pred)

  4%|▍         | 1/24 [00:09<03:30,  9.15s/it]


KeyboardInterrupt: 

In [None]:
# One table of (path, floor_pred) over all sites, with a fresh 0..n-1 index.
all_floor_preds = pd.concat(all_floor_preds, ignore_index=True)
all_floor_preds

In [83]:
# Attach the predicted floor to every submission row.
# - Dropping a previous `floor_pred` first keeps the cell re-runnable; otherwise a
#   second run would create floor_pred_x / floor_pred_y columns.
# - validate='many_to_one' raises if a path has more than one prediction, which
#   would duplicate rows and break the row alignment with `sub`.
sub_split = (
    sub_split
    .drop(columns='floor_pred', errors='ignore')
    .merge(all_floor_preds, how='left', on='path', validate='many_to_one')
)
sub_split

Unnamed: 0,site,path,timestamp,floor_pred
0,5a0546857ecc773753327266,046cfa46be49fc10834815c6,0000000000009,F1
1,5a0546857ecc773753327266,046cfa46be49fc10834815c6,0000000009017,F1
2,5a0546857ecc773753327266,046cfa46be49fc10834815c6,0000000015326,F1
3,5a0546857ecc773753327266,046cfa46be49fc10834815c6,0000000018763,F1
4,5a0546857ecc773753327266,046cfa46be49fc10834815c6,0000000022328,F1
...,...,...,...,...
10128,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,0000000082589,F6
10129,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,0000000085758,F6
10130,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,0000000090895,F6
10131,5dc8cea7659e181adb076a3f,fd64de8c4a2fc5ebb0e9f412,0000000096899,F6


In [84]:
# Missing values per column; a non-zero floor_pred count means some path got no prediction.
sub_split.isna().sum()

site          0
path          0
timestamp     0
floor_pred    0
dtype: int64

In [85]:
# Translate the predicted floor labels into integer levels, row by row.
sub['floor'] = list(map(floor_dict.__getitem__, sub_split['floor_pred']))
sub

Unnamed: 0,site_path_timestamp,floor,x,y
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,75.0,75.0
...,...,...,...,...
10128,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5,75.0,75.0
10129,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5,75.0,75.0
10130,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5,75.0,75.0
10131,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5,75.0,75.0


In [96]:
# Write the submission as ./sub<EXP_NUM>.csv without the index column.
sub.to_csv(f'./sub{EXP_NUM}.csv', index=False)

99%のやつとどのくらい違うか確認→全部一致していた

In [93]:
# Floor column of another submission file, used as the comparison target below.
# NOTE(review): presumably the ~99% reference submission mentioned in the note above — confirm.
otehon_sub_floor = pd.read_csv('./submission.csv')['floor']

In [94]:
# Compare the two floor columns row by row. Comparing set(...) of the columns
# only checks that the same distinct floor values occur somewhere, which stays
# True even when individual rows disagree.
np.array_equal(otehon_sub_floor.to_numpy(), sub['floor'].to_numpy())

True