#  REF:https://www.kaggle.com/jiweiliu/dask-with-simple-xgb

In [None]:
from glob import glob
from collections import Counter
import os
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import KFold
import xgboost as xgb

In [None]:
def get_building_floor(fname):
    """Return the (building, floor) path components of *fname*.

    *fname* is split on '/'; the third-from-last and second-from-last
    components are returned as a tuple.  Raises IndexError when the path has
    fewer than three components.
    """
    parts = fname.split('/')
    building = parts[-3]
    floor = parts[-2]
    return building, floor

def get_test_building(name):
    """Return the building id stored on the second line of the file *name*.

    The second line is split on whitespace; its second token is split on ':'
    and the piece after the first ':' is returned.  Returns None when the
    file has fewer than two lines.
    """
    with open(name) as handle:
        next(handle, None)  # skip the first line
        second_line = next(handle, None)
    if second_line is None:
        return None
    return second_line.split()[1].split(':')[1]

def get_floor_target(floor):
    """Convert a floor label into a signed integer floor number.

    Matching is case-insensitive:

    * 'BF' and 'BM' return None.
    * 'B' returns -1.
    * 'F<n>' and '<n>F' return n.
    * 'B<n>' and '<n>B' return -n.
    * Any other label returns None.

    The whole numeric part is parsed, so multi-digit floors such as 'F10' or
    '12F' are not truncated to their first digit.  Labels with an 'F'/'B'
    marker but no purely numeric remainder (e.g. 'F', 'LF') return None,
    consistent with other unrecognised labels.
    """
    floor = floor.lower()
    if floor in ('bf', 'bm'):
        return None
    if floor == 'b':
        return -1
    # Same precedence as before: 'f' prefix, 'f' suffix, 'b' prefix, 'b' suffix.
    for marker, sign in (('f', 1), ('b', -1)):
        if floor.startswith(marker):
            digits = floor[1:]
        elif floor.endswith(marker):
            digits = floor[:-1]
        else:
            continue
        # isdecimal() guarantees int() succeeds and rejects an empty string.
        return sign * int(digits) if digits.isdecimal() else None
    return None
        
# Column layout shared by every three-axis sensor record.
ACOLS = ['timestamp', 'x', 'y', 'z']

# Record type -> column names.  The three-axis sensor types all reference the
# same ACOLS list object.
FIELDS = {
    sensor: ACOLS
    for sensor in (
        'acce', 'acce_uncali',
        'gyro', 'gyro_uncali',
        'magn', 'magn_uncali',
        'ahrs',
    )
}
FIELDS.update(
    wifi=['timestamp', 'ssid', 'bssid', 'rssi', 'last_timestamp'],
    ibeacon=['timestamp', 'code', 'rssi'],
    waypoint=['timestamp', 'x', 'y'],
)

# Record type -> count (presumably the number of feature values taken from
# each record type -- TODO confirm against the feature-extraction code).
NFEAS = dict.fromkeys(
    (
        'acce', 'acce_uncali',
        'gyro', 'gyro_uncali',
        'magn', 'magn_uncali',
        'ahrs',
    ),
    3,
)
NFEAS.update(wifi=1, ibeacon=1, waypoint=3)

In [None]:
# Root directory of the competition data; the test trace files and the sample
# submission are read from beneath it.
PATH = '../input/indoor-location-navigation'

In [None]:
def mpe(yp, y):
    """Return the mean position error between predictions *yp* and truth *y*.

    Both arguments are 2-D arrays whose columns 0 and 1 hold the planar
    coordinates and whose column 2 holds the floor.  Each row's error is the
    Euclidean distance in the plane plus 15 times the absolute floor
    difference; the mean over all rows is returned.
    """
    dx = yp[:, 0] - y[:, 0]
    dy = yp[:, 1] - y[:, 1]
    planar_error = np.sqrt(np.square(dx) + np.square(dy))
    floor_error = 15 * np.abs(yp[:, 2] - y[:, 2])
    return np.mean(planar_error + floor_error)

In [None]:
# Load the pre-extracted feature table used to build the training matrix.
# NOTE: `db` is rebound to the test feature table in a later cell, so cell
# execution order matters when re-running.
db = pd.read_csv('../input/extracteddata/feature.csv')

In [None]:
# Names of the 244 feature columns: 'F0' through 'F243'.
cols = [f'F{i}' for i in range(244)]

In [None]:
# Training feature matrix as a NumPy array, one column per entry of `cols`.
X = np.asarray(db[cols])

In [None]:
# Load the regression targets; the training loop reads the 'w_x', 'w_y' and
# 'floors' columns from this frame.
df = pd.read_csv('../input/extracteddata/target.csv')
# Drop the first column (presumably a saved index column -- TODO confirm).
df = df[df.columns[1:]]

In [None]:
# Collect every test trace file under PATH/test.
# NOTE(review): glob() returns paths in arbitrary order, and a later cell pairs
# `test_files` positionally with the prediction rows derived from
# feature_test.csv -- confirm that both use the same file ordering.
test_files = glob(f'{PATH}/test/*.txt')
len(test_files)

In [None]:
%%time
test_b = []
for name in tqdm(test_files):
    test_b.append(get_test_building(name))
test_b = np.array(test_b)

In [None]:
# Load the pre-extracted test feature table.
# NOTE: this rebinds `db`, replacing the training feature frame loaded earlier.
db = pd.read_csv('../input/extracteddata/feature_test.csv')

In [None]:
%%time
# Test feature matrix, built from the same `cols` feature columns as `X`.
Xt = np.asarray(db[cols])
Xt.shape

### Train XGB

In [None]:
# XGBoost settings for the tree booster; the training loop uses these for the
# 'floors' target.
params2 = {
    'booster': 'gbtree',
    # 'reg:linear' is the deprecated name of the squared-error objective.
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'eta': 0.1,
    # XGBoost's tree-depth parameter is 'max_depth'.  The previous 'depth'
    # key is not an XGBoost parameter, so the depth setting had no effect.
    'max_depth': 7,
    'nthread': 2,
    'verbosity': 0,
}

In [None]:
# XGBoost settings for the linear booster; the training loop uses these for
# the 'w_x' and 'w_y' coordinate targets.
#
# Two leftovers were removed without changing the effective configuration:
# * a stray '' literal that sat directly before 'eta' and only worked because
#   adjacent string literals are concatenated ('' 'eta' == 'eta');
# * a 'depth' key, which is not an XGBoost parameter (and a linear booster
#   builds no trees).
params = {
    'booster': 'gblinear',
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': 0.1,
    'nthread': 2,
    'verbosity': 0,
}

In [None]:
# K-fold training: for every fold, fit one booster per target column, score
# the held-out rows with `mpe`, and accumulate the test-set predictions so
# they can be averaged over the folds afterwards.
N = 5
dtest = xgb.DMatrix(data=Xt)
ysub = np.zeros([Xt.shape[0], 3])

kf = KFold(n_splits=N, shuffle=True, random_state=42)

msgs = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train = X[train_index]
    X_test = X[test_index]
    # Held-out predictions (yps) and the matching ground truth (yrs).
    yps = np.zeros([X_test.shape[0], 3])
    yrs = yps.copy()
    for c, col in enumerate(['w_x', 'w_y', 'floors']):
        y = df[col].values
        y_train = y[train_index]
        y_test = y[test_index]
        print(y_train.shape)
        dtrain = xgb.DMatrix(data=X_train, label=y_train)
        dvalid = xgb.DMatrix(data=X_test, label=y_test)
        # The third target ('floors') is trained with `params2`; the two
        # coordinate targets use `params`.
        fold_params = params2 if c == 2 else params
        clf = xgb.train(
            fold_params,
            dtrain=dtrain,
            num_boost_round=70,
            evals=[(dtrain, 'train'), (dvalid, 'eval')],
            early_stopping_rounds=10,
            verbose_eval=100,
        )
        yps[:, c] = clf.predict(dvalid)
        yrs[:, c] = y_test
        ysub[:, c] += clf.predict(dtest)
    msg = f'Fold {i}: MPE {mpe(yps, yrs):.4f}'
    print(msg)
    msgs.append(msg)
# Turn the per-fold sums into the fold average.
ysub = ysub / N

In [None]:
# Display the per-fold validation summaries collected by the training loop.
msgs

In [None]:
# Load the sample submission; it is used below as the template whose
# x/y/floor columns are replaced by the model predictions.
sub = pd.read_csv(f'{PATH}/sample_submission.csv')
sub.head()

In [None]:
# Row/column count of the sample submission, for reference.
sub.shape

In [None]:
# The text before the first '_' of `site_path_timestamp` is stored as the site.
sub['site'] = sub['site_path_timestamp'].map(lambda key: key.partition('_')[0])
sub.head()

In [None]:
# Building id -> test file path.
# NOTE(review): if several test files report the same building, only the last
# one is kept in this dict, so every submission row of that site is pointed at
# a single file -- confirm this is intended.
test_map = dict(zip(test_b, test_files))
sub['filename'] = sub['site'].map(lambda site: test_map[site])
sub.head()

In [None]:
# Wrap the fold-averaged test predictions in a frame whose columns carry the
# submission's target names.
ds = pd.DataFrame(ysub,columns=['x','y','floor'])
ds.head()

In [None]:
# Tag each prediction row with a test file path so it can be joined onto the
# submission.
# NOTE(review): this assumes row k of `ysub` belongs to `test_files[k]`, i.e.
# that feature_test.csv was written in the same file order -- TODO confirm.
ds['filename'] = test_files
ds.head()

In [None]:
# Swap the sample submission's x/y/floor columns for the predictions, matched
# on the test file path; the left join keeps every submission row.
sub = sub.drop(['x','y','floor'],axis=1).merge(ds,on='filename',how='left')
# The printed shape can be compared with the shape of the sample submission.
print(sub.shape)
sub.head()

In [None]:
# Print the number of missing values in each column of the merged submission.
for column, n_missing in sub.isnull().sum().items():
    print(column, n_missing)

In [None]:
# The floor values come from a regressor, so round to the nearest integer
# before casting; a bare astype('int') truncates toward zero (1.9 -> 1).
sub['floor'] = sub['floor'].round().astype('int')
sub.head()

In [None]:
# Write the submission file without the helper 'site' and 'filename' columns
# and without the DataFrame index.
sub.drop(['site','filename'],axis=1).to_csv('submission.csv',index=False)