In [None]:
import numpy as np
import pandas as pd
import glob
import time
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from io import StringIO

In [None]:
def read_one_file(fn, naming_df):
    """Parse one tab-separated trace file into a wide DataFrame.

    Lines starting with '#' are discarded. The remaining lines are read as up
    to eight tab-separated fields, with field 0 as the index and field 1 as
    the row type. For every row type listed in ``naming_df.index`` the
    non-empty payload columns are named ``<type>_<naming_df.loc[type, i]>``
    and outer-joined on the index into one frame.

    Parameters
    ----------
    fn : str
        Path of the file to read.
    naming_df : pandas.DataFrame
        Indexed by row type; column ``i`` holds the name of the i-th payload
        field of that type.

    Returns
    -------
    pandas.DataFrame
        The merged frame, sorted by index.
    """
    with open(fn) as handle:
        data_lines = [line for line in handle if not line.startswith('#')]
    raw = pd.read_csv(StringIO(''.join(data_lines)), sep='\t', header=None,
                      names=range(8), index_col=0)

    merged = pd.DataFrame()
    for row_type in naming_df.index:  # such as 'TYPE_WAYPOINT'
        # Drop the type column itself, then any field this row type never fills.
        payload = raw.loc[raw[1] == row_type].iloc[:, 1:].dropna(axis=1, how='all')
        payload.columns = [f'{row_type}_{naming_df.loc[row_type, k]}'
                           for k in range(payload.shape[1])]
        merged = merged.merge(payload, left_index=True, right_index=True, how='outer')
    return merged.sort_index()


def find_beacon_min_max(df,my_min,my_max):
    """Update a running (min, max) with the column-9 values of the beacon rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``1`` holds the row type and column ``9`` the value
        to track.
    my_min, my_max : float
        Running minimum / maximum accumulated so far.

    Returns
    -------
    tuple
        ``(new_min, new_max)``. When ``df`` has no usable 'TYPE_BEACON' value
        the inputs are returned unchanged.
    """
    beacon_values = df.loc[df[1]=='TYPE_BEACON', 9].dropna()
    # Previously a file with fewer than two beacon rows contributed NaN, which
    # replaced the running min/max and discarded everything accumulated so far.
    if beacon_values.empty:
        return (my_min, my_max)
    # Keep the new value first so a NaN seed in my_min / my_max is replaced.
    return (min(beacon_values.min(), my_min), max(beacon_values.max(), my_max))


def fn_to_df_and_id(fn):
    """Read one trace file and pull the site / floor identifiers from its header.

    Returns
    -------
    tuple
        ``(df, siteID_str, floor_str)``: ``df`` holds the non-'#' lines parsed
        as up to twelve tab-separated fields (field 0 is the index);
        ``siteID_str`` is the text after the last 'SiteID:' on the second line
        of the file, up to the next tab; ``floor_str`` is the text after the
        last 'FloorName:' on that line, up to the newline.
    """
    with open(fn) as handle:
        all_lines = handle.readlines()

    body_text = ''.join(line for line in all_lines if not line.startswith('#'))
    df = pd.read_csv(StringIO(body_text), sep='\t', header=None,
                     names=range(12), index_col=0)

    meta_line = all_lines[1]  # the identifiers are taken from the file's second line
    siteID_str = meta_line.rpartition('SiteID:')[2].partition('\t')[0]
    floor_str = meta_line.rpartition('FloorName:')[2].partition('\n')[0]
    return df, siteID_str, floor_str


def print_header(fn):
    """Print the file name followed by every line of the file that starts with '#'."""
    with open(fn) as handle:
        header_text = ''.join(line for line in handle if line.startswith('#'))
    print(fn)
    print(header_text)
    return None

In [None]:
# Load the column-naming table; rows that are entirely empty are dropped.
naming_df = pd.read_csv('/kaggle/input/namingcsv/naming.csv').dropna(how='all')
naming_df  # NOTE: not the last expression of the cell, so nothing is displayed here
# Rows are handled in pairs: the columns from position 2 onward of row 2i+1
# overwrite the same columns of row 2i.
for i in range(naming_df.shape[0]//2):
    naming_df.iloc[i*2,2:] = naming_df.iloc[i*2+1,2:]

# Keep only rows that have a 'Time' value, index them by 'Data Type', drop the
# first remaining column, and relabel the columns 0..n-1 so that
# read_one_file can look field names up by position.
naming_df = naming_df[~naming_df['Time'].isnull()].set_index('Data Type').iloc[:,1:]
naming_df.columns = range(naming_df.shape[1])

# First data column of the mapping CSV as a Series indexed by pathID.
testPathSiteMapping_ser = pd.read_csv('/kaggle/input/namingcsv/test_path_and_site_mapping.csv',index_col='pathID').iloc[:,0]
testPathSiteMapping_ser

In [None]:
!ls /kaggle/input/indoor-pca-paths

## Read and reformat data

In [None]:
# Show the ten most frequent values of the test path -> site mapping.
for message in ('Find a representative site-floor for experiment purpose\n',
                "Test data's most used sites:\n"):
    print(message)
print(testPathSiteMapping_ser.value_counts().iloc[:10])

In [None]:
# Site used for the rest of the experiment; count the path files of its F2 directory.
chosenSite = '5d2709e003f801723c32d896'
print("Let's use the second floor of this one:\n\t{}".format(chosenSite))
print('How many path files are there for this floor?')
print(f"\t{len(glob.glob(f'/kaggle/input/indoor-location-navigation/train/{chosenSite}/F2/*.txt')):d}")

In [None]:
# Read every path file of the chosen floor and stack them into one frame:
# one row per index value, one column per Wi-Fi bssid, plus the waypoint
# coordinates (x, y) and the pathID.
t0 = time.time()
path_frames = []   # collected per-path frames; concatenated once after the loop (avoids quadratic re-copying)
for fn in glob.glob(f'/kaggle/input/indoor-location-navigation/train/{chosenSite}/F2/*.txt'):
    df = read_one_file(fn, naming_df)
    pathID = fn.split('/')[-1].split('.txt')[0]
    df1 = df.filter(regex='WIFI').dropna(how='all')    # focus on Wi-Fi signals only
    df1 = df1.reset_index()
    # pivot() needs unique (index, bssid) pairs; keep the last reading of each pair
    df1 = df1[~df1.duplicated(subset=[0,'TYPE_WIFI_bssid'],keep='last')].set_index(0)
    df2 = df1.pivot(columns='TYPE_WIFI_bssid',values='TYPE_WIFI_RSSI')  # make Wi-Fi ID's the columns
    # keep only the bssids with at least two distinct RSSI values in this path
    df2 = df2[df2.columns[(df2.nunique(axis=0)>1)]]
    # Alternatives tried for the missing values at this stage (left disabled):
    #   df2.interpolate(axis=0,limit_direction='both')   /   df2.fillna(-99)
    waypoint_df = df.filter(regex='WAYPOINT').dropna()
    waypoint_df.columns = ['x','y']
    df3 = df2.merge(waypoint_df,how='outer',left_index=True,right_index=True)
    df3['pathID'] = pathID
    print(fn,df1.shape,df1.index.unique().shape,df2.shape,df3.shape,end='\r')
    path_frames.append(df3)
# pd.concat raises on an empty list, so fall back to an empty frame when no file matched
oneFloorAllPath_df = pd.concat(path_frames,axis=0) if path_frames else pd.DataFrame()
t1 = time.time()
print(f'\nFinished. Total reading time {t1-t0}')

## Convert data into various normalised matrix forms

In [None]:
# Order the rows by index and keep only the Wi-Fi columns, as floats.
oneFloorAllPath_df = oneFloorAllPath_df.sort_index()
onlyWIFI_df = oneFloorAllPath_df.drop(columns=['pathID','x','y']).astype(float)
print('RSSI range ({}, {})'.format(onlyWIFI_df.max().max(), onlyWIFI_df.min().min()))
print('This dataset has {:d} training examples and {:d} Wi-Fi features'.format(*onlyWIFI_df.shape))

In [None]:
# normA_1: missing readings become -99, then everything is rescaled by x/99 + 1.
normA_1 = onlyWIFI_df.fillna(-99).to_numpy()/99 + 1
# normA_2: per-column z-score, then gaps filled by interpolation along the rows (both directions).
normA_2 = (
    onlyWIFI_df
    .sub(onlyWIFI_df.mean(axis=0), axis=1)
    .div(onlyWIFI_df.std(axis=0), axis=1)
    .interpolate(axis=0,limit_direction='both')
    .to_numpy()
)

## PCA

In [None]:
# Full SVD of the z-scored Wi-Fi matrix. np.linalg.svd returns the right
# singular vectors already transposed, so the rows of V are the components
# (later cells index V[:k, :]).
U,S,V = np.linalg.svd(normA_2) # equivalent implementation:  np.linalg.eig(A@A.T)

In [None]:
top_sigmas = 12   # how many principal components to keep?
# Rank-k reconstruction: scale the first k left singular vectors by their
# singular values and project back with the first k rows of V. This is the
# same product as U @ diag-padded(S[:k]) @ V without building the dense
# zero-padded matrix, and slicing cannot overrun S when k > len(S).
recoveredA_2 = (U[:,:top_sigmas]*S[:top_sigmas])@V[:top_sigmas,:]     # complete recovery process

In [None]:
# Three stacked panels: singular-value spectrum, a few original vs. recovered
# signals, and the leading left singular vectors.
fig81 = plt.figure(81,figsize=(20,10))
ax81 = fig81.subplots(ncols=1,nrows=3)
ax81[0].plot(S,color='red',marker='.')
ax81[0].plot([top_sigmas-1]*2,[1, 500],ls='--',c='gray')   # vertical marker at the last kept component
ax81[0].annotate('#%d component'%top_sigmas,xy=(top_sigmas-1+0.5,100),c='gray')
# raw string: '\s' is an invalid escape sequence in a normal string literal
ax81[0].set_title(r'Singular values $\sigma$ of matrix A',color='red')

for i in [7, 9, 18]:   # column positions of the example signals
    ax81[1].plot(normA_2[:,i],lw=1.5,c=cm.tab20(i))
    ax81[1].plot(recoveredA_2[:,i],lw=1.5,ls='--',c=cm.tab20(i))
ax81[1].set_title(f'Some Wi-Fi signals and the recovery with {top_sigmas}-top components',color='blue')
ax81[2].plot(U[:,:top_sigmas],lw=1)
ax81[2].set_title('%d most influential singular vectors'%top_sigmas,color='blue')
for stuff in ax81.ravel():
    stuff.margins(0.01)
    stuff.grid(True)

## Data preparation

In [None]:
# Labels: the rows that carry a waypoint (x, y), z-scored per coordinate.
# label_mean / label_std are kept so that errors can be rescaled later.
label_df = oneFloorAllPath_df[['x','y']].astype(float).dropna()
label_mean = label_df.mean(axis=0)
label_std = label_df.std(axis=0)
label_df = (label_df - label_mean)/label_std
# Earlier feature variants, left disabled (they refer to names not defined in this notebook):
# feature_df = pd.DataFrame(A,index=temp_df.index).reindex(label_df.index)
# feature_df = (temp_df.fillna(-99)/99+1).reindex(label_df.index)
# feature_df = (temp_df.interpolate(axis=0,limit_direction='both')/99+1).reindex(label_df.index)
# feature_df = pd.DataFrame(U[:,:top_sigmas],index=temp_df.index).reindex(label_df.index)
# feature_df = pd.DataFrame(A_recovery,index=temp_df.index).reindex(label_df.index)


# feature_df = pd.DataFrame(normA_1,index=onlyWIFI_df.index)
# feature_df = pd.DataFrame(normA_2,index=onlyWIFI_df.index)
# Active variant: project every row of normA_2 onto the first top_sigmas rows of V,
# then keep only the rows that have a label.
feature_df = pd.DataFrame((V[:top_sigmas,:]@(normA_2[:,:].T)).T,index=onlyWIFI_df.index)
feature_df = feature_df.reindex(label_df.index)
feature_df.describe()

In [None]:
# Shuffle the labelled rows and hold out the last test_ratio share as the test set.
test_ratio = 0.2
m = feature_df.shape[0]
n_test = int(m*test_ratio)
# Fixed seed so the split is the same on every run; a named variable instead of `_`.
shuffled_feature_df = feature_df.sample(m, random_state=0)
# Slice by m-n_test rather than -n_test: `iloc[:-0]` would give an empty training set.
feature_train, feature_test = shuffled_feature_df.iloc[:m-n_test], shuffled_feature_df.iloc[m-n_test:]
label_train = label_df.reindex(feature_train.index)
label_test = label_df.reindex(feature_test.index)

### Application - NN

In [None]:
import tensorflow as tf
batch_size = feature_df.shape[0]
nFeatures = feature_df.shape[1]
# Small fully connected regressor: nFeatures -> 10 -> 50 -> 2 (x, y).
model1 = tf.keras.Sequential()
# The input shape describes one sample; the batch dimension must not be part of it.
model1.add(tf.keras.Input(shape=(nFeatures,)))
model1.add(tf.keras.layers.Dense(10,activation='relu'))
model1.add(tf.keras.layers.Dense(50,activation='relu'))
# Linear output: the labels are z-scored, so about half of them are negative
# and a ReLU output layer could never predict them.
model1.add(tf.keras.layers.Dense(2))

# `lr` is a deprecated alias; `learning_rate` is the supported keyword.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
model1.compile(loss='mse',optimizer=optimizer)

In [None]:
# Fits on every row of feature_df / label_df; feature_train / feature_test are not used here.
model1.fit(feature_df,label_df.iloc[:,:].values.reshape(-1,2),epochs=100)

In [None]:
# Compute the predictions here: this cell previously read labelHat_df before
# any cell had defined it, so it failed on a fresh kernel.
labelHat_df = pd.DataFrame(model1.predict(feature_df).reshape(-1,2),index=label_df.index,columns=['x','y'])
# Euclidean position error per row, rescaled from z-score units with label_std.
((((label_df - labelHat_df)*np.array(label_std))**2).sum(axis=1)**0.5).describe()

In [None]:
# Predicted vs. actual (z-scored) coordinates, one panel per coordinate.
labelHat_df = pd.DataFrame(model1.predict(feature_df).reshape(-1,2),index=label_df.index)
labelHat_df.columns = ['x','y']

fig2 = plt.figure(2,figsize=(16,7))
ax2 = fig2.subplots(ncols=2,nrows=1)
for col, panel in enumerate(ax2.ravel()):
    panel.scatter(labelHat_df.iloc[:,col],label_df.iloc[:,col],s=50,alpha=0.5)
    panel.grid(True)
    panel.set_xlabel('predict')
    panel.set_ylabel('actual')

### Application: LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# Results of the component sweep: number of components -> (RMSE x, RMSE y, mean position error).
collection_dict = {}
params = {'objective': 'regression',
      'metric': 'l2',
      'num_iterations':500,
      'num_leaves':96,
      'verbosity':-1,
      'learning_rate':0.1,'max_bin':20000
      }
test_ratio = 0.2
m = feature_df.shape[0]
n_test = int(m*test_ratio)

# One fixed, seeded split shared by every sweep iteration so the runs are comparable and repeatable.
sample_ind = feature_df.sample(m, random_state=0).index
# Slice by m-n_test rather than -n_test: `[:-0]` would give an empty training index.
train_ind, test_ind = sample_ind[:m-n_test], sample_ind[m-n_test:]

In [None]:
# Preview of the component counts swept in the next cell: 2..46 in steps of 4, then 50..470 in steps of 20.
np.concatenate([np.arange(2,50,4),np.arange(50,480,20)])

In [None]:
def _train_axis_booster(train_features, train_target, test_features, test_target):
    """Train one LightGBM regressor for a single coordinate.

    Uses the callback API (`early_stopping`, `record_evaluation`,
    `log_evaluation`) because the `early_stopping_rounds`, `evals_result` and
    `verbose_eval` keywords of `lgb.train` were removed in LightGBM 4.

    Returns the trained booster and the recorded evaluation history
    (keys 'train' and 'test', metric 'l2').
    """
    history = {}
    train_set = lgb.Dataset(train_features,train_target)
    test_set = lgb.Dataset(test_features,test_target,reference=train_set)
    booster = lgb.train(
        params,
        train_set,
        # passing train_set itself avoids binning the training data a second time
        valid_sets=[train_set, test_set],
        valid_names=['train','test'],
        callbacks=[
            lgb.early_stopping(150, verbose=False),
            lgb.record_evaluation(history),
            lgb.log_evaluation(period=0),
        ],
    )
    return booster, history


# Sweep the number of principal components and record the test errors for each.
# NOTE: the loop rebinds top_sigmas, feature_df, feature_train/test and
# label_train/test; later cells see the values of the last iteration.
for top_sigmas in np.concatenate([np.arange(2,50,4),np.arange(50,480,20)]):
    feature_df = pd.DataFrame((V[:top_sigmas,:]@(normA_2[:,:].T)).T,index=onlyWIFI_df.index)
    feature_df = feature_df.reindex(label_df.index)

    feature_train = feature_df.reindex(train_ind)
    feature_test = feature_df.reindex(test_ind)
    label_train = label_df.reindex(train_ind)
    label_test = label_df.reindex(test_ind)

    boostingModel_x, evals_result_x = _train_axis_booster(feature_train,label_train['x'],feature_test,label_test['x'])
    boostingModel_y, evals_result_y = _train_axis_booster(feature_train,label_train['y'],feature_test,label_test['y'])

    # Errors are rescaled from z-score units with label_std.
    error_x = (boostingModel_x.predict(feature_test)-label_test['x'])*label_std['x']
    error_y = (boostingModel_y.predict(feature_test)-label_test['y'])*label_std['y']
    res = (error_x**2).mean()**0.5,(error_y**2).mean()**0.5,((error_x**2 + error_y**2)**0.5).mean()
    collection_dict[top_sigmas] = res

In [None]:
# One row per number of components, one column per error measure.
collection_df = pd.DataFrame(collection_dict).T.rename_axis('nPCA')
collection_df.columns = ['RMSE(x)','RMSE(y)','MAE(position)']
collection_df

In [None]:
# l2 learning curves of the most recently trained x (top) and y (bottom) boosters.
fig4 = plt.figure(4,figsize=(16,10))
ax4 = fig4.subplots(nrows=2,ncols=1)
for panel, history in zip(ax4.ravel(), (evals_result_x, evals_result_y)):
    for split_name, legend_text in (('train','training set'), ('test','test set')):
        panel.plot(history[split_name]['l2'],label=legend_text)

for panel in ax4.ravel():
    panel.grid(True)
    panel.legend()

In [None]:
# Replace the boosters from the sweep with scikit-learn-style regressors, one per coordinate.
boostingModel_x = lgb.LGBMRegressor(n_estimators = 1000)#, max_depth=5,num_leaves=20)
boostingModel_y = lgb.LGBMRegressor(n_estimators = 1000)#, max_depth=5,num_leaves=20)

In [None]:
# feature_train / label_train are whatever was assigned last; when the cells run
# top to bottom that is the final iteration of the component sweep.
boostingModel_x.fit(feature_train,label_train['x'])
boostingModel_y.fit(feature_train,label_train['y'])

In [None]:
# Test-set residuals per coordinate, rescaled from z-score units with label_std.
error_x = (boostingModel_x.predict(feature_test)-label_test['x'])*label_std['x']
error_y = (boostingModel_y.predict(feature_test)-label_test['y'])*label_std['y']

In [None]:
# (RMSE of x, RMSE of y, mean Euclidean position error) — same triple as stored in collection_dict.
(error_x**2).mean()**0.5,(error_y**2).mean()**0.5,((error_x**2 + error_y**2)**0.5).mean()

In [None]:
# Rows where the network predicted exactly 0 for both coordinates.
wrong_mask = labelHat_df.abs().sum(axis=1)==0
wrong_ind = label_df[wrong_mask].index

In [None]:
# Display the index values of the all-zero predictions.
wrong_ind

In [None]:
# Wrap both normalised matrices as DataFrames that share onlyWIFI_df's index.
normA_1_df = pd.DataFrame(normA_1,index=onlyWIFI_df.index)
normA_2_df = pd.DataFrame(normA_2,index=onlyWIFI_df.index)

# .reindex(wrong_ind)#.sum(axis=1)

In [None]:
# normA_2_df

In [None]:
# Plot every column of normA_2_df in the top panel; the second panel (ax3[1]) is left empty.
fig3 = plt.figure(3, figsize=(20,10))
ax3 = fig3.subplots(ncols=1,nrows=2)
ax3[0].plot(normA_2_df)

In [None]:
# Row sums of the raw Wi-Fi readings for the rows with all-zero predictions (NaN readings are skipped by sum).
onlyWIFI_df.reindex(wrong_ind).sum(axis=1)

## Draft

In [None]:
import sklearn.datasets

In [None]:
import tensorflow as tf

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt


def window_data(data, win_size, batch_size):
    """Turn a sequence into batched (window, next-row) training pairs.

    Slides a window of ``win_size + 1`` consecutive rows over ``data`` with a
    shift of 1 (incomplete windows are dropped), splits each window into its
    first ``win_size`` rows and its last row (kept as a length-1 slice),
    shuffles with a buffer of 10, then batches by ``batch_size`` and
    prefetches one batch.

    Returns the resulting ``tf.data.Dataset``.
    """
    span = win_size + 1

    def split_window(window):
        return window[:-1], window[-1:]

    return (
        tf.data.Dataset.from_tensor_slices(data)
        .window(span, shift=1, drop_remainder=True)
        .flat_map(lambda chunk: chunk.batch(span))  # each window is itself a dataset; flatten it to one tensor
        .map(split_window)
        .shuffle(buffer_size=10)        # optional
        .batch(batch_size)
        .prefetch(1)
    )


def model_forecast(model, data, win_size):
    """Run ``model.predict`` over every sliding window of ``win_size`` rows of ``data``.

    Windows advance by one row, incomplete windows are dropped, and the
    windows are fed to the model in batches of 32. Returns whatever
    ``model.predict`` returns.
    """
    windows = (
        tf.data.Dataset.from_tensor_slices(data)
        .window(win_size, shift=1, drop_remainder=True)
        .flat_map(lambda chunk: chunk.batch(win_size))
        .batch(32)
        .prefetch(1)
    )
    return model.predict(windows)

In [None]:
# NOTE(review): load_boston appears to have been removed in scikit-learn 1.2 — confirm
# against the installed version before relying on this draft cell.
temp = sklearn.datasets.load_boston()

In [None]:
# Standardise features (per column) and target, then split at row 400 without shuffling.
X, y = temp['data'], temp['target']

X = (X - X.mean(axis=0)) / X.std(axis=0)
y = (y - y.mean()) / y.std()
X_train, X_test = X[:400], X[400:]
y_train, y_test = y[:400], y[400:]

In [None]:
# Summary statistics of the raw (un-standardised) target.
pd.Series(temp['target']).describe()

In [None]:
# Treat the rows of X as a time series: index them by evenly spaced values from 1 to 100
# and append the target as column 'y'.
timeLength = X.shape[0]
np.random.seed(2)   # only matters for the disabled random-data lines below
t = np.linspace(1,100,timeLength).reshape(timeLength,-1)

# X = np.random.random([timeLength,2])/10+0.5
# X = np.hstack([X, np.cos(t*10)/2+0.5])
data = pd.DataFrame(X,index=t.squeeze())
data['y'] = y
# data = data.fillna(0.5)

# data.iloc[:50,:].plot(figsize=(15,5))

In [None]:
win_size = 3
batch_size = 64
data_processed = window_data(data, win_size, batch_size)

# Bidirectional LSTM over windows of win_size rows, followed by two dense layers.
my_model = tf.keras.Sequential()
# Per-sample input shape is (time steps, channels). It previously included the
# batch size and hard-coded 3 channels, while `data` has data.shape[1] columns.
my_model.add(tf.keras.Input(shape=(win_size, data.shape[1])))
my_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16,return_sequences=True)))
my_model.add(tf.keras.layers.Dense(30,activation='relu'))
# Linear output: the columns of `data` are standardised, so targets can be negative.
my_model.add(tf.keras.layers.Dense(1))
# my_model.add(tf.keras.layers.Lambda(lambda x: x*3))
# NOTE(review): window_data yields the whole last row of each window as the target,
# while this model emits one value per time step — confirm the intended target.

# `lr` is a deprecated alias; `learning_rate` is the supported keyword.
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2,momentum=0.9)
my_model.compile(loss='mae',optimizer=optimizer)
my_model.fit(data_processed,epochs=10)

In [None]:
win_size = 5
batch_size = 128
# data_processed = window_data(data, win_size, batch_size)

# Plain MLP baseline on the standardised features.
MLP_model = tf.keras.Sequential()
MLP_model.add(tf.keras.layers.Dense(30,activation='relu'))
# Linear output: y_train is standardised, so a ReLU output could never reach the negative targets.
MLP_model.add(tf.keras.layers.Dense(1))

# `lr` is a deprecated alias; `learning_rate` is the supported keyword.
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-4)
MLP_model.compile(loss='mae',optimizer=optimizer)
MLP_model.fit(X_train,y_train,epochs=100)

In [None]:
# NOTE(review): `a` is only assigned in the last cell of this notebook, so this fails on a fresh kernel.
a

In [None]:
# NOTE(review): `a` is only assigned in the last cell of this notebook, so this fails on a fresh kernel.
a

In [None]:

# Toy illustration: the same curve sampled as two series with interleaved gaps.
temp = np.arange(20)
a = temp**1.1
b = temp**1.1
a[::2] = np.nan    # `a` keeps only the odd positions
c = a.copy()
c[np.isnan(c)] = -99   # `a` with its gaps filled by -99; only used by the disabled scatter below
b[1::2] = np.nan   # `b` keeps only the even positions
plt.figure(figsize=(15,5))
plt.scatter(temp,a)
plt.scatter(temp,b)
# plt.scatter(temp,c)