# Data Modeling: Long Short Term Memory (LSTM) Neural Network Implementation

We'll be using Keras through TensorFlow to implement an LSTM neural network. We hope to get a good prediction of the true location of the smartphones. We'll use a different model for floor predictions.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
from pathlib import Path
import random
import os
import sys
import glob
import pickle

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K
import tensorflow_addons as tfa
from tensorflow_addons.layers import WeightNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

from datetime import datetime

from dataclasses import dataclass

from sklearn.preprocessing import MinMaxScaler

In [None]:
# copy from https://github.com/location-competition/indoor-location-competition-20/blob/master/io_f.py

@dataclass
class ReadData:
    """Per-sensor arrays parsed from a single trace file by ``read_data_file``.

    Each field holds one row per record of the matching ``TYPE_*`` line in the
    file; a sensor with no records yields an empty array.
    """
    # Numeric rows [timestamp, x, y, z] from TYPE_ACCELEROMETER lines.
    acce: np.ndarray
    # Numeric rows [timestamp, x, y, z] from TYPE_ACCELEROMETER_UNCALIBRATED lines.
    acce_uncali: np.ndarray
    # Numeric rows [timestamp, x, y, z] from TYPE_GYROSCOPE lines.
    gyro: np.ndarray
    # Numeric rows [timestamp, x, y, z] from TYPE_GYROSCOPE_UNCALIBRATED lines.
    gyro_uncali: np.ndarray
    # Numeric rows [timestamp, x, y, z] from TYPE_MAGNETIC_FIELD lines.
    magn: np.ndarray
    # Numeric rows [timestamp, x, y, z] from TYPE_MAGNETIC_FIELD_UNCALIBRATED lines.
    magn_uncali: np.ndarray
    # Numeric rows [timestamp, x, y, z] from TYPE_ROTATION_VECTOR lines.
    ahrs: np.ndarray
    # String rows [timestamp, ssid, bssid, rssi, lastseen_ts] from TYPE_WIFI lines.
    wifi: np.ndarray
    # String rows [timestamp, 'uuid_major_minor', rssi] from TYPE_BEACON lines.
    ibeacon: np.ndarray
    # Numeric rows [timestamp, x, y] from TYPE_WAYPOINT lines.
    waypoint: np.ndarray


def read_data_file(data_filename):
    """Parse one tab-separated sensor trace file into a ``ReadData`` record.

    Blank lines and lines starting with ``#`` are skipped. Every other line is
    split on tabs; field 0 is the timestamp and field 1 the record type.
    Records whose type is not listed in the dispatch table below are ignored.

    Args:
        data_filename: Path of the UTF-8 text file to read.

    Returns:
        A ``ReadData`` whose fields are ``np.ndarray`` objects, one row per
        parsed record (an empty array when a record type never occurs).
    """

    def _xyz(fields):
        # Three-axis sensor record: integer timestamp + fields 2..4 as floats.
        return [int(fields[0]), float(fields[2]), float(fields[3]), float(fields[4])]

    def _waypoint(fields):
        # Waypoint record: integer timestamp + x/y position as floats.
        return [int(fields[0]), float(fields[2]), float(fields[3])]

    def _wifi(fields):
        # Kept as strings: timestamp, ssid, bssid, rssi, last-seen timestamp.
        return [fields[0], fields[2], fields[3], fields[4], fields[6]]

    def _beacon(fields):
        # Kept as strings: timestamp, "uuid_major_minor" identifier, rssi.
        return [fields[0], '_'.join([fields[2], fields[3], fields[4]]), fields[6]]

    # Record type -> (name of the ReadData field it feeds, row parser).
    parsers = {
        'TYPE_WAYPOINT': ('waypoint', _waypoint),
        'TYPE_ACCELEROMETER': ('acce', _xyz),
        'TYPE_ACCELEROMETER_UNCALIBRATED': ('acce_uncali', _xyz),
        'TYPE_GYROSCOPE': ('gyro', _xyz),
        'TYPE_GYROSCOPE_UNCALIBRATED': ('gyro_uncali', _xyz),
        'TYPE_MAGNETIC_FIELD': ('magn', _xyz),
        'TYPE_MAGNETIC_FIELD_UNCALIBRATED': ('magn_uncali', _xyz),
        'TYPE_ROTATION_VECTOR': ('ahrs', _xyz),
        'TYPE_WIFI': ('wifi', _wifi),
        'TYPE_BEACON': ('ibeacon', _beacon),
    }

    # Positional order expected by the ReadData constructor.
    field_order = ('acce', 'acce_uncali', 'gyro', 'gyro_uncali', 'magn',
                   'magn_uncali', 'ahrs', 'wifi', 'ibeacon', 'waypoint')
    rows = {field: [] for field in field_order}

    with open(data_filename, 'r', encoding='utf-8') as file:
        for raw_line in file:
            text = raw_line.strip()
            if not text or text[0] == '#':
                continue

            fields = text.split('\t')
            target = parsers.get(fields[1])
            if target is None:
                continue

            field, parse = target
            rows[field].append(parse(fields))

    return ReadData(*(np.array(rows[field]) for field in field_order))

In [None]:
# Column names used when turning each ReadData field into a DataFrame.
# Keys match the ReadData field names; every list starts with 'ts', the
# timestamp column that make_unified_csv uses as the index of each frame.
dataset_cols = { # Dictionary containing the names of the cols of each dataframe
    'acce': ['ts', 'x_acce', 'y_acce', 'z_acce'],
    'acce_uncali': ['ts', 'x_acce_uncali', 'y_acce_uncali', 'z_acce_uncali'],
    'gyro': ['ts', 'x_gyro', 'y_gyro', 'z_gyro'],
    'gyro_uncali': ['ts', 'x_gyro_uncali', 'y_gyro_uncali', 'z_gyro_uncali'],
    'magn': ['ts', 'x_magn', 'y_magn', 'z_magn'],
    'magn_uncali': ['ts', 'x_magn_uncali', 'y_magn_uncali', 'z_magn_uncali'],
    'ahrs': ['ts', 'x_ahrs', 'y_ahrs', 'z_ahrs'],
    'wifi': ['ts', 'ssid', 'bssid', 'rssi_wifi', 'lastseen_ts'],
    'ibeacon': ['ts', 'uuid_major_minor', 'rssi_ibeacon'],
    'waypoint': ['ts', 'x', 'y']
}

def make_unified_csv(file_name):
    """Build one wide, time-indexed DataFrame from a single trace file.

    Every sensor array returned by ``read_data_file`` becomes a DataFrame
    indexed by its numeric timestamp (one row per distinct timestamp), and all
    of them are joined column-wise on that timestamp.

    Args:
        file_name: Path of the trace ``.txt`` file to load.

    Returns:
        A DataFrame indexed by ``ts_datetime`` that contains the columns listed
        in ``dataset_cols`` plus ``lastseen_ts_datetime``, ``ts``, ``ts_diff``
        and ``path`` (the file name without its extension). The waypoint
        columns ``x`` and ``y`` are interpolated over time.
    """
    df_dict = {}
    file = read_data_file(file_name)
    # Only attributes that have a column definition can become a frame.
    dataset_names = [
        attr for attr in dir(file)
        if not attr.startswith('__') and attr in dataset_cols
    ]
    for dataset in dataset_names:
        # Looked up outside the try so the fallback below always sees the
        # column names of the dataset currently being processed.
        cols = dataset_cols[dataset]
        try:
            vals = getattr(file, dataset) # Getting values for dataset
            frame = pd.DataFrame(data=vals, columns=cols) # Creating dataset using cols and vals
            frame.drop_duplicates('ts', inplace=True) # Dropping duplicate timestamps
            frame['ts'] = pd.to_numeric(frame['ts']) # Converting timestamp to numeric type
            frame = frame.set_index('ts') # Setting timestamp as index
        except Exception:
            # Deliberate best effort: a sensor that is absent from the file (or
            # cannot be framed) contributes an empty frame with the right
            # columns. Unlike a bare ``except`` this lets KeyboardInterrupt
            # and SystemExit propagate.
            frame = pd.DataFrame(columns=cols).set_index('ts')
        df_dict[dataset] = frame

    df_dict['wifi']['lastseen_ts_datetime'] = pd.to_numeric(df_dict['wifi']['lastseen_ts'])
    df_dict['wifi']['lastseen_ts_datetime'] = pd.to_datetime(df_dict['wifi']['lastseen_ts_datetime'], unit='ms')

    # Merging df_dict to create master dataframe
    merged = pd.concat(df_dict, axis=1)
    # 'Flattening' dataframe column names: keep the inner column name and
    # restore the plain 'ts' label for the column created by reset_index
    master = merged.reset_index()
    master.columns = [i[1] if i != ('ts','') else i[0] for i in master.columns]

    # Converting rssi columns to numeric
    master['rssi_wifi'] = pd.to_numeric(master['rssi_wifi'])
    master['rssi_ibeacon'] = pd.to_numeric(master['rssi_ibeacon'])

    # Creating separate column with timestamps as datetimes
    master['ts_datetime'] = pd.to_numeric(master['ts'])
    master['ts_datetime'] = pd.to_datetime(master['ts_datetime'], unit='ms')

    # calculated column: difference of time between timestamps
    master['ts_diff'] = master['ts'].diff()

    # set index to ts_datetime (required by the time-based interpolation below)
    master.set_index('ts_datetime', inplace=True)

    # interpolate x and y
    master['x'] = master['x'].interpolate(method='time')
    master['y'] = master['y'].interpolate(method='time')

    # set path (metadata): file name without directory or extension
    master['path'] = Path(file_name).stem

    return master
    

sample_file = make_unified_csv('../input/indoor-location-navigation/train/5cd56c0ce2acfd2d33b6ab27/B1/5d09a625bd54340008acddb9.txt')
# sample_file = read_data_file('../input/indoor-location-navigation/train/5cd56c0ce2acfd2d33b6ab27/B1/5d09a625bd54340008acddb9.txt')

sample_file

## I. Loading Data and Visualizing Paths

Code courtesy of [@titericz](https://www.kaggle.com/titericz), from his notebook '[EDA - Loading Data and Visualizing Paths](https://www.kaggle.com/titericz/eda-loading-data-and-visualizing-paths)'.

In [None]:
!ls ../input/indoor-location-navigation

In [None]:
# NOTE(review): glob.glob returns paths in a filesystem-dependent order, so
# trainfiles[0] (loaded by a later cell) is not guaranteed to be the same file
# on every machine - confirm before relying on a specific trace.
trainfiles = glob.glob('../input/indoor-location-navigation/train/*/*/*')
len(trainfiles), trainfiles[0]

In [None]:
!head ../input/indoor-location-navigation/train/5cd56c0ce2acfd2d33b6ab27/B1/5d09a625bd54340008acddb9.txt

In [None]:
def load_dict( filename ):
    """Load a trace file into a nested ``{timestamp: {record_type: values}}`` dict.

    Blank lines and lines starting with ``#`` are skipped. Every other line is
    split on tabs: field 0 is the timestamp, field 1 the record type, and the
    remaining fields are stored as a list of strings. When the same timestamp
    and record type occur more than once, the last line wins.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        Dict mapping timestamp strings to dicts that map record-type strings
        to the list of remaining string fields.

    Raises:
        IndexError: If a data line has fewer than two tab-separated fields.
    """
    proc = {}
    with open(filename, encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.rstrip('\n')

            # Blank lines used to crash on line[0]; skip them like comments.
            if not line or line[0] == '#':
                continue

            val = line.split('\t')
            proc.setdefault(val[0], {})[val[1]] = val[2:]

    return proc

In [None]:
path = load_dict( trainfiles[0] )

len(path)

In [None]:
path['1560913374746']

In [None]:
def plot_values( path, var='TYPE_ACCELEROMETER', cumsum=False ):
    """Plot the first two components of one record type against each other.

    Args:
        path: Nested dict as returned by ``load_dict``
            (timestamp -> record type -> list of string values).
        var: Record type to plot; timestamps without it are skipped.
        cumsum: When true, plot the cumulative sums of both components
            instead of the raw values.
    """
    readings = [path[ts][var] for ts in path if var in path[ts]]
    first = [float(values[0]) for values in readings]
    second = [float(values[1]) for values in readings]

    if cumsum:
        first, second = np.cumsum(first), np.cumsum(second)
    plt.plot( first, second )

In [None]:
plot_values( path, var='TYPE_ACCELEROMETER', cumsum=True  )
plot_values( path, var='TYPE_ACCELEROMETER_UNCALIBRATED', cumsum=True  )

In [None]:
plot_values( path, var='TYPE_MAGNETIC_FIELD', cumsum=False  )
plot_values( path, var='TYPE_MAGNETIC_FIELD_UNCALIBRATED', cumsum=False  )

In [None]:
plot_values( path, var='TYPE_GYROSCOPE', cumsum=True  )
plot_values( path, var='TYPE_GYROSCOPE_UNCALIBRATED', cumsum=True  )

In [None]:
plot_values( path, var='TYPE_ROTATION_VECTOR', cumsum=False  )

# II. Creating and Implementing the LSTM 

### Part 0.1: Prep the data (Sensors - acce, gyro, magne, ahrs)

Using functions borrowed from our second EDA notebook

In [None]:
# Getting test and train files. glob returns paths in arbitrary order, so they
# are sorted to make the population (and any seeded sample of it) reproducible.
train_files = sorted(glob.glob('../input/indoor-location-navigation/train/*/*/*'))
test_files = sorted(glob.glob('../input/indoor-location-navigation/test/*'))

# Sampling train files without replacement. Only a fixed handful of traces is
# used to keep the notebook fast; the count is capped at the number of files
# available so that a small dataset does not make random.sample raise.
N_SAMPLED_TRAIN_FILES = 10
sampled_train = random.sample(train_files, min(N_SAMPLED_TRAIN_FILES, len(train_files)))

In [None]:
len(sampled_train)

In [None]:
# Build every per-trace frame first and concatenate once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration.
sensor_frames = []

for train_file in sampled_train:
    train = make_unified_csv(train_file)
    # Keep only the rows that carry an accelerometer reading
    train.dropna(subset=['x_acce', 'y_acce'], inplace=True)
    sensor_frames.append(train)

# pd.concat raises on an empty list, so fall back to an empty frame
master_sensor_train = pd.concat(sensor_frames) if sensor_frames else pd.DataFrame()

master_sensor_train.head(5)

### Part 0.2: Prep the data (Wifi features)

Code courtesy of [@kokitanisaka](https://www.kaggle.com/kokitanisaka)'s [awesome work](https://www.kaggle.com/kokitanisaka/lstm-by-keras-with-unified-wi-fi-feats).

(Ignore for now)

In [None]:
# options

N_SPLITS = 10

SEED = 2021

NUM_FEATS = 20 # number of features that we use. there are 100 feats but we don't need to use all of them

base_path = '/kaggle'

In [None]:
def set_seed(seed=42):
    """Seed the random number generators used here and install a single-threaded TF session.

    Seeds Python's ``random``, NumPy and TensorFlow, exports ``PYTHONHASHSEED``,
    and registers a TF1-compat session limited to one intra-op and one
    inter-op thread as the Keras backend session.

    Args:
        seed: Integer seed applied to every generator.
    """
    tf1 = tf.compat.v1

    random.seed(seed)
    # NOTE(review): per the Python docs the hash seed is read at interpreter
    # start-up, so this presumably only affects child processes - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    single_thread_config = tf1.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    tf1.keras.backend.set_session(
        tf1.Session(graph=tf1.get_default_graph(), config=single_thread_config)
    )
    
def comp_metric(xhat, yhat, fhat, x, y, f):
    """Return the mean position error with a floor-mismatch penalty.

    For every sample the error is the Euclidean distance between the predicted
    point ``(xhat, yhat)`` and the true point ``(x, y)`` plus 15 times the
    absolute difference between the predicted and true floor.

    Args:
        xhat, yhat, fhat: Predicted x, y and floor arrays.
        x, y, f: Ground-truth x, y and floor arrays.

    Returns:
        The summed per-sample error divided by ``xhat.shape[0]``.
    """
    planar_error = np.sqrt(np.power(xhat-x, 2) + np.power(yhat-y, 2))
    floor_penalty = 15 * np.abs(fhat-f)
    per_sample = planar_error + floor_penalty
    return per_sample.sum()/xhat.shape[0]

In [None]:
# wifi_dir = f"{base_path}/input/indoorunifiedwifids"
# wifi_train_files = sorted(glob.glob(os.path.join(wifi_dir, '*_train.csv')))
# wifi_test_files = sorted(glob.glob(os.path.join(wifi_dir, '*_test.csv')))
# subm = pd.read_csv(f'{base_path}/input/indoor-location-navigation/sample_submission.csv', index_col=0)

# master_wifi_train = pd.DataFrame()

# for i, train_file in enumerate(wifi_train_files):
#     #print(i)
#     train = pd.read_csv(train_file)
#     master_wifi_train = pd.concat([master_wifi_train, train])


### Part 1: Pre-process (scale) the data

We might want to scale the data to normalize it before plugging it in into a neural network.

Here's a list of scalers available in sklearn:

![](https://www.kdnuggets.com/wp-content/uploads/sklearn-scalers.png)

### Part 2: Split the data into internal train and test sets

A common way is 75% train and 25% test

### Part 3: Build the architecture of the model

From [datacamp.com](https://www.datacamp.com/community/tutorials/lstm-python-stock-market):

Long Short-Term Memory models are extremely powerful time-series models. They can predict an arbitrary number of steps into the future. An LSTM module (or cell) has 5 essential components which allow it to model both long-term and short-term data.

* Cell state (ct) - This represents the internal memory of the cell which stores both short term memory and long-term memories
* Hidden state (ht) - This is output state information calculated w.r.t. current input, previous hidden state and current cell input which you eventually use to predict the target. Additionally, the hidden state can decide to only retrieve the short-term or long-term or both types of memory stored in the cell state to make the next prediction.
* Input gate (it) - Decides how much information from current input flows to the cell state
* Forget gate (ft) - Decides how much information from the current input and the previous cell state flows into the current cell state
* Output gate (ot) - Decides how much information from the current cell state flows into the hidden state, so that if needed LSTM can only pick the long-term memories or short-term memories and long-term memories

![](http://res.cloudinary.com/dyd911kmh/image/upload/f_auto,q_auto:best/v1523953369/lstm_xszk4d.png)

#### Data Generation and Augmentation

You are first going to implement a data generator to train your model. This data generator will have a method called .unroll_batches(...) which will output a set of num_unrollings batches of input data obtained sequentially, where a batch of data is of size [batch_size, 1]. Then each batch of input data will have a corresponding output batch of data.

Also, to make your model robust, you will not always use $x_{t+1}$ as the output for $x_t$. Rather, you will randomly sample an output from the set $x_{t+1}, x_{t+2}, \ldots, x_{t+N}$, where $N$ is a small window size.

Assumption: $x_{t+1}, x_{t+2}, \ldots, x_{t+N}$ will not be very far from each other.

![](http://res.cloudinary.com/dyd911kmh/image/upload/f_auto,q_auto:best/v1523953369/batch_pno02e.png)

In [None]:

class DataGeneratorSeq:
    """Generate sequential mini-batches from a 1-D series of scalar features.

    The first ``len(feats) - num_unroll`` positions are divided into
    ``batch_size`` equal segments and one cursor walks through each segment.
    Every call to ``next_batch`` emits the value under each cursor as input and
    a value a few steps ahead (chosen at random) as label, then advances the
    cursors by one.

    Args:
        feats: Integer-indexable 1-D sequence of numbers (e.g. a NumPy array).
        batch_size: Number of samples (cursors) per batch.
        num_unroll: Number of consecutive batches returned by
            ``unroll_batches``; that many trailing positions are also kept out
            of the cursor range.
        label_window: The label is taken ``0 .. label_window - 1`` steps ahead
            of the input. Defaults to 5, the value that used to be hard-coded.

    Raises:
        ValueError: If ``feats`` has no more than ``num_unroll`` elements or
            ``label_window`` is smaller than 1.
    """

    def __init__(self,feats,batch_size,num_unroll,label_window=5):
        usable_length = len(feats) - num_unroll
        if usable_length < 1:
            raise ValueError(
                'feats must contain more than num_unroll (%d) elements, got %d'
                % (num_unroll, len(feats)))
        if label_window < 1:
            raise ValueError('label_window must be at least 1, got %d' % label_window)

        self._feats = feats
        self._feats_length = usable_length
        self._batch_size = batch_size
        self._num_unroll = num_unroll
        self._label_window = label_window
        self._segments = self._feats_length //self._batch_size
        self._cursor = [offset * self._segments for offset in range(self._batch_size)]

    def next_batch(self):
        """Return one ``(inputs, labels)`` pair of float32 arrays of length ``batch_size``."""

        batch_data = np.zeros((self._batch_size),dtype=np.float32)
        batch_labels = np.zeros((self._batch_size),dtype=np.float32)
        last_index = len(self._feats) - 1

        for b in range(self._batch_size):
            if self._cursor[b]+1>=self._feats_length:
                # Cursor reached the end of the usable range: restart it at a
                # random position. max(1, ...) keeps randint valid when there
                # are more cursors than positions (segments == 0).
                self._cursor[b] = np.random.randint(0,max(1,(b+1)*self._segments))

            batch_data[b] = self._feats[self._cursor[b]]
            # Clamp the look-ahead so it can never run past the end of feats
            # (this used to raise IndexError when num_unroll < 4).
            label_index = self._cursor[b]+np.random.randint(0,self._label_window)
            batch_labels[b] = self._feats[min(label_index,last_index)]

            self._cursor[b] = (self._cursor[b]+1)%self._feats_length

        return batch_data,batch_labels

    def unroll_batches(self):
        """Return ``num_unroll`` consecutive batches as two lists (inputs, labels)."""

        unroll_data,unroll_labels = [],[]
        for _ in range(self._num_unroll):

            data, labels = self.next_batch()

            unroll_data.append(data)
            unroll_labels.append(labels)

        return unroll_data, unroll_labels

    def reset_indices(self):
        """Move every cursor to a random position inside the usable range."""
        for b in range(self._batch_size):
            upper = min((b+1)*self._segments,self._feats_length-1)
            # max(1, ...) avoids randint(0, 0) for very short series
            self._cursor[b] = np.random.randint(0,max(1,upper))



# DataGeneratorSeq indexes its input by integer position and stores scalars, so
# it needs a 1-D numeric array; passing the whole DataFrame makes feats[0] a
# column lookup that raises KeyError.
# NOTE(review): x_acce is used here only as a demo signal - confirm which
# feature the generator is meant to consume.
dg = DataGeneratorSeq(master_sensor_train['x_acce'].to_numpy(),5,5)
u_data, u_labels = dg.unroll_batches()

for ui,(dat,lbl) in enumerate(zip(u_data,u_labels)):
    print('\n\nUnrolled index %d'%ui)
    print('\tInputs: ',dat )
    print('\n\tOutput:',lbl)

#### Define Hyperparameters

In [None]:
# NOTE: the zeros below are placeholders that still have to be filled in.
D = 0 # Dimensionality of the data. Since your data is 1-D this would be 1
num_unrollings = 0 # Number of time steps you look into the future.
batch_size = 0 # Number of samples in a batch
num_nodes = [0,0,0] # Number of hidden nodes in each layer of the deep LSTM stack we're using
n_layers = len(num_nodes) # number of layers
dropout = 0.0 # dropout amount

# TensorFlow 2 only exposes reset_default_graph under tf.compat.v1; the
# top-level tf.reset_default_graph() raises AttributeError there.
tf.compat.v1.reset_default_graph() # This is important in case you run this multiple times

#### Define Inputs and Outputs

Next you define placeholders for training inputs and labels. This is very straightforward as you have a list of input placeholders, where each placeholder contains a single batch of data. And the list has num_unrollings placeholders, that will be used at once for a single optimization step.

In [None]:
train_inputs, train_outputs = [],[]

#### Defining Parameters of the LSTM and Regression layer

#### Loss Calculation and Optimizer

### Part 4: Running the LSTM

### Part 5: Visualization and Prediction 

# IV. Predicting Floor Data

(Don't worry about this, we'll use another notebook for floor predictions).

# V. Submitting