In [None]:
import numpy as np 
import pandas as pd
import os
import glob
from PIL import Image
import matplotlib.pyplot as plt
import itertools
import seaborn as sns
from pandas_profiling import ProfileReport

# **Read Data**
#### reading in the data files

In [None]:
# List the top-level entries of the competition data directory (absolute Kaggle path).
os.listdir('/kaggle/input/indoor-location-navigation')

In [None]:
# Input locations, all relative to the notebook's working directory.
_DATA_ROOT = "../input/indoor-location-navigation"

train_dir = _DATA_ROOT + "/train"
test_dir = _DATA_ROOT + "/test"
meta_dir = _DATA_ROOT + "/metadata"
ss = _DATA_ROOT + "/sample_submission.csv"

In [None]:
# Column layout used when parsing a log file: timestamp, reading type, then eight
# positional reading fields (not every reading type fills all eight).
train_names = ['Time', 'Type'] + ['reading_'+str(x)+'_' for x in range(1,9)]
test_names = list(train_names)

# glob returns matches in arbitrary order, so sort before sampling: otherwise the
# nine files picked below (and the WalkID each one receives) can change between runs.
train_files = sorted(glob.glob(os.path.join(train_dir, "**/*.txt"), recursive=True))
test_files = sorted(glob.glob(os.path.join(test_dir, "**/*.txt"), recursive=True))

# Only the first nine files of each set are used in the rest of the notebook.
train_files = train_files[0:9]
test_files = test_files[0:9]

def find_floor(FloorName):
    """Convert a floor directory name into an integer floor level.

    Parameters
    ----------
    FloorName : str
        Floor label made of a one-letter prefix and a number, e.g. 'B1' or 'F3'.

    Returns
    -------
    int
        ``-n`` for basement 'Bn', ``n - 1`` for 'Fn' (so 'F1' is level 0), and the
        sentinel ``-99`` for any label that does not follow that pattern.
    """
    unknown_level = -99
    floor_type = FloorName[:1]
    if floor_type not in ('B', 'F'):
        return unknown_level
    # Parse the number only after the prefix check, and treat a non-numeric suffix
    # as "unknown" instead of letting int() raise before the fallback is reached.
    try:
        floor_number = int(FloorName[1:])
    except ValueError:
        return unknown_level
    if floor_type == 'B':
        return -floor_number
    return floor_number - 1

def read_files(files, names):
    """Read tab-separated log files into a single long-format DataFrame.

    Lines starting with '#' are skipped. Each file's rows are tagged with:

    * ``WalkID`` - the position of the file in ``files``;
    * ``SiteID`` / ``Floor`` - taken from path components 4 and 5 of the file's own
      path (split on '/'), which is where the site and floor directories sit for
      paths under ``train_dir``. Paths too short to contain a site directory, a
      floor directory and a file name at those positions get NaN for both.

    Zero values in the columns read from the file are replaced with NaN; the
    identifier columns are left untouched so WalkID 0 and Floor 0 survive.

    Parameters
    ----------
    files : list of str
        Paths of the log files to read.
    names : list of str
        Column names to assign to the tab-separated fields.

    Returns
    -------
    pandas.DataFrame
        Columns ``['WalkID', 'SiteID', 'Floor'] + names``; empty (with those
        columns) when ``files`` is empty.
    """
    columns = ['WalkID', 'SiteID', 'Floor'] + names
    frames = []
    # enumerate gives the walk index directly (files.index() rescans the list for
    # every file and returns the wrong position for a repeated path).
    for walk_id, file in enumerate(files):
        file_df = pd.read_csv(file, sep='\t', comment='#', header=None, names=names)
        # Blank out zeros before the ID columns are attached so they are not affected.
        file_df = file_df.replace(0, np.nan)
        file_df['WalkID'] = walk_id
        deets = file.split("/")
        if len(deets) > 6:
            file_df['SiteID'] = deets[4]
            file_df['Floor'] = find_floor(deets[5])
        else:
            # No site/floor directories at the expected positions in this path.
            file_df['SiteID'] = np.nan
            file_df['Floor'] = np.nan
        frames.append(file_df[columns])
    if not frames:
        return pd.DataFrame(columns=columns)
    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(frames)

# Parse the sampled train and test logs into long-format frames, and load the
# sample submission CSV.
raw_train_df = read_files(train_files, train_names)
raw_test_df = read_files(test_files, test_names)
sample_submission = pd.read_csv(ss)


# additional data:
# start time
# floor name
# phone details
# sensor details

In [None]:
#file_df = pd.read_csv(train_files[0], sep='\t', comment='#', header=None, names=train_names)
#file_df[file_df['Type']=='TYPE_WAYPOINT']

In [None]:
#with open(train_files[0], "r") as fh:
#    for line in fh.readlines():
#        print(line)
#    fh.close()

# **EDA**
#### exploring the available data, as well as its health and completeness

In [None]:
# Recursively collect the three kinds of metadata file found under meta_dir.
floor_images, floor_info, GeoMaps = [
    glob.glob(os.path.join(meta_dir, "**/" + file_pattern), recursive=True)
    for file_pattern in ("*.png", "floor_info.json", "geojson_map.json")
]

# Report how many files of each kind were found.
for label, found in (
    ("Number of Floor Images in Meta Data: ", floor_images),
    ("Number of Floor Info(in JSON) in Meta Data: ", floor_info),
    ("Number of Geo Map (in JSON) in Meta Data: ", GeoMaps),
):
    print(label, len(found))

In [None]:
# Display five floor images drawn at random (with replacement; no seed is set,
# so the selection differs from run to run).
for _ in range(5):
    random_index = np.random.randint(0, len(floor_images))
    img = Image.open(floor_images[random_index])
    display(img)

In [None]:
def read_txt(txt_path):
    """Collect the tab-separated fields of every '#'-prefixed line in a log file.

    Only lines that start with '#' are read; all other lines are ignored. Each such
    line is split on tabs, and a field that is exactly '#' is recorded as an empty
    string.

    Parameters
    ----------
    txt_path : str
        Path of the text file to scan.

    Returns
    -------
    list of str
        The fields in file order. Despite the variable name, the values are not
        de-duplicated.
    """
    unique_keys = []
    # The with-block closes the file; iterating the handle avoids loading every line at once.
    with open(txt_path, 'r') as fh:
        for line in fh:
            if not line.startswith("#"):
                continue
            fields = line.rstrip("\n").split("\t")
            unique_keys.extend('' if field == "#" else field for field in fields)
    return unique_keys

# Show the '#'-line fields of the first sampled training file.
read_txt(train_files[0])

In [None]:
# Number of non-null Time values for each reading type in the sampled training data.
raw_train_df.groupby('Type').agg({'Time': 'count'})

In [None]:
# Missing values per column of the training frame, broken down by reading type.
train_is_null = raw_train_df.isnull()
null_counts = train_is_null.groupby(raw_train_df['Type']).sum().astype(int)
print(null_counts)
# Open question: should 'TYPE_MAGNETIC_FIELD_UNCALIBRATED' have no nulls?
#raw_train_df[raw_train_df['Type']=='TYPE_MAGNETIC_FIELD_UNCALIBRATED']

In [None]:
# Missing values per column of the test frame, broken down by reading type.
test_is_null = raw_test_df.isnull()
null_counts = test_is_null.groupby(raw_test_df['Type']).sum().astype(int)
print(null_counts)

In [None]:
# Preview the first rows of the parsed test data.
raw_test_df.head()

In [None]:
# Preview the first rows of the sample submission file.
sample_submission.head()

# **Data Prep**
#### wrangling the data into a format that we can use to build a model

| ID Variables | Target Variables | Prediction Variables |
| --- | --- | --- |
| siteID    | Floor       | TYPE_ACCELEROMETER_1 |
| walkID    | x           | TYPE_ACCELEROMETER_UNCALIBRATED_1 | 
| timestamp | y           | TYPE_BEACON_1 |
|           |             | TYPE_GYROSCOPE_1 |
|           |             | TYPE_GYROSCOPE_UNCALIBRATED_1 |
|           |             | TYPE_MAGNETIC_FIELD_1 |
|           |             | TYPE_MAGNETIC_FIELD_UNCALIBRATED_1 |
|           |             | TYPE_ROTATION_VECTOR_1 |
|           |             | TYPE_WIFI_1 |
|           |             | (projected_x) |
|           |             | (projected_y) |

## Data Cleaning

The waypoints and readings are gathered at different times, so we split them into separate data sets and interpolate the readings at each waypoint.

In [None]:
# create clean dataset of paths
#
# For TYPE_WAYPOINT rows the first two reading fields are used as the x and y
# coordinates. The filtered rows are copied first so that adding columns writes to
# an independent frame rather than to a slice of raw_train_df (which triggers
# SettingWithCopyWarning and may not take effect).
waypoint_df = raw_train_df[raw_train_df['Type'] == 'TYPE_WAYPOINT'].copy()
waypoint_df['x'] = pd.to_numeric(waypoint_df['reading_1_'])
waypoint_df['y'] = pd.to_numeric(waypoint_df['reading_2_'])
waypoint_df = waypoint_df[['WalkID', 'SiteID', 'Time', 'Floor', 'x', 'y']]

In [None]:
# Draw the waypoints of one walk in row order, marking the first point in green
# and the last point in red.

example_id = 6
walk_mask = waypoint_df['WalkID'] == example_id
example_path = waypoint_df.loc[walk_mask, ['x', 'y']].to_numpy()

last_row = len(example_path) - 1
start_x, start_y = example_path[0, 0], example_path[0, 1]
end_x, end_y = example_path[last_row, 0], example_path[last_row, 1]

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(start_x, start_y, 'go', end_x, end_y, 'ro', example_path[:, 0], example_path[:, 1])
# Offset the labels slightly so they do not sit on top of the markers.
ax.annotate('start', (start_x-0.2, start_y-0.2))
ax.annotate('end', (end_x+0.2, end_y+0.2))

In [None]:
# create clean dataset of readings

# raw_train_df.dtypes
# raw_train_df[raw_train_df['Type']=='TYPE_BEACON'].head(5)
# train_df_clean.iloc[[738,739,740]]
# many reading seem to contain SiteIDs or other non numeric values

# Everything except the waypoint rows. Copy the filtered rows so the .loc
# assignments below modify an independent frame instead of a slice of raw_train_df.
train_df_clean = raw_train_df[raw_train_df['Type'] != 'TYPE_WAYPOINT'].copy()

# Convert sensor readings to decimal floats

# These fields hold strings for the given reading types, so they are blanked out
# before the float conversion (which would otherwise fail on them).
non_numeric_fields = {
    'TYPE_WIFI': ['reading_1_', 'reading_2_'],
    'TYPE_BEACON': ['reading_1_', 'reading_2_', 'reading_3_', 'reading_7_', 'reading_8_'],
}
for reading_type, columns in non_numeric_fields.items():
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    train_df_clean.loc[train_df_clean['Type'] == reading_type, columns] = np.nan

reading_columns = ['reading_'+str(x)+'_' for x in range(1,9)]
train_df_clean = train_df_clean.astype({column: 'float64' for column in reading_columns})
train_df_clean.head(5)

In [None]:
# Pivot the long readings table to one row per (walk, site, time, floor), with one
# column per (reading field, reading type) pair; values sharing a cell are summed.
reading_columns = ['reading_'+str(x)+'_' for x in range(1,9)]
train_df_sparce = pd.pivot_table(
    train_df_clean,
    index=['WalkID', 'SiteID', 'Time', 'Floor'],
    columns=['Type'],
    values=reading_columns,
    aggfunc={column: np.sum for column in reading_columns},
)
train_df_sparce = train_df_sparce.reset_index()

# Flatten the two-level column labels into single strings such as 'reading_1_TYPE_GYROSCOPE'.
flat_column_names = [''.join(col).strip() for col in train_df_sparce.columns.values]
train_df_sparce.columns = flat_column_names

In [None]:
# Count the missing values in each column of the pivoted (wide) readings frame.
null_counts = train_df_sparce.isnull().sum().astype(int)
print(null_counts)
# Alternative kept for reference: print the counts without pandas truncating the output.
#with pd.option_context('display.max_rows', None, 'display.max_columns',  None):  # more options can be specified also
#    print(null_counts)

In [None]:
# TYPE_WIFI and TYPE_BEACON have too many nulls in every reading field (cause not
# yet understood), so all of their columns are removed.

nan_columns = [
    'reading_' + str(reading_number) + '_' + reading_type
    for reading_number in range(1, 9)
    for reading_type in ('TYPE_BEACON', 'TYPE_WIFI')
]
train_df_sparce = train_df_sparce.drop(columns=nan_columns)
# Rename the readings' timestamp so it stays distinct from the waypoint 'Time' after merging.
train_df_sparce = train_df_sparce.rename(columns={"Time": "Reading_Time"})
train_df_sparce.head()

## Readings Interpolation

In [None]:
# Pair every waypoint with every reading row that shares its walk, site and floor.
# Each waypoint row is repeated once per matching reading timestamp.
interpolation_data = waypoint_df.merge(train_df_sparce, on=['WalkID', 'SiteID', 'Floor'], how='left')
#interpolation_data['ID'] = interpolation_data['WalkID'].map(str) + '_' + interpolation_data['SiteID'].map(str)  + '_' + interpolation_data['Floor'].map(str) 

# "Previous" reading per waypoint: keep the pairs whose reading time is at or before
# the waypoint time, find the largest such reading time for each waypoint, then
# inner-merge on that time to keep only the matching reading row.
interpolation_data_prev = interpolation_data[interpolation_data['Reading_Time'] <= interpolation_data['Time']]
interpolation_data_prev_agg = interpolation_data_prev[['WalkID', 'SiteID', 'Floor', 'Time', 'Reading_Time']].groupby(['WalkID', 'SiteID', 'Floor', 'Time']).agg('max')
interpolation_data_prev = interpolation_data_prev.merge(interpolation_data_prev_agg, on=['WalkID', 'SiteID', 'Floor', 'Time', 'Reading_Time'], how='inner')

# "After" reading per waypoint: the same procedure, using the smallest reading time
# that is at or after the waypoint time.
interpolation_data_after = interpolation_data[interpolation_data['Reading_Time'] >= interpolation_data['Time']]
interpolation_data_after_agg = interpolation_data_after[['WalkID', 'SiteID', 'Floor', 'Time', 'Reading_Time']].groupby(['WalkID', 'SiteID', 'Floor', 'Time']).agg('min')
interpolation_data_after = interpolation_data_after.merge(interpolation_data_after_agg, on=['WalkID', 'SiteID', 'Floor', 'Time', 'Reading_Time'], how='inner')

In [None]:
# Put each waypoint's previous reading (columns suffixed _x) and next reading
# (suffixed _y) side by side; the outer join keeps waypoints that have only one of them.
interpolation_data = interpolation_data_prev.merge(interpolation_data_after, on=['WalkID', 'SiteID', 'Floor', 'Time', 'x', 'y'], how='outer')

# Position of the waypoint time between the two reading times, as a fraction.
# NaN when either side is missing; 1 when both readings share the same timestamp
# (avoids dividing by zero).
interpolation_data['interpolation_fraction'] = [
    np.nan if np.isnan(i) or np.isnan(m) or np.isnan(j) else 1 if j==i else (m-i)/(j-i)
    for i,m,j in zip(interpolation_data['Reading_Time_x'], interpolation_data['Time'], interpolation_data['Reading_Time_y'])
]

reading_type_names = ['TYPE_ACCELEROMETER', 'TYPE_ACCELEROMETER_UNCALIBRATED', 'TYPE_BEACON', 'TYPE_GYROSCOPE', 'TYPE_GYROSCOPE_UNCALIBRATED', 'TYPE_MAGNETIC_FIELD', 'TYPE_MAGNETIC_FIELD_UNCALIBRATED', 'TYPE_ROTATION_VECTOR', 'TYPE_WIFI']
reading_numbers = ['reading_1_', 'reading_2_', 'reading_3_']#, 'reading_4_', 'reading_5_', 'reading_6_', 'reading_7_', 'reading_8_']
# Every (reading field, reading type) column name, minus the columns dropped earlier.
reading_names = [
    number + type_name
    for number, type_name in itertools.product(reading_numbers, reading_type_names)
    if number + type_name not in nan_columns
]

for reading_name in reading_names:
    prev_readings = interpolation_data[reading_name+'_x']
    after_readings = interpolation_data[reading_name+'_y']
    # Linear interpolation between the two readings. If only one side has a value,
    # use that value as-is. (The original test `np.isnan(j) is None` was always
    # False, so a missing "after" reading produced NaN instead of the previous value.)
    interpolation_data[reading_name] = [
        j if np.isnan(i) else i if np.isnan(j) else i+((j-i)*f)
        for i,j,f in zip(prev_readings, after_readings, interpolation_data['interpolation_fraction'])
    ]


In [None]:
# Keep the identifier columns, the x/y/Floor columns and the interpolated reading
# columns; the _x/_y helper columns from the merge are left behind.
train_df = interpolation_data[['WalkID', 'SiteID', 'Time', 'Floor', 'x', 'y']+reading_names]
train_df.head()

## Path Projection

In [None]:
# NOTE: this binds a second name to the same DataFrame object; no copy is made, so
# in-place changes through either name are visible through both.
waypoint_projection_df = waypoint_df

def projected_coordinate(prev_prev_point, prev_point, prev_prev_time, prev_time, new_time):
    """Extrapolate one coordinate to ``new_time`` from the two most recent points.

    The speed between the two previous points is assumed to stay constant:
    ``prev_point + speed * (new_time - prev_time)``.

    Parameters
    ----------
    prev_prev_point, prev_point : float
        Coordinate values at the two most recent known times (oldest first).
    prev_prev_time, prev_time : number
        Timestamps of those two points.
    new_time : number
        Timestamp to project the coordinate to.

    Returns
    -------
    float
        * ``0`` when ``prev_point`` is missing;
        * ``prev_point`` when ``prev_prev_point`` is missing, or when both previous
          points share a timestamp (no speed can be estimated);
        * the linear extrapolation otherwise.
    """
    # Missing values must be detected with pd.isna: `== NaN` is never True, and
    # pandas has no `pd.NaN` attribute.
    if pd.isna(prev_point):
        return 0
    if pd.isna(prev_prev_point):
        return prev_point
    elapsed = prev_time - prev_prev_time
    if elapsed == 0:
        return prev_point
    speed = (prev_point - prev_prev_point) / elapsed
    return prev_point + speed * (new_time - prev_time)
        
#waypoint_df['projected_x'] = 
#waypoint_df['projected_y'] = 

**Additional Ideas:**
project path,
look at site_path as a whole,
project distance from sensor

For projecting path:
previous_x,
previous_y,
previous_direction,
previous_speed
-->
projected_x,
projected_y

# **Data Mining**
#### exploring the data correlations to inform the model build

In [None]:
# Box plot of every interpolated reading column on one wide figure.
# Uses the `plt` alias imported at the top of the notebook rather than a second,
# mid-notebook import, and draws on an explicit Axes.
fig, ax = plt.subplots(figsize=(20, 5), dpi=80)
train_df.boxplot(column=reading_names, ax=ax)
# Rotate the column labels so the long names do not overlap; the trailing
# semicolon suppresses the tick-list repr in the cell output.
plt.xticks(rotation=45, ha="right", fontsize=8);

In [None]:
# select the metrics and factors to correlate
# metrics: the columns to correlate against; factors: the interpolated reading columns.
metrics = ['Floor', 'x', 'y']
# Same list object as reading_names (not a copy).
factors = reading_names
#train_df.dtypes
#train_df.dtypes

In [None]:
# calculate the correlation coefficient map
#
# Correlate only the columns of interest, coerced to numeric first. Calling corr()
# on the whole frame depends on pandas silently skipping non-numeric columns such
# as SiteID: that raises in pandas >= 2.0, and in older versions an object-dtype
# 'Floor' column would be dropped too, breaking the selection below.
numeric_view = train_df[metrics + factors].apply(pd.to_numeric, errors='coerce')
corr_matrix = numeric_view.corr(method='spearman')
# Rows: reading columns; columns: Floor / x / y.
corr_matrix = corr_matrix.loc[factors, metrics]
cm = sns.diverging_palette(20, 133, sep=20, as_cmap=True)
corr_matrix.style.background_gradient(cmap=cm)

In [None]:
# plot the correlations: one figure per reading column, with one scatter panel
# for each of the three metrics
for factor in factors:
    fig, axs = plt.subplots(1, 3, figsize=(20,3))
    x_data = train_df[factor]
    for position, metric in enumerate(metrics):
        panel = axs[position]
        panel.set_title(metric+' vs '+factor)
        panel.plot(x_data, train_df[metric], 'o')

In [None]:
# Build a pandas-profiling report for the interpolated training frame, write it to
# output.html, and display the report object in the notebook.
prof = ProfileReport(train_df)
prof.to_file(output_file='output.html')
display(prof)

# **Model Builds**
#### building the predictive model(s)

# **Model Implementation**
#### applying the predictive model(s) to the test data