In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
from datetime import datetime
import pytz

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)


In [None]:
# Read the training and validation CSV files into two separate frames.
train_df, validate_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/UjiIndoorLoc/TrainingData.csv',
        '../input/UjiIndoorLoc/ValidationData.csv',
    )
)

Column names in the train dataset

In [None]:
# Show the column labels of the training frame.
train_df.columns

### check for difference in columns in train and validate dataset

In [None]:
# Compare the column labels of both frames in each direction.
train_columns = set(train_df.columns)
validate_columns = set(validate_df.columns)

diff1 = train_columns.difference(validate_columns)   # only in train
diff2 = validate_columns.difference(train_columns)   # only in validate
print(diff1, diff2)

* empty sets mean that the train and validate datasets have the same features.

## now let's check how the data is distributed

In [None]:
# Stack the training and validation rows into one frame for the exploratory analysis.
# ignore_index=True gives the result a fresh, unique index; without it both
# inputs keep their own row labels and the combined frame has duplicated labels,
# which can break index-aligned operations and some plotting calls.
df = pd.concat([train_df, validate_df], ignore_index=True)

In [None]:
# Summarise the USERID column: distinct count, distinct values, rows per value.
user_ids = df['USERID']
print(f"there are {user_ids.nunique()} different users.")
print(f"list of user ids: {user_ids.unique()}\n")
print(f"entries per user:\n{user_ids.value_counts()}")

In [None]:
# Summarise the PHONEID column: distinct count, distinct values, rows per value.
print(f"there are {df['PHONEID'].nunique()} different phones used.")
# The label previously read "user ids" (copy-paste slip) although phone ids are printed.
print(f"list of phone ids: {df['PHONEID'].unique()}\n")
print(f"entries per phone:\n{df['PHONEID'].value_counts()}")

### Q. which phones are used by which users?

In [None]:
# Distinct phone ids recorded for each user.
phone_ids_by_user = df.groupby('USERID')['PHONEID']
phone_ids_by_user.unique()

* all users except one have used a single phone.

In [None]:
# Distinct user ids recorded for each phone.
user_ids_by_phone = df.groupby('PHONEID')['USERID']
user_ids_by_phone.unique()

* two phones are shared among users, other than that, each phone is used by single user.

### Now let's analyse WAPs.

Attributes 001 to 520 (WAP001-WAP520): Intensity value for the corresponding WAP (WAP001 to WAP520).

Negative integer values from -104 to 0 and +100. The positive value 100 is used if the WAP was not detected.

(in -104 to 0) higher the better

In [None]:
# Keep only the first 520 columns of the training frame (the WAP readings).
wap_columns = train_df.columns[:520]
wap_df = train_df[wap_columns]

# Blank out the value 100 (used when a WAP was not detected), then count what is left per row.
detected_readings = wap_df[wap_df != 100]
no_of_waps = detected_readings.count(axis=1)

# NOTE: sns.distplot is deprecated in recent seaborn releases; kept here to preserve the output.
plt.figure(figsize=(10,5))
sns.distplot(no_of_waps)

* for majority of the entries 10 to 20 WAPs are detected out of 520.

### Relative position analysis

Attribute 526 (RelativePosition): Relative position with respect to the Space (1 - Inside, 2 - Outside in Front of the door). Categorical integer values.

In [None]:
# Count of training rows per RELATIVEPOSITION value.
# The column is passed as the keyword `x`: recent seaborn versions treat the first
# positional argument as `data`, which would not produce one bar per category.
sns.countplot(x=train_df['RELATIVEPOSITION'])

* the majority of entries were taken outside, in front of the door of a room.

### Buildings analysis

In [None]:
# Count of rows (train + validate) per BUILDINGID value.
# The column is passed as the keyword `x`: recent seaborn versions treat the first
# positional argument as `data`, which would not produce one bar per category.
sns.countplot(x=df['BUILDINGID'])

* building-2 has relatively more entries, let's find out why.

In [None]:
# Distinct FLOOR values observed in each building.
floors_by_building = df.groupby('BUILDINGID')['FLOOR'].unique()
print(f"no of floors in each building:\n{floors_by_building}")

* it turns out that building-2 is the largest of all and has 5 floors, while the others have 4 floors.

### let's look at distribution of entries by floors.

In [None]:
# Count of training rows per FLOOR value.
# The column is passed as the keyword `x`: recent seaborn versions treat the first
# positional argument as `data`, which would not produce one bar per category.
sns.countplot(x=train_df['FLOOR'])

* All the floors look equally distributed. (floor-4 seems to have low count because it corresponds to only one building)

(SpaceID): Internal ID number to identify the Space (office, corridor, classroom) where the capture was taken. Categorical integer values.

In [None]:
# For each building, count the distinct SPACEID values on every floor.
spaces_per_floor = {
    building_id: df[df['BUILDINGID'] == building_id].groupby('FLOOR')['SPACEID'].nunique()
    for building_id in (0, 1, 2)
}
# Keep the original per-building names available to later cells.
temp1, temp2, temp3 = spaces_per_floor[0], spaces_per_floor[1], spaces_per_floor[2]

for building_id, space_counts in spaces_per_floor.items():
    print(f"floorwise distribution of spaces in building-{building_id}:\n{space_counts}\n")

In [None]:
# Number of distinct spaces in which each user recorded measurements.
spaces_per_user = df.groupby('USERID')['SPACEID'].nunique()

plt.figure(figsize=(12,5))
g = sns.lineplot(data=spaces_per_user)
g.set_xticks(np.arange(19))
g.set_xlabel('USERID')
g.set_ylabel('number of distinct spaces')
# Render the figure explicitly, as the other plotting cells do, instead of
# relying on a bare print() to hide the return value of the last call.
plt.show()

In [None]:
# Distinct user ids recorded in each space.
user_ids_by_space = df.groupby('SPACEID')['USERID']
user_ids_by_space.unique()

In [None]:
# Number of distinct users seen in each space.
# The previous barplot(x='SPACEID', y='USERID') drew the *mean user id* per space,
# which is not a visit count.
unique_visitors = df.groupby('SPACEID')['USERID'].nunique()

fig, ax = plt.subplots(figsize=(20,5))
# Plain matplotlib bars keep SPACEID on a numeric axis, so the tick labels produced
# by the locator/formatter below are real space ids rather than bar positions.
ax.bar(unique_visitors.index, unique_visitors.to_numpy())
ax.xaxis.set_major_locator(ticker.MultipleLocator(10))
ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
ax.set_xlabel('SPACEID')
ax.set_ylabel('number of unique users')
plt.show()

* distribution of number of unique visits on every space-id.

## Prediction of longitude and latitude using Neural network regression method.

In [None]:
# Columns removed from the model inputs (targets and per-record metadata).
non_feature_columns = ["LONGITUDE", "LATITUDE", "FLOOR", "BUILDINGID", "SPACEID", "RELATIVEPOSITION", "USERID", "PHONEID", "TIMESTAMP"]
# Regression targets.
target_columns = ["LONGITUDE", "LATITUDE"]

train_x, train_y = train_df.drop(columns=non_feature_columns), train_df[target_columns]
test_x, test_y = validate_df.drop(columns=non_feature_columns), validate_df[target_columns]


In [None]:
# Feed-forward regressor: flatten -> 512-unit ReLU layer -> 2 linear outputs.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(512, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(2, activation = 'linear'))

# Metrics tracked next to the MSE loss during training and validation.
rmse_metric = tf.keras.metrics.RootMeanSquaredError()
cosine_metric = tf.keras.metrics.CosineSimilarity(axis=1)

model.compile(optimizer='adam', loss='mean_squared_error', metrics=[rmse_metric, cosine_metric])

In [None]:
# Validation pair handed to model.fit. Cast to float arrays so the validation
# inputs have the same container and dtype as the training inputs
# (which are passed as `.values.astype('float')`).
val_data = (test_x.values.astype('float'), test_y.values.astype('float'))

In [None]:
# Number of passes over the training data (reused later for the x-axis of the RMSE plot).
epochs = 5

# Float arrays for the inputs and the targets.
train_features = train_x.values.astype('float')
train_targets = train_y.values.astype('float')

# Train, evaluating on the validation pair at the end of every epoch.
result = model.fit(train_features, train_targets, epochs=epochs, validation_data=val_data)

* we can say that it's relatively easy to train a model on this data from the fact that cosine similarity reached 1.0 at the 2nd epoch itself.

In [None]:
# Validation RMSE after each epoch.
x_vals = range(1, epochs+1)
plt.plot(x_vals, result.history['val_root_mean_squared_error'], label="RMSE")
plt.xlabel('epoch')
plt.ylabel('validation RMSE')
# plt.legend("RMSE") iterated over the string and labelled the line "R";
# the label is now attached to the line and legend() picks it up.
plt.legend()
plt.show()

* after the 2nd epoch decrement in rmse is relatively low.

## Individual trajectory analysis

Before we can analyse the data for trajectory analysis, we need to sort the data according to timestamps.

In [None]:
# Order all rows by their TIMESTAMP value (ascending).
df_sorted = df.sort_values('TIMESTAMP')

In [None]:
# Distinct BUILDINGID values recorded for each user.
buildings_by_user = df_sorted.groupby('USERID')['BUILDINGID']
temp = buildings_by_user.unique()
print(f"list of buildings visited by each user:\n{temp}")

* here, we can notice that user-11 has visited all 3 buildings, so we can map their movement through all buildings.

In [None]:
# BUILDINGID sequence of user 11, in the row order of the timestamp-sorted frame.
is_user11 = df_sorted['USERID'] == 11
user11_building_movement = df_sorted.loc[is_user11, 'BUILDINGID']

fig, ax = plt.subplots(figsize=(15,5))
sns.lineplot(data=user11_building_movement.to_numpy(), ax=ax)
ax.set(
    title='USER 11 BUILDING MOVEMENT',
    xlabel='time',
    ylabel='building_id',
    yticks=[0,1,2],
)
plt.show()

doing the same for more users.

In [None]:
# BUILDINGID sequence of user 9, in the row order of the timestamp-sorted frame.
is_user9 = df_sorted['USERID'] == 9
user9_building_movement = df_sorted.loc[is_user9, 'BUILDINGID']

fig, ax = plt.subplots(figsize=(15,5))
sns.lineplot(data=user9_building_movement.to_numpy(), ax=ax)
ax.set(
    yticks=[1,2],
    title='USER 9 BUILDINGS MOVEMENT',
    xlabel='time',
    ylabel='building_id',
)
plt.show()

In [None]:
# BUILDINGID sequence of user 14, in the row order of the timestamp-sorted frame.
is_user14 = df_sorted['USERID'] == 14
user14_building_movement = df_sorted.loc[is_user14, 'BUILDINGID']

fig, ax = plt.subplots(figsize=(15,5))
sns.lineplot(data=user14_building_movement.to_numpy(), ax=ax)
ax.set(
    yticks=[1,2],
    title='USER 14 BUILDINGS MOVEMENT',
    xlabel='time',
    ylabel='building_id',
)
plt.show()

users 3, 5, 6, 12, 15 have only visited building 2, so we can analyse their floor movement without any complexity.

In [None]:
# User ids selected for the floor-movement analysis.
ulist = [3, 5, 6, 12, 15]

In [None]:
# confirmation: restrict the sorted frame to the selected users and list their buildings
in_ulist = df_sorted['USERID'].isin(ulist)
subdf = df_sorted[in_ulist]
subdf.groupby('USERID')['BUILDINGID'].unique()

In [None]:
# Fraction of all rows that fall in the selected sub-dataframe, shown as a percentage.
subdf_fraction = len(subdf)/len(df)
p = round(subdf_fraction*100, 2)
print(f"{p}% of entries consist of users who have visited only building-2")

In [None]:
# Distinct FLOOR values recorded for each of the selected users.
floors_by_user = subdf.groupby('USERID')['FLOOR']
floors_visited = floors_by_user.unique()
print(f"floors visited by each user:\n{floors_visited}")

* we can see that user 5 and 6 have visited 2 floors, so we can analyse their floor trajectory.

In [None]:
# Two stacked panels: FLOOR sequence of user 5 (top) and user 6 (bottom).
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10,10))

for user_id, axis in ((5, ax1), (6, ax2)):
    floor_sequence = subdf.loc[subdf['USERID'] == user_id, 'FLOOR'].to_numpy()
    sns.lineplot(data=floor_sequence, ax=axis)
    axis.set_yticks([2,3,4])
    axis.set_title(f'USER {user_id} FLOORS MOVEMENT')
    axis.set_xlabel('time')
    axis.set_ylabel('floor')

plt.show()

* We can also notice that movement of users 12, 15 is restricted to floor-1.
* Next we will compare the movement between spaces within building-2 floor-1 of these users.

In [None]:
# Data of users whose movement is restricted to building 2 and floor 1.
# Taken from the timestamp-sorted frame (not the unsorted `df`) because the
# plots built from `mdf` label their x-axis as time and therefore need the
# rows in chronological order.
mdf = df_sorted[df_sorted['USERID'].isin([12,15])]

In [None]:
# Fraction of all rows that fall in the mini-dataframe, shown as a percentage.
mdf_fraction = len(mdf)/len(df)
p = round(mdf_fraction*100, 2)
print(f"{p}% of entries consist of users who have visited only building-2 floor-1")

In [None]:
# Distinct SPACEID values among rows recorded in building 2 on floor 1.
in_building2 = df['BUILDINGID']==2
on_floor1 = df['FLOOR']==1
spaceid_list = df.loc[in_building2 & on_floor1, 'SPACEID'].unique()

print(f"there are total of {len(spaceid_list)} spaces in building-2 floor-1.\n")
print(f"list of spaceids:\n{spaceid_list}")

In [None]:
# Two stacked panels: SPACEID sequence of user 12 (top) and user 15 (bottom).
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20,10))

for user_id, axis in ((12, ax1), (15, ax2)):
    space_sequence = mdf.loc[mdf['USERID'] == user_id, 'SPACEID'].to_numpy()
    sns.lineplot(data=space_sequence, ax=axis)
    axis.set_title(f'USER {user_id} SPACES MOVEMENT')
    axis.set_xlabel('time')
    axis.set_ylabel('spaces')

plt.show()

### we can clearly see some pre-defined pattern in their movement

## Future work: Crowd behaviour analysis.

* feature engineering
    * apply binning on timestamps and extract dates and time-hours of specific entry
    * divide data datewise and hourwise.
* time series analysis
    * determine crowd density datewise and hourwise at different abstraction levels.(building, floor, spaces)
    * for example: changes in density of users on date T+1 from date T at different buildings.