# Importing Essential Libaries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.cluster import KMeans
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Data Extraction & Pre-Processing

In [3]:
df_POI_datacategories = pd.read_csv("POI_datacategories.csv", header=None)
df_POI_A = pd.read_csv("POIdata_cityA.csv")
df_POI_B = pd.read_csv("POIdata_cityB.csv")
df_POI_C = pd.read_csv("POIdata_cityC.csv")
df_POI_D = pd.read_csv("POIdata_cityD.csv")
df_cityA = pd.read_csv("cityABCD/cityA/task1_dataset_kotae.csv")
df_cityB = pd.read_csv("cityABCD/cityB/hiroshima_challengedata.csv")
df_cityC = pd.read_csv("cityABCD/cityC/sapporo_challengedata.csv")
df_cityD = pd.read_csv("cityABCD/cityD/kumamoto_challengedata.csv")

In [4]:
# Match the 'category' column in df_POI_X to rows in df_POI_datacategories and create the 'establishment' column
df_POI_A['establishment'] = df_POI_A['category'].apply(lambda x: df_POI_datacategories.iloc[x - 1, 0] if x <= len(df_POI_datacategories) else None)
df_POI_B['establishment'] = df_POI_B['category'].apply(lambda x: df_POI_datacategories.iloc[x - 1, 0] if x <= len(df_POI_datacategories) else None)
df_POI_C['establishment'] = df_POI_C['category'].apply(lambda x: df_POI_datacategories.iloc[x - 1, 0] if x <= len(df_POI_datacategories) else None)
df_POI_D['establishment'] = df_POI_D['category'].apply(lambda x: df_POI_datacategories.iloc[x - 1, 0] if x <= len(df_POI_datacategories) else None)

In [5]:
df_cityB = df_cityB[(df_cityB['uid'] >= 0) & (df_cityB['uid'] <= 150)]

In [6]:
# Preprocess and merge data
df_merged = pd.merge(df_cityB, df_POI_B, on=['x', 'y'], how='left')

# Encode the 'category' column to numeric if needed
df_merged['category'] = df_merged['category'].fillna(-1).astype(int)

# Generate sequences based on each 'uid'
sequence_length = 5  # Look back for the past 5 steps
X_sequences, y_sequences = [], []



In [None]:
for uid in df_merged['uid'].unique():
    user_data = df_merged[df_merged['uid'] == uid].sort_values(by=['d', 't'])
    for i in range(sequence_length, len(user_data)):
        X_sequences.append(user_data[['x', 'y', 'd', 't', 'category']].iloc[i-sequence_length:i].values)
        y_sequences.append(user_data[['x', 'y']].iloc[i].values)

# Pad sequences to ensure they are of the same length
X_sequences = pad_sequences(X_sequences, maxlen=sequence_length, dtype='float32')
y_sequences = np.array(y_sequences)

In [None]:
# Convert list to numpy array after padding
X_sequences = np.array(X_sequences)
y_sequences = np.array(y_sequences)

In [None]:
# Save the array
np.save('X_sequences.npy', X_sequences)
np.save('y_sequences.npy', y_sequences)

# LSTM Model Training for Predicting the Next Location
The **LSTM network** is a type of **Recurrent Neural Network (RNN)** that is effective for modeling sequences over a period of time. Unlike traditional RNNs, **LSTM** can capture long-term dependencies in sequential data due to its unique architecture, which includes mechanisms to manage and retain information across timesteps. 

Given this capability, we have chosen **LSTMs** as our baseline model for this task. LSTM's architecture is well-suited for scenarios where past observations need to influence future predictions. In our case, we aim to predict a particular user's next location in Hiroshima based on historical movement patterns. By using LSTM, we can better model the sequence of a user's previous locations and improve the accuracy of our predictions.

In [None]:
X_sequences = np.load('X_sequences.npy')
y_sequences = np.load('y_sequences.npy')
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_sequences, y_sequences, test_size=0.2, random_state=42)

In [None]:
# Model building
model = Sequential()
model.add(LSTM`(64, input_shape=(sequence_length, X_sequences.shape[2]), return_sequences=True))
model.add(LSTM(32))
model.add(Dense(2))  # Output x and y coordinates

model.compile(optimizer='adam', loss='mse', metrics=['mae'])


In [8]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
# Evaluate the model
loss, mae = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test MAE: {mae}")

Test Loss: 9.802356719970703, Test MAE: 0.8231422305107117


In [10]:
# Predictions
y_pred = model.predict(X_test)

# Display a few predictions vs actuals
for i in range(5):
    print(f"Predicted: {y_pred[i]}, Actual: {y_test[i]}")

Predicted: [79.10563 93.29632], Actual: [79 94]
Predicted: [ 90.34775 107.41593], Actual: [ 90 108]
Predicted: [80.19136 95.48814], Actual: [80 96]
Predicted: [78.01843 94.58343], Actual: [78 95]
Predicted: [159.7515  172.78491], Actual: [162 171]


# Model Refinement & Parameter Tuning
In this section, we refine our LSTM model to improve its predictive accuracy by experimenting with various hyperparameters, architectural choices, and feature engineering techniques.

1. **Early Stopping**: We implement early stopping to monitor validation loss and prevent overfitting. Training halts automatically if the model's validation loss shows no improvement over consecutive epochs, promoting generalizability.

2. **Sequence Length**: We adjust sequence length to balance model performance and computational efficiency. 
   - **Chosen Length (5)**: Selected as an optimal balance for capturing movement patterns without overfitting.
   - **Shorter Sequences (e.g., 3)**: Faster training with lower overfitting risk but might miss long-term dependencies.
   - **Longer Sequences (e.g., 7)**: Better at recognizing complex patterns, though at a higher memory cost and overfitting risk.

3. **Layer Units**: Our model uses two LSTM layers with 128 and 64 units to maintain a balance between model capacity and generalization. 
   - **Lower Units (e.g., 32)**: Provide faster training but may lack the capacity to capture detailed movement patterns.
   - **Higher Units (e.g., 128)**: Enhance pattern recognition abilities but increase memory usage and risk of overfitting.

These refinements aim to create a robust model suited to capturing complex movement dynamics with efficiency and generalizability.

In [11]:
#Experiment with different sequence lengths, e.g., 3, 5, 7
sequence_length = 5

# Modified Model with adjusted sequence length, adding more units for tuning
model = Sequential()
model.add(LSTM(128, input_shape=(sequence_length, X_sequences.shape[2]), return_sequences=True))
model.add(LSTM(64))
model.add(Dense(2))

In [12]:
# Compile model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Early Stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Train the model
history = model.fit(
    X_train, y_train, epochs=20, batch_size=32, validation_split=0.2,
    callbacks=[early_stopping]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20


In [13]:
# Evaluation on test set
loss, mae = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)



In [14]:
# Calculating additional metrics
rmse = mean_squared_error(y_test, y_pred, squared=False)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"Test MAE: {mae}")
print(f"Test RMSE: {rmse}")
print(f"Test MAPE: {mape}")

Test MAE: 0.8564621806144714
Test RMSE: 2.3784425097976074
Test MAPE: 0.011397217188434946




# Visualisations

1. **Mean Square Error (MSE) and Mean Absolute Error (MAE) Metrics Against Epochs**:  
   We track **MSE** and **MAE** across epochs to assess model performance over time. 
   - **MSE** provides insight into the squared differences between predicted and actual values, emphasizing larger errors.
   - **MAE** gives a more intuitive measure of average error magnitude, unaffected by outliers. 

   Plotting MSE and MAE against epochs allows us to monitor training progress, adjust learning rates, and identify potential overfitting or underfitting.

2. **Actual vs. Predicted Locations (Scatter Grid Plot)**:  
   This visualization shows a comparison between **actual and predicted locations** on a grid scatter plot, helping us analyze the spatial accuracy of our model. Each point represents a predicted location alongside its corresponding actual position, making it easier to evaluate spatial prediction accuracy and identify patterns or discrepancies across different locations.

In [15]:
# Plot MAE and MSE across epochs using Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=("Training vs Validation MSE", "Training vs Validation MAE"))

# MSE Plot
fig.add_trace(
    go.Scatter(y=history.history['loss'], mode='lines', name='Train MSE'),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(y=history.history['val_loss'], mode='lines', name='Validation MSE'),
    row=1, col=1
)
fig.update_xaxes(title_text="Epochs", row=1, col=1)
fig.update_yaxes(title_text="MSE", row=1, col=1)

# MAE Plot
fig.add_trace(
    go.Scatter(y=history.history['mae'], mode='lines', name='Train MAE'),
    row=1, col=2
)
fig.add_trace(
    go.Scatter(y=history.history['val_mae'], mode='lines', name='Validation MAE'),
    row=1, col=2
)
fig.update_xaxes(title_text="Epochs", row=1, col=2)
fig.update_yaxes(title_text="MAE", row=1, col=2)

fig.update_layout(title="Model Performance Metrics Across Epochs", showlegend=True)
fig.show()

# Actual vs Predicted Locations Scatter Plot
fig_loc = go.Figure()

# Actual locations
fig_loc.add_trace(go.Scatter(
    x=y_test[:, 0], y=y_test[:, 1], mode='markers',
    marker=dict(color='blue', opacity=0.6), name="Actual Locations"
))

# Predicted locations
fig_loc.add_trace(go.Scatter(
    x=y_pred[:, 0], y=y_pred[:, 1], mode='markers',
    marker=dict(color='red', opacity=0.6), name="Predicted Locations"
))

fig_loc.update_layout(
    title="Actual vs Predicted Locations",
    xaxis_title="X coordinate",
    yaxis_title="Y coordinate",
    showlegend=True
)
fig_loc.show()



# Further Analysis Geographic Clustering of POIs (K-Means)

We use **K-Means Geographic Clustering** to visualize the density of Points of Interest (POIs) in Hiroshima, providing an intuitive view of establishment density and clustering patterns. This clustering approach is valuable for identifying spatial trends across the area.

- **High-Density POIs (Large Circles)**: Large circles in Figure 13 indicate high-density clusters of POIs, highlighting areas with a concentration of establishments or popular destinations. These regions often represent **commercial or high-traffic zones**, such as:
  - **Red and Blue Clusters**: Key areas where users frequently travel, often including shopping districts, business centers, or entertainment areas.

The presence of these distinct clusters suggests likely movement patterns, as users tend to move between or within high-density zones. Such insights are crucial for understanding common routes and spatial trends in user behavior.

In [16]:
# POI Geographic Clustering Plot (with cluster coloring)
# Geographic clustering on POIs
# Using KMeans clustering to group POIs and visualize the clusters
poi_coordinates = df_POI_B[['x', 'y']].values
kmeans = KMeans(n_clusters=5, random_state=42).fit(poi_coordinates)
df_POI_B['cluster'] = kmeans.labels_

poi_clusters = df_POI_B['cluster'].unique()
cluster_colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A']

fig_cluster = go.Figure()

for i, cluster in enumerate(poi_clusters):
    cluster_data = df_POI_B[df_POI_B['cluster'] == cluster]
    fig_cluster.add_trace(go.Scatter(
        x=cluster_data['x'], y=cluster_data['y'],
        mode='markers', marker=dict(size=cluster_data['POI_count']*2, color=cluster_colors[i]),
        name=f"Cluster {cluster}"
    ))

fig_cluster.update_layout(
    title="Geographic Clustering of Points of Interest (POIs)",
    xaxis_title="X coordinate",
    yaxis_title="Y coordinate",
    showlegend=True
)
fig_cluster.show()

# Prediction of User's Subsequent Location

In [17]:
from scipy.spatial import cKDTree

# Create a KDTree for fast nearest-neighbor lookup on POI coordinates
poi_tree = cKDTree(df_POI_B[['x', 'y']].values)

# Function to find the closest establishment name given coordinates
def find_nearest_establishment(x, y):
    distance, idx = poi_tree.query([x, y], k=1)  # Find the nearest POI
    nearest_poi = df_POI_B.iloc[idx]
    return nearest_poi['establishment'], nearest_poi['x'], nearest_poi['y']

# Iterate through a few test samples to find the current and next predicted establishment
for i in range(5):
    # Get actual current coordinates and predicted next coordinates
    current_x, current_y = y_test[i]
    predicted_x, predicted_y = y_pred[i]
    
    # Find current and predicted nearest establishments
    current_establishment, current_est_x, current_est_y = find_nearest_establishment(current_x, current_y)
    predicted_establishment, pred_est_x, pred_est_y = find_nearest_establishment(predicted_x, predicted_y)
    
    # Print results
    print(f"Sample {i+1}")
    print(f"Current Location: ({current_x}, {current_y}) - Closest Establishment: {current_establishment} at ({current_est_x}, {current_est_y})")
    print(f"Predicted Next Location: ({predicted_x}, {predicted_y}) - Closest Establishment: {predicted_establishment} at ({pred_est_x}, {pred_est_y})")
    print("-----")


Sample 1
Current Location: (79, 94) - Closest Establishment: Recruitment Office at (79, 94)
Predicted Next Location: (78.97648620605469, 93.0196762084961) - Closest Establishment: Grocery Store at (79, 93)
-----
Sample 2
Current Location: (90, 108) - Closest Establishment: Transit Station at (90, 108)
Predicted Next Location: (90.41497802734375, 107.27430725097656) - Closest Establishment: Building Material at (90, 107)
-----
Sample 3
Current Location: (80, 96) - Closest Establishment: Port at (80, 96)
Predicted Next Location: (80.68460845947266, 95.49654388427734) - Closest Establishment: NPO at (81, 95)
-----
Sample 4
Current Location: (78, 95) - Closest Establishment: Building Material at (78, 95)
Predicted Next Location: (78.35033416748047, 94.39978790283203) - Closest Establishment: School at (78, 94)
-----
Sample 5
Current Location: (162, 171) - Closest Establishment: Diner at (162, 171)
Predicted Next Location: (163.42266845703125, 171.99330139160156) - Closest Establishment: Ha

In [22]:
from scipy.spatial import cKDTree

# Create KDTree for fast nearest-neighbor lookup on POI coordinates
poi_tree = cKDTree(df_POI_B[['x', 'y']].values)

# Function to find the nearest establishment given coordinates
def find_nearest_establishment(x, y):
    distance, idx = poi_tree.query([x, y], k=1)  # Find the nearest POI
    nearest_poi = df_POI_B.iloc[idx]
    return nearest_poi['establishment'], nearest_poi['x'], nearest_poi['y']

# Define function to predict the next location for a specific user, date, and time
def predict_next_location(uid, d, t, df, sequence_length=5):
    # Filter data for this specific user up to the given date and time
    user_data = df[(df['uid'] == uid) & ((df['d'] < d) | ((df['d'] == d) & (df['t'] <= t)))]
    user_data = user_data.sort_values(by=['d', 't'])
    
    # Ensure we have enough data points for the sequence
    if len(user_data) < sequence_length:
        print(f"Not enough data for user {uid} to form a sequence of length {sequence_length}.")
        return None
    
    # Get the last sequence_length rows to form the input sequence for prediction
    input_sequence = user_data[['x', 'y', 'd', 't', 'category']].iloc[-sequence_length:].values
    input_sequence = np.expand_dims(input_sequence, axis=0)  # Reshape for model input
    
    # Retrieve the current location (last row in the input sequence)
    current_x, current_y = user_data[['x', 'y']].iloc[-1]
    current_time = user_data[['d', 't']].iloc[-1].values  # Date and time of the current location

    # Find the nearest establishment for the current location
    current_establishment, cur_est_x, cur_est_y = find_nearest_establishment(current_x, current_y)
    
    # Predict the next location
    predicted_coordinates = model.predict(input_sequence)[0]  # Extract the (x, y) prediction
    
    # Map the predicted location to the nearest establishment
    nearest_establishment, est_x, est_y = find_nearest_establishment(predicted_coordinates[0], predicted_coordinates[1])
    
    # Output results
    print(f"User {uid}, Current Location (Date {current_time[0]}, Time {current_time[1]}):")
    print(f"Current Coordinates (x, y): ({current_x}, {current_y})")
    print(f"Nearest Establishment: {current_establishment} at ({cur_est_x}, {cur_est_y})")
    print(f"Predicted Next Location (x, y): ({predicted_coordinates[0]:.2f}, {predicted_coordinates[1]:.2f})")
    print(f"Closest Establishment: {nearest_establishment} at ({est_x}, {est_y})")
    print("-----")

    
# Predict the next location for user id 
predict_next_location(uid=0, d=2, t=38, df=df_merged)
predict_next_location(uid=0, d=13, t=45, df=df_merged)
predict_next_location(uid=0, d=13, t=46, df=df_merged)
predict_next_location(uid=0, d=13, t=49, df=df_merged)
predict_next_location(uid=0, d=13, t=51, df=df_merged)
predict_next_location(uid=82, d=70, t=48, df=df_merged)
predict_next_location(uid=108, d=69, t=40, df=df_merged)


User 0, Current Location (Date 2, Time 37):
Current Coordinates (x, y): (80, 101)
Nearest Establishment: Bank at (80, 101)
Predicted Next Location (x, y): (80.65, 100.63)
Closest Establishment: Vet at (81, 101)
-----
User 0, Current Location (Date 13, Time 44):
Current Coordinates (x, y): (80, 101)
Nearest Establishment: Bank at (80, 101)
Predicted Next Location (x, y): (80.64, 100.60)
Closest Establishment: Vet at (81, 101)
-----
User 0, Current Location (Date 13, Time 44):
Current Coordinates (x, y): (80, 101)
Nearest Establishment: Bank at (80, 101)
Predicted Next Location (x, y): (80.64, 100.60)
Closest Establishment: Vet at (81, 101)
-----
User 0, Current Location (Date 13, Time 44):
Current Coordinates (x, y): (80, 101)
Nearest Establishment: Bank at (80, 101)
Predicted Next Location (x, y): (80.64, 100.60)
Closest Establishment: Vet at (81, 101)
-----
User 0, Current Location (Date 13, Time 44):
Current Coordinates (x, y): (80, 101)
Nearest Establishment: Bank at (80, 101)
Predi