## Task 3: Location Prediction - City D

Note: We only consider the first 30 days (75 days in total) of the CityD dataset to reduce the data processing load. We believe this will not affect the result of analysis because the first 30 days is long enough capture the significant trends and patterns that are relevant to the analysis.
The original dataset consists of 8,418,135 rows. After the selection, the number of rows are reduced to 3,389,319, approximately one-third of the original dataset.

In [None]:
import tensorflow as tf
import os

# Set TensorFlow to use multiple threads
os.environ['OMP_NUM_THREADS'] = '20'  # Set this to the number of CPU cores you want to use
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Disable TensorFlow debug info

# To configure intra-op and inter-op parallelism
tf.config.threading.set_intra_op_parallelism_threads(4)
tf.config.threading.set_inter_op_parallelism_threads(4)

## Preprocess Data

In [54]:
import pandas as pd
import numpy as np
import os

# Load data
# Path to the dataset subfolder
data_path = './dataset'

# List all files in the subfolder
# file_paths = [os.path.join(data_path, file) for file in os.listdir(data_path) if file.endswith('.csv.gz')]
file_path = './dataset\\cityD_challengedata.csv.gz'
city_name = os.path.basename(file_path).split('_')[0]
# data = pd.read_csv(file_path, compression='gzip', nrows = 500000)
data = pd.read_csv(file_path, compression='gzip')

### Data Statistics

In [55]:
print(f"Total number of days: {len(data['d'].unique())}")
print(f"Total number of participants: {len(data['uid'].unique())}")
print(f"Data size: {len(data)}")

Total number of days: 75
Total number of participants: 6000
Data size: 8418135


In [56]:
def create_sequences(data, seq_length=3):
    sequences = []
    labels = []
    for uid, group in data.groupby('uid'):
        coords = group[['x', 'y']].values
        for i in range(len(coords) - seq_length):
            sequences.append(coords[i:i + seq_length])  # Input sequence
            labels.append(coords[i + seq_length])  # Target value
    return np.array(sequences), np.array(labels)

In [57]:
# Training data
processed_data = data[data['d'] < 30]

# Generate sequences and labels
X, y = create_sequences(processed_data)

# Display the shape of the prepared data
print(f"Shape of input sequences (X): {X.shape}")
print(f"Shape of labels (y): {y.shape}")
print(f"Size of training data: {len(processed_data)}")

Shape of input sequences (X): (3371370, 3, 2)
Shape of labels (y): (3371370, 2)
Size of training data: 3389319


## Model Implementation

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Define LSTM model architecture
model = Sequential([
    LSTM(50, activation='relu', input_shape=(X.shape[1], X.shape[2])),
    Dense(2)  # Output layer with 2 units for predicting (x, y)
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X, y, epochs=3, batch_size=64, validation_split=0.2)

# Save the model for future use
model.save('trajectory_predictor.h5')

print("Model training complete!")

Epoch 1/10


  super().__init__(**kwargs)


[1m52678/52678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 3ms/step - loss: 95.0339
Epoch 2/10
[1m52678/52678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 4ms/step - loss: 54.2944
Epoch 3/10
[1m52678/52678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 3ms/step - loss: 53.6019
Epoch 4/10
[1m46664/52678[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m30s[0m 5ms/step - loss: 53.2388

KeyboardInterrupt: 

## Model Evaluation

### Generate test data

In [None]:
# Generate sequences and labels
# X_test, y_test = create_sequences(data[(data['d'] > 30) & (data['d'] < 40)])
X_test, y_test = create_sequences(data[(data['d'] > 30)])

# Display the shape of the prepared data
print(f"Shape of input sequences (X_test): {X_test.shape}")
print(f"Shape of labels (y_test): {y_test.shape}")

Shape of input sequences (X_test): (294787, 3, 2)
Shape of labels (y_test): (294787, 2)


### Compute Accuracy

In [50]:
def compute_acc_at_k(model, X_test, y_test, k_values=[1, 2, 3, 4, 5]):
    """
    Compute Accuracy at k (Acc@k).

    Args:
        model: Trained Keras model.
        X_test: Test input data, shape (num_samples, 3, 2).
        y_test: Ground truth labels, shape (num_samples, 2).
        k_values: List of k values to compute Acc@k.

    Returns:
        acc_at_k: Dictionary with Acc@k for each k.
    """
    predictions = model.predict(X_test)  # Shape: (num_samples, 2)
    acc_at_k = {k: 0 for k in k_values}
    num_samples = X_test.shape[0]

    for i in range(num_samples):
        true_target = y_test[i]  # True target (x, y)
        pred = predictions[i]  # Predicted location (x, y)

        # Compute Euclidean distances between prediction and true target
        distance = np.linalg.norm(pred - true_target)

        # Check if the distance qualifies as a "hit" within k thresholds
        for k in k_values:
            if distance <= k:  # Define threshold based on domain context
                acc_at_k[k] += 1

    # Normalize Acc@k
    acc_at_k = {k: acc / num_samples for k, acc in acc_at_k.items()}
    return acc_at_k

# Example usage:
acc_at_k = compute_acc_at_k(model2, X_test, y_test)
# acc_at_k = compute_acc_at_k(model, X, y)
print(f"Accuracy @ k: {acc_at_k}")

[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1ms/step
Accuracy @ k: {1: 0.08487823411480154, 2: 0.31254770393538384, 3: 0.5241886514669915, 4: 0.6382710228062974, 5: 0.7038132617788437}


In [51]:
def compute_mrr(model, X_test, y_test):
    """
    Compute Mean Reciprocal Rank (MRR).

    Args:
        model: Trained Keras model.
        X_test: Test input data, shape (num_samples, 3, 2).
        y_test: Ground truth labels, shape (num_samples, 2).

    Returns:
        mrr: Mean Reciprocal Rank (MRR).
    """
    predictions = model.predict(X_test)  # Shape: (num_samples, 2)
    reciprocal_ranks = []
    num_samples = X_test.shape[0]

    for i in range(num_samples):
        true_target = y_test[i]  # True target (x, y)
        pred = predictions[i]  # Predicted location (x, y)

        # Compute Euclidean distance between prediction and true target
        distance = np.linalg.norm(pred - true_target)

        # Rank is determined based on distance (assume sorted rank)
        rank = 1 if distance < 1e-5 else 2  # Adjust ranking logic as needed
        reciprocal_ranks.append(1 / rank)

    # Compute MRR
    mrr = np.mean(reciprocal_ranks)
    return mrr

# Example usage:
mrr = compute_mrr(model, X_test, y_test)
# mrr = compute_mrr(model, X, y)
print(f"Mean Reciprocal Rank (MRR): {mrr}")


[1m9213/9213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1ms/step
Mean Reciprocal Rank (MRR): 0.5
