## Task 3: Location Prediction - City C

Note: We only consider the first 30 days (75 days in total) of the CityC dataset to reduce the data processing load. We believe this will not affect the result of analysis because the first 30 days is long enough capture the significant trends and patterns that are relevant to the analysis.
The original dataset consists of 18,456,528 rows, including 449308 rows with missing data. After the selection, the number of rows are reduced to 7,428,781, approximately one-third of the original dataset.

## Preprocess Data

In [17]:
import pandas as pd
import numpy as np
import os

# Load data
# Path to the dataset subfolder
data_path = './dataset'

# List all files in the subfolder
# file_paths = [os.path.join(data_path, file) for file in os.listdir(data_path) if file.endswith('.csv.gz')]
file_path = './dataset\\cityC_challengedata.csv.gz'
city_name = os.path.basename(file_path).split('_')[0]
# data = pd.read_csv(file_path, compression='gzip', nrows = 500000)
data = pd.read_csv(file_path, compression='gzip')
data = data[data['x'] != 999]

### Data Statistics

In [18]:
print(f"Total number of days: {len(data['d'].unique())}")
print(f"Total number of participants: {len(data['uid'].unique())}")
print(f"Data size: {len(data)}")

Total number of days: 75
Total number of participants: 20000
Data size: 18007220


In [3]:
def create_sequences(data, seq_length=3):
    sequences = []
    labels = []
    for uid, group in data.groupby('uid'):
        coords = group[['x', 'y']].values
        for i in range(len(coords) - seq_length):
            sequences.append(coords[i:i + seq_length])  # Input sequence
            labels.append(coords[i + seq_length])  # Target value
    return np.array(sequences), np.array(labels)

In [19]:
# Training data
processed_data = data[data['d'] < 30]

# Generate sequences and labels
X, y = create_sequences(processed_data)

# Display the shape of the prepared data
print(f"Shape of input sequences (X): {X.shape}")
print(f"Shape of labels (y): {y.shape}")
print(f"Size of training data: {len(processed_data)}")

Shape of input sequences (X): (7368961, 3, 2)
Shape of labels (y): (7368961, 2)
Size of training data: 7428781


## Model Implementation

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Define LSTM model architecture
model = Sequential([
    LSTM(50, activation='relu', input_shape=(X.shape[1], X.shape[2])),
    Dense(2)  # Output layer with 2 units for predicting (x, y)
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X, y, epochs=3, batch_size=64, validation_split=0.2)

# Save the model for future use
model.save('trajectory_predictor.h5')

print("Model training complete!")

Epoch 1/3
[1m92112/92112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m405s[0m 4ms/step - loss: 66.9209 - val_loss: 48.0804
Epoch 2/3
[1m92112/92112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 4ms/step - loss: 47.5220 - val_loss: 47.5418
Epoch 3/3
[1m92112/92112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m474s[0m 5ms/step - loss: 47.3717 - val_loss: 47.8646




Model training complete!


## Model Evaluation

Create test data

In [24]:
# Generate sequences and labels
# X_test, y_test = create_sequences(data[(data['d'] > 30) & (data['d'] < 40)])
X_test, y_test = create_sequences(data[(data['d'] > 30)])

# Display the shape of the prepared data
print(f"Shape of input sequences (X_test): {X_test.shape}")
print(f"Shape of labels (y_test): {y_test.shape}")

Shape of input sequences (X_test): (10250576, 3, 2)
Shape of labels (y_test): (10250576, 2)


In [25]:
def compute_acc_at_k(model, X_test, y_test, k_values=[1, 2, 3, 4, 5]):
    """
    Compute Accuracy at k (Acc@k).

    Args:
        model: Trained Keras model.
        X_test: Test input data, shape (num_samples, 3, 2).
        y_test: Ground truth labels, shape (num_samples, 2).
        k_values: List of k values to compute Acc@k.

    Returns:
        acc_at_k: Dictionary with Acc@k for each k.
    """
    predictions = model.predict(X_test)  # Shape: (num_samples, 2)
    acc_at_k = {k: 0 for k in k_values}
    num_samples = X_test.shape[0]

    for i in range(num_samples):
        true_target = y_test[i]  # True target (x, y)
        pred = predictions[i]  # Predicted location (x, y)
        if true_target[0] == '-999': continue

        # Compute Euclidean distances between prediction and true target
        distance = np.linalg.norm(pred - true_target)

        # Check if the distance qualifies as a "hit" within k thresholds
        for k in k_values:
            if distance <= k:  # Define threshold based on domain context
                acc_at_k[k] += 1

    # Normalize Acc@k
    acc_at_k = {k: round(acc / num_samples, 4) for k, acc in acc_at_k.items()}
    return acc_at_k

# Example usage:
acc_at_k = compute_acc_at_k(model, X_test, y_test)
# acc_at_k = compute_acc_at_k(model, X, y)
print(f"Accuracy @ k: {acc_at_k}")

[1m320331/320331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m801s[0m 2ms/step
Accuracy @ k: {1: 0.1342, 2: 0.3814, 3: 0.5409, 4: 0.6322, 5: 0.6933}
