In [3]:
import pandas as pd
from read_data import read_data

def compute_delta_metrics(data):
    """
    Return a copy of ``data`` extended with leader-minus-follower metrics.

    Added columns:
    - delta_position: ``x_leader - x_follower``.
    - delta_velocity: ``v_leader - v_follower``.
    - delta_acceleration: ``a_leader - a_follower``.
    - TTC (Time-To-Collision): ``delta_position / delta_velocity``.

    The input frame is left untouched, so re-running a cell that calls this
    function never leaves derived columns behind on the raw data.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``x_leader``, ``x_follower``, ``v_leader``,
        ``v_follower``, ``a_leader`` and ``a_follower``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the four
        derived ones, in the order listed above.

    Notes
    -----
    TTC carries the sign of ``delta_velocity`` and rows where
    ``delta_velocity`` is zero are not special-cased here; the division
    result is passed through as pandas produces it.
    """
    delta_position = data["x_leader"] - data["x_follower"]
    delta_velocity = data["v_leader"] - data["v_follower"]
    delta_acceleration = data["a_leader"] - data["a_follower"]
    # ``assign`` builds a new DataFrame instead of writing into the caller's.
    return data.assign(
        delta_position=delta_position,
        delta_velocity=delta_velocity,
        delta_acceleration=delta_acceleration,
        TTC=delta_position / delta_velocity,
    )

def aggregate_data_by_case(data):
    """
    Aggregate the dataset by 'case_id', keeping the max and min of each
    delta metric and of TTC for every case.

    Named aggregation is used so that every output column is tied to its
    source column and statistic by name rather than by position.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'case_id', 'delta_position', 'delta_velocity',
        'delta_acceleration' and 'TTC'.

    Returns
    -------
    pandas.DataFrame
        One row per case, indexed by 'case_id', with the columns
        'case_id', 'max_delta_position', 'min_delta_position',
        'max_delta_velocity', 'min_delta_velocity',
        'max_delta_acceleration', 'min_delta_acceleration',
        'max_TTC', 'min_TTC' (in that order).
    """
    aggr_data = data.groupby('case_id').agg(
        max_delta_position=('delta_position', 'max'),
        min_delta_position=('delta_position', 'min'),
        max_delta_velocity=('delta_velocity', 'max'),
        min_delta_velocity=('delta_velocity', 'min'),
        max_delta_acceleration=('delta_acceleration', 'max'),
        min_delta_acceleration=('delta_acceleration', 'min'),
        max_TTC=('TTC', 'max'),
        min_TTC=('TTC', 'min'),
    )
    # Expose the group key as the first regular column; the index keeps it too.
    aggr_data.insert(0, "case_id", aggr_data.index)
    return aggr_data

def adjust_ttc_sign(aggregated_data):
    """
    Return a copy of ``aggregated_data`` whose 'min_TTC' column is replaced
    by its absolute value.

    Only 'min_TTC' is changed; every other column, including 'max_TTC', is
    passed through as-is. The caller's frame is not modified.

    Parameters
    ----------
    aggregated_data : pandas.DataFrame
        Must contain a 'min_TTC' column.

    Returns
    -------
    pandas.DataFrame
        New frame with a non-negative 'min_TTC'.

    Notes
    -----
    NOTE(review): the absolute value is applied to a value passed in as
    'min_TTC', i.e. after any upstream min has been taken. If the upstream
    TTC is signed, this is not the same as the minimum of |TTC| -- confirm
    which of the two is intended.
    """
    return aggregated_data.assign(min_TTC=aggregated_data["min_TTC"].abs())

def convert_df(dataset: str, mode: str):
    """
    Run the full preprocessing pipeline for one dataset/mode pair.

    Steps, in order: load the rows with ``read_data(dataset, mode)``, add the
    delta metrics and TTC, aggregate them to per-'case_id' max/min values,
    then make 'min_TTC' non-negative.

    Parameters
    ----------
    dataset : str
        Forwarded unchanged to ``read_data``.
    mode : str
        Forwarded unchanged to ``read_data``.

    Returns
    -------
    The per-case aggregated frame produced by the last pipeline step.
    """
    raw_rows = read_data(dataset, mode)
    rows_with_deltas = compute_delta_metrics(raw_rows)
    per_case = aggregate_data_by_case(rows_with_deltas)
    return adjust_ttc_sign(per_case)


In [4]:
# Build the per-case aggregate table for dataset "HA" in "train" mode,
# then display summary statistics for each of its columns.
trainHA = convert_df("HA", "train")
trainHA.describe()

Unnamed: 0,case_id,max_delta_position,min_delta_position,max_delta_velocity,min_delta_velocity,max_delta_acceleration,min_delta_acceleration,max_TTC,min_TTC
count,26394.0,26394.0,26394.0,26394.0,26394.0,26394.0,26394.0,26394.0,26394.0
mean,13196.5,25.333803,12.660995,1.907915,-1.961766,1.214429,-1.377007,194664.2,202333.3
std,7619.435839,11.389465,7.00505,1.246547,1.262682,0.625144,1.297498,11345970.0,17630100.0
min,0.0,7.970585,3.192059,-2.645687,-15.422088,-0.164731,-10.560531,-53.70677,2.865257
25%,6598.25,18.117176,7.330479,0.874794,-2.585847,0.716436,-1.457096,1228.487,1226.827
50%,13196.5,22.091704,11.508556,1.80344,-1.798556,1.164713,-1.044787,3797.345,3719.639
75%,19794.75,28.64732,15.768843,2.779035,-1.05455,1.601606,-0.748433,15289.28,15204.6
max,26393.0,85.211403,78.518199,13.74209,1.298914,6.417317,-0.032989,1579993000.0,2840646000.0


In [5]:
# Keep only the first 5000 rows of the aggregated training table; `df` is the
# frame the modelling cell below passes to preprocess_data.
# NOTE(review): `df` still carries the 'case_id' column, so it is scaled and
# modelled like any other feature -- confirm that is intended.
df = trainHA.head(5000)

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
import math

class LSTMModel(nn.Module):
    """Single LSTM layer followed by a linear read-out of the last time step."""

    def __init__(self, input_size=1, hidden_layer_size=100, output_size=1):
        """
        Build the network.

        input_size: number of features per time step.
        hidden_layer_size: width of the LSTM hidden state.
        output_size: number of values produced per input sequence.
        """
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, time, features).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_layer_size,
            batch_first=True,
        )
        self.linear = nn.Linear(
            in_features=hidden_layer_size,
            out_features=output_size,
        )

    def forward(self, input_seq):
        """Map a batch of sequences to one prediction vector per sequence."""
        hidden_states = self.lstm(input_seq)[0]
        # Only the hidden state of the final time step feeds the read-out.
        final_step = hidden_states[:, -1, :]
        return self.linear(final_step)

def create_sequences(data, n_steps_in, n_steps_out):
    """
    Slice a 2-D array into sliding input/target windows.

    For every start row, the input window is the next ``n_steps_in`` rows and
    the target window is the ``n_steps_out`` rows that follow it. Start rows
    whose target window would run past the end of ``data`` are dropped.

    Returns a pair of numpy arrays ``(X, y)`` built from the lists of input
    and target windows (both are empty arrays when no window fits).
    """
    n_rows = len(data)
    # (start, end of input / start of target, end of target) per start row.
    bounds = [
        (start, start + n_steps_in, start + n_steps_in + n_steps_out)
        for start in range(n_rows)
    ]
    usable = [window for window in bounds if window[2] <= n_rows]
    inputs = [data[start:split, :] for start, split, _ in usable]
    targets = [data[split:stop, :] for _, split, stop in usable]
    return np.array(inputs), np.array(targets)

def preprocess_data(df, n_steps_in, n_steps_out, test_size=0.2):
    """
    Scale ``df``, window it, and split the windows into train/test tensors.

    The values of ``df`` are min-max scaled, cut into windows by
    ``create_sequences`` and split with ``train_test_split`` using
    ``random_state=42``.

    Returns ``(X_train, y_train, X_test, y_test, scaler)`` where the first
    four items are float32 tensors and ``scaler`` is the fitted MinMaxScaler.

    NOTE(review): the scaler is fitted on all rows before the split, so test
    rows influence the scaling applied to the training rows.
    """
    scaler = MinMaxScaler()
    scaled_rows = scaler.fit_transform(df.values)
    windows, targets = create_sequences(scaled_rows, n_steps_in, n_steps_out)
    split_parts = train_test_split(
        windows, targets, test_size=test_size, random_state=42
    )
    # train_test_split yields X_train, X_test, y_train, y_test in that order.
    X_train, X_test, y_train, y_test = (
        torch.tensor(part, dtype=torch.float32) for part in split_parts
    )
    return X_train, y_train, X_test, y_test, scaler

def train_model(model, X_train_tensor, y_train_tensor, epochs, optimizer, loss_function):
    """
    Train ``model`` for ``epochs`` full-batch optimisation steps.

    Each epoch runs one forward pass over all of ``X_train_tensor``, compares
    the prediction with the first target step (``y_train_tensor[:, 0, :]``),
    back-propagates and applies one optimizer step. The loss is written to
    the progress bar output on every tenth epoch, starting with epoch 0.

    Returns nothing; ``model`` and ``optimizer`` are updated in place.
    """
    progress = tqdm(range(epochs))
    for epoch in progress:
        model.train()
        optimizer.zero_grad()
        predicted = model(X_train_tensor)
        first_step_target = y_train_tensor[:, 0, :]
        batch_loss = loss_function(predicted, first_step_target)
        batch_loss.backward()
        optimizer.step()
        if epoch % 10 != 0:
            continue
        tqdm.write(f'epoch: {epoch} loss: {batch_loss.item():.8f}')

def evaluate_model(model, X_test_tensor, y_test_tensor, scaler):
    """
    Score ``model`` on the test tensors and print the error metrics.

    Predictions and the first target step (``y_test_tensor[:, 0, :]``) are
    mapped back through ``scaler.inverse_transform`` before MSE, RMSE and MAE
    are computed, so the metrics are in the scaler's original units and are
    computed over all columns together.

    Returns the tuple ``(mse, rmse, mae)``.
    """
    model.eval()
    with torch.no_grad():
        predicted_scaled = model(X_test_tensor).numpy()
        actual_scaled = y_test_tensor[:, 0, :].numpy()

    predicted = scaler.inverse_transform(predicted_scaled)
    actual = scaler.inverse_transform(actual_scaled)

    mse = mean_squared_error(actual, predicted)
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)

    print(f'Mean Squared Error (MSE): {mse:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')

    return mse, rmse, mae

# Usage: train and evaluate the LSTM on windows cut from `df`
# (`df` is the DataFrame defined in an earlier cell).
SEED = 42
# Seed torch before the model is built so weight initialisation -- and with
# it the printed losses and metrics -- is repeatable on a fresh kernel run.
torch.manual_seed(SEED)

n_steps_in, n_steps_out = 3, 1
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor, scaler = preprocess_data(df, n_steps_in, n_steps_out)

# Every column of `df` is used both as an input feature and as a prediction
# target, so the model's input and output widths are the same.
n_features = X_train_tensor.shape[2]
model = LSTMModel(input_size=n_features, hidden_layer_size=50, output_size=n_features)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_function = nn.MSELoss()

train_model(model, X_train_tensor, y_train_tensor, epochs=100, optimizer=optimizer, loss_function=loss_function)
evaluate_model(model, X_test_tensor, y_test_tensor, scaler)


  9%|▉         | 9/100 [00:00<00:02, 37.74it/s]

epoch: 0 loss: 0.19935077


 18%|█▊        | 18/100 [00:00<00:02, 38.70it/s]

epoch: 10 loss: 0.14217064


 27%|██▋       | 27/100 [00:00<00:01, 39.45it/s]

epoch: 20 loss: 0.08213406


 36%|███▌      | 36/100 [00:00<00:01, 39.67it/s]

epoch: 30 loss: 0.03278566


 49%|████▉     | 49/100 [00:01<00:01, 40.74it/s]

epoch: 40 loss: 0.02085068


 59%|█████▉    | 59/100 [00:01<00:00, 41.71it/s]

epoch: 50 loss: 0.01822294


 69%|██████▉   | 69/100 [00:01<00:00, 41.36it/s]

epoch: 60 loss: 0.01719114


 79%|███████▉  | 79/100 [00:01<00:00, 41.30it/s]

epoch: 70 loss: 0.01634126


 89%|████████▉ | 89/100 [00:02<00:00, 38.49it/s]

epoch: 80 loss: 0.01578575


 98%|█████████▊| 98/100 [00:02<00:00, 39.18it/s]

epoch: 90 loss: 0.01518574


100%|██████████| 100/100 [00:02<00:00, 39.65it/s]

Mean Squared Error (MSE): 146378964992.00
Root Mean Squared Error (RMSE): 382595.04
Mean Absolute Error (MAE): 32360.24





(146378960000.0, 382595.0404696851, 32360.24)

In [8]:
# Build the per-case aggregate table for dataset "HA" in "val" mode, using the
# same pipeline as for the training table.
eval_df = convert_df("HA", "val")


Unnamed: 0,case_id,max_delta_position,min_delta_position,max_delta_velocity,min_delta_velocity,max_delta_acceleration,min_delta_acceleration,max_TTC,min_TTC
count,3055.0,3055.0,3055.0,3055.0,3055.0,3055.0,3055.0,3055.0,3055.0
mean,1527.0,25.604352,12.711291,1.893284,-1.982638,1.211879,-1.439456,64149.2,166717.0
std,882.046862,12.007657,7.03105,1.269694,1.312555,0.623939,1.44142,481153.9,6365825.0
min,0.0,8.330992,4.075634,-2.195163,-14.847751,0.071372,-9.440815,-43.55857,3.169219
25%,763.5,18.141669,7.413019,0.839536,-2.594725,0.727187,-1.472621,1120.267,1139.718
50%,1527.0,21.993881,11.463593,1.771939,-1.838834,1.150867,-1.063833,3774.45,3867.598
75%,2290.5,28.660308,15.838648,2.796353,-1.069155,1.612258,-0.75698,16828.61,15967.79
max,3054.0,84.906824,70.405945,11.338088,0.785634,6.170332,0.012248,16133990.0,351385600.0
