In [1]:
import json

# Read the whole JSON document from disk and parse it into the variable "data"
with open('data_entries_ti_uniform.json', mode='r', encoding='UTF-8') as file:
    data = json.loads(file.read())

In [None]:
# Import custom feature engineering functions
from feature_engineering_functions import calculate_time_differences, calculate_distances, calculate_edge_types, calculate_speeds, calculate_directions, calculate_pressure_diffs, calculate_fraction_pen_on

In [None]:
# Create pandas dataframe from the JSON file: simpler exploring, feature engineering, and filtering with a dataframe. 
import pandas as pd
import numpy as np

# Column order of the per-recording dataframe (one row per entry in "data").
df_columns = ["participant", "task", "label", "x_coords", "y_coords",
              "pressure", "times", "distances", "edgetypes",
              "speeds", "directions", "pressure_diffs", "fractions"]

# Collect one record per entry and build the dataframe once at the end.
# Growing a dataframe with df.loc[len(df)] = ... re-allocates on every insert.
records = []
for entry in data:
    participant_id = entry["participant"]
    task = entry["task"]
    # Participants whose id starts with "H" get label 0, everyone else label 1.
    class_label = 0 if participant_id.startswith("H") else 1

    # Keep only samples whose third component (pressure) is > 0, remembering
    # their positions in the original, unfiltered sample list.
    indices = [i for i, row in enumerate(entry["data"]) if row[2] > 0]
    filtered_data = [entry["data"][i] for i in indices]

    # Node features: one value per retained sample.
    x_coords = [row[0] for row in filtered_data]
    y_coords = [row[1] for row in filtered_data]
    pressure = [row[2] for row in filtered_data]

    # Engineered edge features.
    # NOTE(review): times and edgetypes are computed from the unfiltered
    # entry["data"], while the remaining features use the filtered samples;
    # presumably the helpers apply the same pressure filter internally -- confirm.
    times = calculate_time_differences(entry["data"])
    distances = calculate_distances(x_coords, y_coords)
    edgetypes = calculate_edge_types(entry["data"])
    speeds = calculate_speeds(distances, times)
    directions = calculate_directions(x_coords, y_coords)
    pressure_diffs = calculate_pressure_diffs(pressure)
    fractions = calculate_fraction_pen_on(indices)

    records.append({
        "participant": participant_id,
        "task": task,
        "label": class_label,
        "x_coords": x_coords,
        "y_coords": y_coords,
        "pressure": pressure,
        "times": times,
        "distances": distances,
        "edgetypes": edgetypes,
        "speeds": speeds,
        "directions": directions,
        "pressure_diffs": pressure_diffs,
        "fractions": fractions,
    })

# Passing columns= keeps the column order (and the columns themselves when
# "data" is empty) identical to the previous row-by-row construction.
df = pd.DataFrame(records, columns=df_columns)


In [None]:
# Drop recordings whose x_coords list is empty (no sample survived the pressure filter)
df_filtered = df.loc[df["x_coords"].apply(len).gt(0)]

In [None]:
# Distinct task identifiers, in order of first appearance
task_column = df_filtered['task']
tasks = task_column.unique()
# Dictionary keyed by task identifier; each value is the sub-dataframe
# containing only the rows recorded for that task.
task_dfs = {task: df_filtered[task_column == task] for task in tasks}
print(f"Number of tasks: {len(tasks)}")

Number of tasks: 25


In [6]:
from sklearn.model_selection import train_test_split

# Partition every task's rows into train / validation / test subsets.
task_splits = {}
for task, task_df in task_dfs.items():
    # Hold out 15% of the task's rows for testing first ...
    train_val_df, test_df = train_test_split(task_df, random_state=5, test_size=0.15)
    # ... then take 17.65% of the remaining 85% (0.1765 * 0.85 ≈ 0.15 of the total) for validation.
    train_df, val_df = train_test_split(train_val_df, random_state=5, test_size=0.1765)
    task_splits[task] = dict(train=train_df, val=val_df, test=test_df)
    print(f"Task {task}: Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")

Task T15: Train size: 117, Val size: 26, Test size: 26
Task T16: Train size: 118, Val size: 26, Test size: 26
Task T17: Train size: 118, Val size: 26, Test size: 26
Task T18: Train size: 118, Val size: 26, Test size: 26
Task T19: Train size: 97, Val size: 21, Test size: 21
Task T20: Train size: 114, Val size: 25, Test size: 25
Task T21: Train size: 111, Val size: 25, Test size: 24
Task T22: Train size: 114, Val size: 25, Test size: 25
Task T23: Train size: 115, Val size: 25, Test size: 25
Task T24: Train size: 113, Val size: 25, Test size: 25
Task T25: Train size: 111, Val size: 25, Test size: 25
Task T1: Train size: 120, Val size: 26, Test size: 26
Task T2: Train size: 119, Val size: 26, Test size: 26
Task T3: Train size: 120, Val size: 26, Test size: 26
Task T4: Train size: 121, Val size: 26, Test size: 26
Task T10: Train size: 118, Val size: 26, Test size: 26
Task T11: Train size: 118, Val size: 26, Test size: 26
Task T12: Train size: 117, Val size: 26, Test size: 26
Task T13: Train

In [None]:
def skewness_corrections(train_df, val_df, test_df):
    """Add skewness-corrected ``*_transformed`` feature columns to each split.

    The offsets used by the pressure transforms (maximum pressure and minimum
    pressure difference) are fitted on the training split only and then
    reused unchanged for the validation and test splits.

    Parameters
    ----------
    train_df, val_df, test_df : pandas.DataFrame
        Frames with the columns ``x_coords``, ``y_coords``, ``directions``,
        ``times``, ``distances``, ``speeds``, ``pressure``, ``pressure_diffs``,
        ``fractions`` and ``edgetypes``; every cell holds a sequence of values.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)`` copies of the inputs with one additional
        ``<feature>_transformed`` column per feature. The inputs themselves
        are not modified. Empty sequences are passed through unchanged.
    """
    import numpy as np
    from scipy.special import logit

    def flatten_column(df, column):
        # Concatenate all per-row sequences of a column into one flat array.
        return np.concatenate(df[column].values) if not df[column].empty else np.array([])

    # Fit parameters on training set.
    # NOTE(review): 2047 and -1315 are fallbacks used only when the training
    # split holds no values; presumably dataset-wide extremes -- confirm.
    pressure_train = flatten_column(train_df, 'pressure')
    max_pressure = np.max(pressure_train) if len(pressure_train) > 0 else 2047
    pressure_diffs_train = flatten_column(train_df, 'pressure_diffs')
    min_pressure_diffs = np.min(pressure_diffs_train) if len(pressure_diffs_train) > 0 else -1315

    # Transform function
    def transform_df(df):
        df_transformed = df.copy()

        # No transformation for mild skewness
        df_transformed['x_coords_transformed'] = df['x_coords']
        df_transformed['y_coords_transformed'] = df['y_coords']
        df_transformed['directions_transformed'] = df['directions']

        # Log for positive skewness with small constant to avoid log(0)
        df_transformed['times_transformed'] = df['times'].apply(
            lambda x: np.log(np.array(x) + 1e-6) if len(x) > 0 else x
        )
        df_transformed['distances_transformed'] = df['distances'].apply(
            lambda x: np.log(np.array(x) + 1e-6) if len(x) > 0 else x
        )
        df_transformed['speeds_transformed'] = df['speeds'].apply(
            lambda x: np.log(np.array(x) + 1e-6) if len(x) > 0 else x
        )

        # Reflect (around the training maximum) and log for negative skewness.
        # Validation/test pressures above the training maximum would make the
        # log argument non-positive (NaN / -inf), so it is floored at 1e-6,
        # the same guard the pressure_diffs transform below uses.
        df_transformed['pressure_transformed'] = df['pressure'].apply(
            lambda x: np.log(np.maximum(max_pressure - np.array(x) + 1, 1e-6)) if len(x) > 0 else x
        )
        # Shift by the training minimum so the argument starts at 1, then log;
        # values below the training minimum are floored at 1e-6.
        df_transformed['pressure_diffs_transformed'] = df['pressure_diffs'].apply(
            lambda x: np.log(np.maximum(np.array(x) - min_pressure_diffs + 1, 1e-6)) if len(x) > 0 else x
        )

        # Logit for fractions, clipped to [0.01, 0.99] to keep the result finite
        df_transformed['fractions_transformed'] = df['fractions'].apply(
            lambda x: logit(np.clip(np.array(x), 0.01, 0.99)) if len(x) > 0 else x
        )

        # No change for edgetypes (binary values)
        df_transformed['edgetypes_transformed'] = df['edgetypes']

        return df_transformed

    # Apply to all sets
    train_df_transformed = transform_df(train_df)
    val_df_transformed = transform_df(val_df)
    test_df_transformed = transform_df(test_df)

    return train_df_transformed, val_df_transformed, test_df_transformed

import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def apply_scaling_after_transform(train_df, val_df, test_df):
    """Scale the ``*_transformed`` feature columns, fitting on the training split only.

    For every feature a ``<column>_scaled`` column is written into all three
    frames: coordinates are min-max scaled, the remaining continuous features
    are standardised, and ``edgetypes`` / ``directions`` are copied as-is.
    NaN is replaced by 0 and values are limited to [-1e10, 1e10] before fitting
    and before transforming. Empty sequences are passed through unchanged.

    The three dataframes are modified in place and also returned as
    ``(train_df, val_df, test_df)``.
    """
    bound = 1e10
    all_splits = [train_df, val_df, test_df]

    # Which scaler (if any) each transformed column goes through
    norm_cols = ['x_coords_transformed', 'y_coords_transformed']
    std_cols = ['pressure_transformed', 'times_transformed', 'distances_transformed',
                'speeds_transformed', 'pressure_diffs_transformed', 'fractions_transformed']
    no_scale_cols = ['edgetypes_transformed', 'directions_transformed']

    minmax_scaler = MinMaxScaler()
    standard_scaler = StandardScaler()

    def fit_then_scale(scaler, col):
        # Fit on the pooled training values of this column ...
        pooled = np.concatenate(train_df[col].values)
        pooled = np.clip(pooled, -bound, bound)
        pooled = np.nan_to_num(pooled, nan=0.0, posinf=bound, neginf=-bound)
        scaler.fit(pooled.reshape(-1, 1))

        def scale_sequence(x):
            if len(x) == 0:
                return x
            cleaned = np.clip(np.nan_to_num(x, nan=0.0, posinf=bound, neginf=-bound), -bound, bound)
            return scaler.transform(cleaned.reshape(-1, 1)).flatten()

        # ... then transform every split with the freshly fitted scaler.
        for df in all_splits:
            df[f'{col}_scaled'] = df[col].apply(scale_sequence)

    for col in norm_cols:
        fit_then_scale(minmax_scaler, col)

    for col in std_cols:
        fit_then_scale(standard_scaler, col)

    # Features that are deliberately left unscaled
    for col in no_scale_cols:
        for df in all_splits:
            df[f'{col}_scaled'] = df[col]

    return train_df, val_df, test_df

In [None]:
# Run the two preprocessing stages (skewness correction, then scaling) per task
task_processed = {}
for task, splits in task_splits.items():
    corrected_splits = skewness_corrections(splits['train'], splits['val'], splits['test'])
    train_df, val_df, test_df = apply_scaling_after_transform(*corrected_splits)
    task_processed[task] = dict(train=train_df, val=val_df, test=test_df)

In [None]:
import numpy as np
import torch
from torch_geometric.data import Data

def create_data_list(df):
    """Turn every dataframe row into a chain-shaped torch_geometric ``Data`` graph.

    Nodes carry (x, y, pressure); consecutive nodes ``i -> i + 1`` are joined
    by an edge carrying seven attributes. Rows whose number of edge-attribute
    entries differs from ``num_nodes - 1`` are reported and skipped. Rows with
    at most one node get empty edge tensors. The row's ``label`` becomes ``y``.

    Returns the list of constructed graphs.
    """
    node_columns = (
        'x_coords_transformed_scaled',
        'y_coords_transformed_scaled',
        'pressure_transformed_scaled',
    )
    edge_columns = (
        'times_transformed_scaled',
        'distances_transformed_scaled',
        'edgetypes_transformed_scaled',
        'speeds_transformed_scaled',
        'directions_transformed_scaled',
        'pressure_diffs_transformed_scaled',
        'fractions_transformed_scaled',
    )

    graphs = []
    for _, row in df.iterrows():
        node_features = np.array(list(zip(*(row[name] for name in node_columns))))
        x = torch.tensor(node_features, dtype=torch.float)
        num_nodes = len(row['x_coords_transformed_scaled'])

        if num_nodes <= 1:
            # Nothing to connect: empty edge tensors with the expected widths
            edge_attr = torch.empty((0, 7), dtype=torch.float)
            edge_index = torch.empty((2, 0), dtype=torch.long)
        else:
            edge_attr = np.array(list(zip(*(row[name] for name in edge_columns))))

            # Sanity check: a chain of n nodes must have exactly n - 1 edges
            expected_edges = num_nodes - 1
            if edge_attr.shape[0] != expected_edges:
                print(f"Warning: Graph with {num_nodes} nodes has {edge_attr.shape[0]} edges, expected {expected_edges}. Skipping.")
                continue

            edge_attr = torch.tensor(edge_attr, dtype=torch.float)
            chain_pairs = list(zip(range(num_nodes - 1), range(1, num_nodes)))
            edge_index = torch.tensor(chain_pairs, dtype=torch.long).t()

        y = torch.tensor([row['label']], dtype=torch.float)
        graphs.append(Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y))
    return graphs

In [12]:
# Convert every task's processed splits into lists of graphs
task_graphs = {}
for task, processed in task_processed.items():
    graph_list_train, graph_list_val, graph_list_test = (
        create_data_list(processed[split_name]) for split_name in ('train', 'val', 'test')
    )
    task_graphs[task] = dict(train=graph_list_train, val=graph_list_val, test=graph_list_test)
    print(f"Task {task}: Train graphs: {len(graph_list_train)}, Val graphs: {len(graph_list_val)}, Test graphs: {len(graph_list_test)}")

Task T15: Train graphs: 117, Val graphs: 26, Test graphs: 26
Task T16: Train graphs: 118, Val graphs: 26, Test graphs: 26
Task T17: Train graphs: 118, Val graphs: 26, Test graphs: 26
Task T18: Train graphs: 118, Val graphs: 26, Test graphs: 26
Task T19: Train graphs: 97, Val graphs: 21, Test graphs: 21
Task T20: Train graphs: 114, Val graphs: 25, Test graphs: 25
Task T21: Train graphs: 111, Val graphs: 25, Test graphs: 24
Task T22: Train graphs: 114, Val graphs: 25, Test graphs: 25
Task T23: Train graphs: 115, Val graphs: 25, Test graphs: 25
Task T24: Train graphs: 113, Val graphs: 25, Test graphs: 25
Task T25: Train graphs: 111, Val graphs: 25, Test graphs: 25
Task T1: Train graphs: 120, Val graphs: 26, Test graphs: 26
Task T2: Train graphs: 119, Val graphs: 26, Test graphs: 26
Task T3: Train graphs: 120, Val graphs: 26, Test graphs: 26
Task T4: Train graphs: 121, Val graphs: 26, Test graphs: 26
Task T10: Train graphs: 118, Val graphs: 26, Test graphs: 26
Task T11: Train graphs: 118, 

In [None]:
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv, global_mean_pool
import torch.nn as nn
import torch.nn.functional as F

# Define the GAT model (unchanged)
class GATModel(nn.Module):
    """Three-layer graph attention network producing ``out_channels`` values per graph.

    Each ``GATConv`` layer receives the edge attributes; node embeddings are
    mean-pooled per graph and passed through a final linear layer. The output
    is not passed through a sigmoid, i.e. it is a raw score (logit).

    Parameters
    ----------
    in_channels : int
        Number of input features per node.
    hidden_channels : int
        Output size of each attention head.
    out_channels : int
        Number of values produced per graph.
    heads : int
        Number of attention heads per layer; the following layer takes
        ``hidden_channels * heads`` input features.
    edge_dim : int
        Number of attributes per edge (previously hard-coded to 7).
    """

    def __init__(self, in_channels=3, hidden_channels=16, out_channels=1, heads=4, edge_dim=7):
        super().__init__()
        # Layers are created in the same order as before so parameter
        # initialisation and saved state-dict keys are unchanged.
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, edge_dim=edge_dim)
        self.conv2 = GATConv(hidden_channels * heads, hidden_channels, heads=heads, edge_dim=edge_dim)
        self.conv3 = GATConv(hidden_channels * heads, hidden_channels, heads=heads, edge_dim=edge_dim)
        self.lin = nn.Linear(hidden_channels * heads, out_channels)

    def forward(self, data):
        """Return one row of ``out_channels`` scores per graph in the batch ``data``."""
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        x = F.relu(self.conv1(x, edge_index, edge_attr))
        x = F.relu(self.conv2(x, edge_index, edge_attr))
        x = F.relu(self.conv3(x, edge_index, edge_attr))
        # Average the node embeddings of each graph into one vector per graph
        x = global_mean_pool(x, batch)
        x = self.lin(x)
        return x

# Training function
def train(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader`` and return the mean loss per graph.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(batch)``; expected to return one score per graph
        with a trailing dimension of size 1.
    loader : iterable
        Yields batches exposing ``.to(device)``, ``.y`` and ``.num_graphs``.
    optimizer : torch.optim.Optimizer
        Optimizer over the model's parameters.
    criterion : callable
        Loss taking ``(scores, targets)``.
    device : torch.device
        Device the batches are moved to.

    Returns
    -------
    float
        Training loss averaged over graphs, computed the same way as in
        ``evaluate`` so the two numbers are directly comparable.
    """
    model.train()
    total_loss = 0.0
    total_graphs = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
        # squeeze(-1) removes only the trailing score dimension, so a batch
        # holding a single graph keeps its batch dimension.
        loss = criterion(out.squeeze(-1), data.y)
        loss.backward()
        optimizer.step()
        # Assuming the criterion averages over the batch (the default
        # reduction of BCEWithLogitsLoss), weight it by the batch size so a
        # smaller final batch does not skew the epoch average.
        total_loss += loss.item() * data.num_graphs
        total_graphs += data.num_graphs
    return total_loss / total_graphs

# Evaluation function
def evaluate(model, loader, criterion, device):
    """Compute the mean loss per graph and the accuracy of ``model`` on ``loader``.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(batch)``; expected to return one score per graph
        with a trailing dimension of size 1.
    loader : iterable
        Yields batches exposing ``.to(device)``, ``.y`` and ``.num_graphs``.
    criterion : callable
        Loss taking ``(scores, targets)``.
    device : torch.device
        Device the batches are moved to.

    Returns
    -------
    tuple of float
        ``(avg_loss, accuracy)``. A graph is predicted positive when its
        score is greater than 0.
    """
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            out = model(data)
            # squeeze(-1) removes only the trailing score dimension. A plain
            # squeeze() would also drop the batch dimension of a single-graph
            # batch, and the loss would then reject the shape mismatch with
            # data.y.
            scores = out.squeeze(-1)
            loss = criterion(scores, data.y)
            total_loss += loss.item() * data.num_graphs
            pred = (scores > 0).float()
            correct += (pred == data.y).sum().item()
            total += data.y.size(0)
    avg_loss = total_loss / total
    accuracy = correct / total
    return avg_loss, accuracy

# Train one independent model per task, stopping early once the validation
# loss has failed to improve for `patience` consecutive epochs.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
task_models = {}
patience = 5

for task, graphs in task_graphs.items():
    print(f"\nTraining model for Task {task}")
    train_loader = DataLoader(graphs['train'], batch_size=32, shuffle=True)
    val_loader = DataLoader(graphs['val'], batch_size=32, shuffle=False)
    test_loader = DataLoader(graphs['test'], batch_size=32, shuffle=False)

    model = GATModel().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    criterion = nn.BCEWithLogitsLoss()

    best_val_loss = float('inf')
    counter = 0  # epochs since the validation loss last improved
    for epoch in range(1, 201):
        train_loss = train(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)
        print(f"Task: {task}, Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

        if val_loss < best_val_loss:
            # New best: checkpoint the weights and reset the patience counter
            best_val_loss = val_loss
            torch.save(model.state_dict(), f'best_model_{task}.pth')
            counter = 0
            continue

        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch} for task {task}")
            break

    # The stored model holds the last-epoch weights; the best weights are in the checkpoint file
    task_models[task] = model


Training model for Task T15
Task: T15, Epoch: 001, Train Loss: 0.6982, Val Loss: 0.6731, Val Acc: 0.7692
Task: T15, Epoch: 002, Train Loss: 0.6841, Val Loss: 0.6418, Val Acc: 0.7692
Task: T15, Epoch: 003, Train Loss: 0.6806, Val Loss: 0.6177, Val Acc: 0.7308
Task: T15, Epoch: 004, Train Loss: 0.6616, Val Loss: 0.6060, Val Acc: 0.6154
Task: T15, Epoch: 005, Train Loss: 0.6742, Val Loss: 0.5703, Val Acc: 0.7692
Task: T15, Epoch: 006, Train Loss: 0.6503, Val Loss: 0.5721, Val Acc: 0.6923
Task: T15, Epoch: 007, Train Loss: 0.6504, Val Loss: 0.5345, Val Acc: 0.8462
Task: T15, Epoch: 008, Train Loss: 0.6572, Val Loss: 0.4889, Val Acc: 0.8077
Task: T15, Epoch: 009, Train Loss: 0.6326, Val Loss: 0.5574, Val Acc: 0.7308
Task: T15, Epoch: 010, Train Loss: 0.6366, Val Loss: 0.5152, Val Acc: 0.9231
Task: T15, Epoch: 011, Train Loss: 0.6144, Val Loss: 0.4449, Val Acc: 0.9231
Task: T15, Epoch: 012, Train Loss: 0.6200, Val Loss: 0.4263, Val Acc: 0.8462
Task: T15, Epoch: 013, Train Loss: 0.6013, Val 

In [15]:
# Evaluate every task's checkpoint with the lowest validation loss on its test split.
# The loss is created here so this cell does not depend on a variable that
# only exists as a leftover of the training loop's last iteration.
criterion = nn.BCEWithLogitsLoss()
for task, model in task_models.items():
    # map_location makes the checkpoint loadable on the current device even
    # if it was written from a different one (e.g. saved on GPU, loaded on CPU).
    model.load_state_dict(torch.load(f'best_model_{task}.pth', map_location=device))
    test_loader = DataLoader(task_graphs[task]['test'], batch_size=32, shuffle=False)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)
    print(f"Task: {task}, Final Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

Task: T15, Final Test Loss: 0.5458, Test Accuracy: 0.7308
Task: T16, Final Test Loss: 0.5253, Test Accuracy: 0.7692
Task: T17, Final Test Loss: 0.6035, Test Accuracy: 0.6538
Task: T18, Final Test Loss: 0.5878, Test Accuracy: 0.7692
Task: T19, Final Test Loss: 0.6069, Test Accuracy: 0.7619
Task: T20, Final Test Loss: 0.6074, Test Accuracy: 0.7200
Task: T21, Final Test Loss: 0.4576, Test Accuracy: 0.9167
Task: T22, Final Test Loss: 0.5842, Test Accuracy: 0.7600
Task: T23, Final Test Loss: 0.6527, Test Accuracy: 0.6000
Task: T24, Final Test Loss: 0.6231, Test Accuracy: 0.6400
Task: T25, Final Test Loss: 0.6161, Test Accuracy: 0.6800
Task: T1, Final Test Loss: 0.6147, Test Accuracy: 0.6538
Task: T2, Final Test Loss: 0.5818, Test Accuracy: 0.8077
Task: T3, Final Test Loss: 0.6695, Test Accuracy: 0.6538
Task: T4, Final Test Loss: 0.6005, Test Accuracy: 0.7308
Task: T10, Final Test Loss: 0.6273, Test Accuracy: 0.5385
Task: T11, Final Test Loss: 0.6524, Test Accuracy: 0.5769
Task: T12, Final T