# dynamic path

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OrdinalEncoder

from tab_transformer_pytorch import TabTransformer  # make sure this is installed
import torch.nn.functional as F


# ---------------------------------------------------
# 0. Dynamic “parent” directory and path helper
# ---------------------------------------------------
# Base directory for every relative path below. abspath("") resolves to the
# current working directory of the running process.
parent = os.path.abspath("")
print(parent)  # For debugging; can be removed in production


def make_path(relative_path: str) -> str:
    """
    Resolve `relative_path` against the module-level `parent` directory.

    Example: make_path("data/train_specifications.csv")
    yields "<absolute_parent>/data/train_specifications.csv".
    """
    resolved = os.path.join(parent, relative_path)
    return resolved

# ---------------------------------------------------
# 1. Prepare your raw data
# ---------------------------------------------------

def create_X_y(csv_path, sensor_features, context=70, verbose=True):
    """
    Build sliding-window samples from a per-vehicle time-series CSV.

    The CSV is expected to contain the columns 'vehicle_id', 'time_step', 'RUL'
    and every name in `sensor_features`. Rows are grouped by 'vehicle_id',
    ordered by 'time_step', and every run of `context` consecutive rows forms
    one window (stride 1). A window is labelled with the RUL of its last row.

    Args:
      csv_path: path of the CSV file to read.
      sensor_features: list of column names used as window features.
      context: window length in rows; must be >= 1.
      verbose: when True, print skipped vehicles and a final summary.

    Returns:
      - X:  numpy array of shape (N, context, num_features)
      - y:  numpy array of shape (N,) of RUL labels
      - vids: numpy array of shape (N,) of vehicle_ids, one per window

    Raises:
      ValueError: if `context` < 1, or if no vehicle has at least `context`
        rows (there would be nothing to stack).
    """
    # A non-positive window length would silently produce empty windows whose
    # label index wraps around to the end of the series.
    if context < 1:
        raise ValueError(f"context must be >= 1, got {context}")

    df = pd.read_csv(csv_path)
    X, y, vids = [], [], []
    for vehicle_id, group in df.groupby('vehicle_id'):
        group = group.sort_values('time_step')
        data = group[sensor_features].values
        rul = group['RUL'].values
        n_windows = len(data) - context + 1
        if n_windows < 1:
            if verbose:
                print(f"Skipping vehicle {vehicle_id}: {len(data)} < {context}")
            continue
        for start in range(n_windows):
            X.append(data[start : start + context])
        # Window k ends at row k + context - 1, so its label is rul[k + context - 1].
        y.extend(rul[context - 1 :])
        vids.extend([vehicle_id] * n_windows)

    # np.stack([]) fails with an unrelated-looking message; explain the real cause.
    if not X:
        raise ValueError(
            f"No vehicle in {csv_path!r} has at least context={context} rows; "
            "no windows could be built"
        )

    X = np.stack(X)      # (N, context, num_features)
    y = np.array(y)      # (N,)
    vids = np.array(vids)
    if verbose:
        print(f"Total windows: {len(X)}, window shape: {X.shape[1:]}")
    return X, y, vids

# Define sensor features exactly as before
# Column names follow the pattern "<sensor_id>_<index>". Each pair below lists a
# sensor id and how many consecutive indices (starting at 0) it contributes;
# the expansion yields 105 names in the order the pairs are written.
sensor_features = [
    f"{sensor_id}_{index}"
    for sensor_id, count in (
        ("171", 1), ("666", 1), ("427", 1), ("837", 1),
        ("167", 10), ("309", 1), ("272", 10), ("835", 1), ("370", 1),
        ("291", 11), ("158", 10), ("100", 1), ("459", 20), ("397", 36),
    )
    for index in range(count)
]

# Replace the hard‐coded path with a dynamic one
csv_path = make_path(os.path.join("data", "super_same_norm.csv"))
X_windows, y_labels, window_vids = create_X_y(csv_path, sensor_features, context=70)

# ---------------------------------------------------
# 1.1 Load & ordinal-encode the vehicle specs
# ---------------------------------------------------
spec_csv_rel = os.path.join("data", "train_specifications.csv")
spec_df = pd.read_csv(make_path(spec_csv_rel))

spec_columns = [f"Spec_{i}" for i in range(8)]
# Replace every distinct spec value with an integer code; the per-column
# category lists remain available afterwards as encoder.categories_.
encoder = OrdinalEncoder()
spec_df[spec_columns] = encoder.fit_transform(spec_df[spec_columns])

# Build matrix of specs per window (row i belongs to window i).
# validate="many_to_one" makes pandas raise if a vehicle_id occurs more than
# once in spec_df; duplicates would add rows and break the one-row-per-window
# alignment with X_windows.
specs_merged = pd.DataFrame({"vehicle_id": window_vids}).merge(
    spec_df[["vehicle_id"] + spec_columns],
    on="vehicle_id",
    how="left",
    validate="many_to_one",
)[spec_columns]

# A left merge fills vehicles without a spec row with NaN, and NaN cannot be
# cast to a valid category index, so fail loudly instead of training on garbage.
missing_spec_rows = specs_merged.isna().any(axis=1).to_numpy()
if missing_spec_rows.any():
    raise ValueError(
        "No complete specification row for vehicle_id(s): "
        f"{sorted(set(window_vids[missing_spec_rows].tolist()))}"
    )

specs_per_window = specs_merged.values.astype(int)
# specs_per_window: shape (N, 8)

# ---------------------------------------------------
# 2. Define the Dataset
# ---------------------------------------------------

class RULCombinedDataset(Dataset):
    """
    Map-style dataset pairing each sensor window with its spec codes and label.

    Indexing yields the tuple (x_categ, x_ts, y), in that order:
      - x_categ: categorical spec codes as an int64 tensor, one entry per spec column
      - x_ts: the time-series window as a float32 tensor, shape (context, num_features)
      - y: the RUL label as a float32 tensor of shape (1,)
    """

    def __init__(self, windows: np.ndarray, specs: np.ndarray, labels: np.ndarray):
        super().__init__()
        # Stored as a column so that a single sample's target has shape (1,)
        self.labels = labels.reshape(-1, 1)  # (N, 1)
        self.specs = specs                   # (N, num_spec_columns)
        self.windows = windows               # (N, context, num_features)

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        window = self.windows[idx]
        spec_row = self.specs[idx]
        target = self.labels[idx]
        # Conversion happens per item; dtypes are fixed to float32 / int64.
        ts_tensor = torch.from_numpy(window).to(torch.float32)
        categ_tensor = torch.from_numpy(spec_row).to(torch.int64)
        target_tensor = torch.from_numpy(target).to(torch.float32)
        return categ_tensor, ts_tensor, target_tensor

# Split into train/validation sets
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Windows slide with stride 1, so neighbouring windows of one vehicle share
# almost all of their rows. Splitting individual windows at random would place
# near-copies of training samples in the validation set and make the validation
# MSE optimistic. Hold out whole vehicles instead: test_size=0.2 is now the
# fraction of vehicles (not of windows) assigned to validation.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(
    group_splitter.split(X_windows, y_labels, groups=window_vids)
)

Xc_train, Xc_val = X_windows[train_idx], X_windows[val_idx]
xspec_train, xspec_val = specs_per_window[train_idx], specs_per_window[val_idx]
y_train, y_val = y_labels[train_idx], y_labels[val_idx]

train_dataset = RULCombinedDataset(Xc_train, xspec_train, y_train)
val_dataset   = RULCombinedDataset(Xc_val,   xspec_val,   y_val)

BATCH_SIZE = 256
# Only the training loader reshuffles; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# ---------------------------------------------------
# 3. Define the Combined Model
# ---------------------------------------------------

class TimeSeriesEmbedder(nn.Module):
    """
    Encode a batch of sensor windows into one d_model-dim vector per window.

    Every time step is linearly projected to d_model, the sequence is passed
    through an nn.TransformerEncoder, and the encoder output at the final time
    step is returned ("last-step" pooling).
    """

    def __init__(self, num_features, d_model=128, n_heads=8, num_layers=2, dropout=0.1):
        super().__init__()
        # Per-time-step linear map: num_features -> d_model
        self.input_proj = nn.Linear(num_features, d_model)
        # `num_layers` stacked copies of one batch-first encoder layer
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=num_layers,
        )
        # Registered (and therefore part of state_dict) but not applied in forward()
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, x):
        """
        x: (batch_size, context_length, num_features)
        returns: (batch_size, d_model)
        """
        projected = self.input_proj(x)      # (batch, context, d_model)
        encoded = self.encoder(projected)   # (batch, context, d_model)
        # Keep only the representation of the last time step
        return encoded[:, -1, :]            # (batch, d_model)

class CombinedRULModel(nn.Module):
    """
    RUL regressor combining a time-series embedder with a TabTransformer.

    The sensor window is summarised by TimeSeriesEmbedder, the summary is
    layer-normalised, and the result is passed to TabTransformer as its
    continuous features next to the categorical spec codes.
    """

    def __init__(self, num_sensor_features, context_length, categories, continuous_dim, cont_mean_std=None):
        """
        - num_sensor_features: number of raw sensor channels (e.g., 105)
        - context_length: length of each time window (e.g., 70); accepted but not
                          read by this constructor
        - categories: tuple of cardinalities for each categorical spec column
        - continuous_dim: width of the embedder output; also used as the
                          TabTransformer `dim` and `num_continuous`
        - cont_mean_std: tensor of shape (continuous_dim, 2) handed to TabTransformer
                         as `continuous_mean_std`; None selects mean=0, std=1.
        """
        super().__init__()

        # 3.1 Time-series embedder (8 heads, so continuous_dim must suit that head count)
        self.tf = TimeSeriesEmbedder(
            num_features=num_sensor_features,
            d_model=continuous_dim,
            n_heads=8,
            num_layers=2,
            dropout=0.1,
        )

        # 3.2 TabTransformer
        if cont_mean_std is None:
            # Column 0 holds the means (all 0), column 1 the stds (all 1)
            cont_mean_std = torch.stack(
                (torch.zeros(continuous_dim), torch.ones(continuous_dim)),
                dim=1,
            )

        tab_kwargs = dict(
            categories=categories,            # e.g. (num_levels_spec0, ..., num_levels_spec7)
            num_continuous=continuous_dim,
            dim=continuous_dim,               # internal TabTransformer dimensionality
            dim_out=1,                        # single-value regression
            depth=6,
            heads=8,
            attn_dropout=0.1,
            ff_dropout=0.1,
            mlp_hidden_mults=(4, 2),
            mlp_act=nn.ReLU(),
            continuous_mean_std=cont_mean_std,
        )
        self.tabtf = TabTransformer(**tab_kwargs)

        # Normalises the embedder output before it reaches the TabTransformer
        self.layer_norm = nn.LayerNorm(continuous_dim)

    def forward(self, x_cat, x_ts):
        """
        x_cat:  (batch_size, 8)              # each entry is an integer spec index
        x_ts:   (batch_size, context_length, num_sensor_features)
        returns: (batch_size, 1)             # predicted RUL scalar
        """
        window_embedding = self.layer_norm(self.tf(x_ts))   # (batch_size, continuous_dim)
        return self.tabtf(x_cat, window_embedding)          # (batch_size, 1)

# ---------------------------------------------------
# 4. Instantiate model, loss, optimizer, move to GPU
# ---------------------------------------------------

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One cardinality per spec column, taken from the fitted encoder
category_sizes = tuple(map(len, encoder.categories_))  # e.g. (3, 3, 5, etc.)

NUM_SENSOR_FEATURES = len(sensor_features)   # 105
CONTEXT_LENGTH = 70
EMBED_DIM = 128

# No precomputed mean/std statistics are supplied for the embeddings
cont_mean_std_tensor = None

model = CombinedRULModel(
    num_sensor_features=NUM_SENSOR_FEATURES,
    context_length=CONTEXT_LENGTH,
    categories=category_sizes,
    continuous_dim=EMBED_DIM,
    cont_mean_std=cont_mean_std_tensor,
)
model = model.to(device)

# Regression objective and optimiser
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# ---------------------------------------------------
# 5. Training & Validation Loops
# ---------------------------------------------------

def train_one_epoch():
    """
    Run one optimisation pass over `train_loader` and return the mean loss per sample.

    Works on the module-level `model`, `optimizer`, `criterion` and `device`.
    """
    model.train()
    loss_sum = 0.0
    for batch in train_loader:
        # Batches arrive as (x_cat, x_ts, y); move all three to the model's device
        x_cat, x_ts, y = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        batch_loss = criterion(model(x_cat, x_ts), y)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the average
        # (assumes `criterion` returns a per-batch mean, as nn.MSELoss does by default)
        loss_sum += batch_loss.item() * y.size(0)

    return loss_sum / len(train_loader.dataset)

def validate_one_epoch():
    """
    Evaluate the module-level `model` on `val_loader` and return the mean loss per sample.

    The model is switched to eval mode and gradients are disabled for the pass.
    """
    model.eval()
    loss_sum = 0.0
    with torch.no_grad():
        for batch in val_loader:
            x_cat, x_ts, y = (tensor.to(device) for tensor in batch)
            batch_loss = criterion(model(x_cat, x_ts), y)
            # Same size-weighted accumulation as in training
            loss_sum += batch_loss.item() * y.size(0)

    return loss_sum / len(val_loader.dataset)

# ---------------------------------------------------
# 6. Run Training
# ---------------------------------------------------

NUM_EPOCHS = 20
best_val_loss = float("inf")
# The checkpoint location is the same for every epoch, so resolve it once
model_save_path = make_path(os.path.join("models", "best_combined_model.pt"))

for epoch in range(1, NUM_EPOCHS + 1):
    train_mse, val_mse = train_one_epoch(), validate_one_epoch()
    print(f"Epoch {epoch:02d} → Train MSE: {train_mse:.4f} | Val MSE: {val_mse:.4f}")

    # Keep only the weights of the epoch with the lowest validation loss so far
    improved = val_mse < best_val_loss
    if improved:
        best_val_loss = val_mse
        # Create the "models" directory on first use
        os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
        torch.save(model.state_dict(), model_save_path)

print("Training complete. Best validation MSE:", best_val_loss)


# hardcoded path

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OrdinalEncoder

from tab_transformer_pytorch import TabTransformer  # make sure this is installed
import torch.nn.functional as F


# ----------------------------
# 1. Prepare your raw data
# ----------------------------

def create_X_y(csv_path, sensor_features, context=70, verbose=True):
    """
    Build sliding-window samples from a per-vehicle time-series CSV.

    The CSV is expected to contain the columns 'vehicle_id', 'time_step', 'RUL'
    and every name in `sensor_features`. Rows are grouped by 'vehicle_id',
    ordered by 'time_step', and every run of `context` consecutive rows forms
    one window (stride 1). A window is labelled with the RUL of its last row.

    Args:
      csv_path: path of the CSV file to read.
      sensor_features: list of column names used as window features.
      context: window length in rows; must be >= 1.
      verbose: when True, print skipped vehicles and a final summary.

    Returns:
      - X:  numpy array of shape (N, context, num_features)
      - y:  numpy array of shape (N,) of RUL labels
      - vids: numpy array of shape (N,) of vehicle_ids, one per window

    Raises:
      ValueError: if `context` < 1, or if no vehicle has at least `context`
        rows (there would be nothing to stack).
    """
    # A non-positive window length would silently produce empty windows whose
    # label index wraps around to the end of the series.
    if context < 1:
        raise ValueError(f"context must be >= 1, got {context}")

    df = pd.read_csv(csv_path)
    X, y, vids = [], [], []
    for vehicle_id, group in df.groupby('vehicle_id'):
        group = group.sort_values('time_step')
        data = group[sensor_features].values
        rul = group['RUL'].values
        n_windows = len(data) - context + 1
        if n_windows < 1:
            if verbose:
                print(f"Skipping vehicle {vehicle_id}: {len(data)} < {context}")
            continue
        for start in range(n_windows):
            X.append(data[start : start + context])
        # Window k ends at row k + context - 1, so its label is rul[k + context - 1].
        y.extend(rul[context - 1 :])
        vids.extend([vehicle_id] * n_windows)

    # np.stack([]) fails with an unrelated-looking message; explain the real cause.
    if not X:
        raise ValueError(
            f"No vehicle in {csv_path!r} has at least context={context} rows; "
            "no windows could be built"
        )

    X = np.stack(X)      # (N, context, num_features)
    y = np.array(y)      # (N,)
    vids = np.array(vids)
    if verbose:
        print(f"Total windows: {len(X)}, window shape: {X.shape[1:]}")
    return X, y, vids

# Define sensor features exactly as you had before
# Column names follow the pattern "<sensor_id>_<index>". Each pair below lists a
# sensor id and how many consecutive indices (starting at 0) it contributes;
# the expansion yields 105 names in the order the pairs are written.
sensor_features = [
    f"{sensor_id}_{index}"
    for sensor_id, count in (
        ("171", 1), ("666", 1), ("427", 1), ("837", 1),
        ("167", 10), ("309", 1), ("272", 10), ("835", 1), ("370", 1),
        ("291", 11), ("158", 10), ("100", 1), ("459", 20), ("397", 36),
    )
    for index in range(count)
]

# NOTE(review): both paths below are absolute paths on one specific Windows
# machine; this cell only runs where these exact files exist.
csv_path = r"C:\Users\ASUS\Desktop\SCANIA\2024-34-2\Code\super_same_norm.csv"
X_windows, y_labels, window_vids = create_X_y(csv_path, sensor_features, context=70)

# Load & ordinal-encode the vehicle specs
spec_df = pd.read_csv(r"C:\Users\ASUS\Desktop\SCANIA\2024-34-2\2024-34-2\data\train_specifications.csv")
spec_columns = [f"Spec_{i}" for i in range(8)]
# Replace every distinct spec value with an integer code; the per-column
# category lists remain available afterwards as encoder.categories_.
encoder = OrdinalEncoder()
spec_df[spec_columns] = encoder.fit_transform(spec_df[spec_columns])

# Build a matrix of specs per window, aligned with X_windows ordering.
# validate="many_to_one" makes pandas raise if a vehicle_id occurs more than
# once in spec_df; duplicates would add rows and break that alignment.
specs_merged = pd.DataFrame({"vehicle_id": window_vids}).merge(
    spec_df[["vehicle_id"] + spec_columns],
    on="vehicle_id",
    how="left",
    validate="many_to_one",
)[spec_columns]

# A left merge fills vehicles without a spec row with NaN, and NaN cannot be
# cast to a valid category index, so fail loudly instead of training on garbage.
missing_spec_rows = specs_merged.isna().any(axis=1).to_numpy()
if missing_spec_rows.any():
    raise ValueError(
        "No complete specification row for vehicle_id(s): "
        f"{sorted(set(window_vids[missing_spec_rows].tolist()))}"
    )

specs_per_window = specs_merged.values.astype(int)
# specs_per_window: shape (N, 8)

# ----------------------------
# 2. Define the Dataset
# ----------------------------

class RULCombinedDataset(Dataset):
    """
    Map-style dataset pairing each sensor window with its spec codes and label.

    Indexing yields the tuple (x_categ, x_ts, y), in that order:
      - x_categ: categorical spec codes as an int64 tensor, one entry per spec column
      - x_ts: the time-series window as a float32 tensor, shape (context, num_features)
      - y: the RUL label as a float32 tensor of shape (1,)
    """

    def __init__(self, windows: np.ndarray, specs: np.ndarray, labels: np.ndarray):
        super().__init__()
        # Stored as a column so that a single sample's target has shape (1,)
        self.labels = labels.reshape(-1, 1)  # (N, 1)
        self.specs = specs                   # (N, num_spec_columns)
        self.windows = windows               # (N, context, num_features)

    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        window = self.windows[idx]
        spec_row = self.specs[idx]
        target = self.labels[idx]
        # Conversion happens per item; dtypes are fixed to float32 / int64.
        ts_tensor = torch.from_numpy(window).to(torch.float32)
        categ_tensor = torch.from_numpy(spec_row).to(torch.int64)
        target_tensor = torch.from_numpy(target).to(torch.float32)
        return categ_tensor, ts_tensor, target_tensor

# Split into train/validation sets
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Windows slide with stride 1, so neighbouring windows of one vehicle share
# almost all of their rows. Splitting individual windows at random would place
# near-copies of training samples in the validation set and make the validation
# MSE optimistic. Hold out whole vehicles instead: test_size=0.2 is now the
# fraction of vehicles (not of windows) assigned to validation.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(
    group_splitter.split(X_windows, y_labels, groups=window_vids)
)

Xc_train, Xc_val = X_windows[train_idx], X_windows[val_idx]
xspec_train, xspec_val = specs_per_window[train_idx], specs_per_window[val_idx]
y_train, y_val = y_labels[train_idx], y_labels[val_idx]

train_dataset = RULCombinedDataset(Xc_train, xspec_train, y_train)
val_dataset   = RULCombinedDataset(Xc_val,   xspec_val,   y_val)

BATCH_SIZE = 256
# Only the training loader reshuffles; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

# ----------------------------
# 3. Define the Combined Model
# ----------------------------


class TimeSeriesEmbedder(nn.Module):
    """
    Encode a batch of sensor windows into one d_model-dim vector per window.

    Every time step is linearly projected to d_model, the sequence is passed
    through an nn.TransformerEncoder, and the encoder output at the final time
    step is returned ("last-step" pooling, not a mean over time).
    """

    def __init__(self, num_features, d_model=128, n_heads=8, num_layers=2, dropout=0.1):
        super().__init__()
        # Per-time-step linear map: num_features -> d_model
        self.input_proj = nn.Linear(num_features, d_model)
        # `num_layers` stacked copies of one batch-first encoder layer
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=num_layers,
        )

    def forward(self, x):
        """
        x: (batch_size, context_length, num_features)
        returns: (batch_size, d_model)
        """
        projected = self.input_proj(x)      # (batch, context, d_model)
        encoded = self.encoder(projected)   # (batch, context, d_model)
        # Keep only the representation of the last time step
        return encoded[:, -1, :]            # (batch, d_model)







"""
x: (batch_size, context_length, num_features)
x = self.input_proj(x)   # → (batch_size, context_length, d_model)#self.input_proj linearly maps each of the num_features at every time step into a d_model-dimensional vector.
x = self.encoder(x)      # → (batch_size, context_length, d_model) #self.encoder (the nn.TransformerEncoder) processes the entire sequence of length context_length and returns another sequence of the same shape:
|
\
  >  x: (batch_size, context_length, d_model)

for each window of 70 time steps, the encoder gives you 70 separate “token” embeddings, 
each of size d_model.





Now x has shape (batch_size, context_length, d_model).
In my combined model, however, I want a single fixed-size vector per window,
not a sequence of 70 vectors.

TabTransformer expects its “continuous” input to be a 2D tensor of shape (batch_size, continuous_dim),
so it cannot be fed a 3D tensor of shape (batch_size, context_length, continuous_dim).



x = x.mean(dim=1)   # → (batch_size, d_model)
“pooling” (averaging) across all time steps. The result is a single d_model-length vector 
per training example in the batch. That vector is intended to capture, in a coarse way, 
the entire 70-step window’s information.

TransformerEncoder gives you 70 “time-step embeddings” of size d_model.
Need a single 128-dimensional embedding for each window.
Taking the arithmetic mean along dim=1 is a simple way to collapse the time dimension into one vector.

Other common pooling choices might be:
*Last time step: x = x[:, -1, :]
*Max pooling: x = x.max(dim=1).values
*Learned “classification” token: prepend a dummy token and read its embedding (like BERT’s [CLS]).




If forward() returned x as-is (without pooling), the embedder would output
a 3D tensor of shape (batch_size, 70, 128).
The downstream TabTransformer is not designed to accept a 3D input.
It expects:
*a categorical tensor of shape (batch_size, num_categorical_features)
*a continuous tensor of shape (batch_size, num_continuous_features)

If we skipped the pooling, we would need to rework TabTransformer to process 70 separate continuous vectors per example:
*Shape mismatch: TabTransformer’s signature is forward(x_cat, x_cont), where x_cont must be 2D: (batch, cont_dim).
*Architectural intent: we want each window summarized by a single embedding; returning a 70-step sequence defeats that design.

"""


















"""
As per the GitHub README of tab_transformer_pytorch, this is the reference usage:
import torch
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer

cont_mean_std = torch.randn(10, 2)

model = TabTransformer(
    categories = (10, 5, 6, 5, 8),      # tuple containing the number of unique values within each category
    num_continuous = 10,                # number of continuous values
    dim = 32,                           # dimension, paper set at 32
    dim_out = 1,                        # binary prediction, but could be anything
    depth = 6,                          # depth, paper recommended 6
    heads = 8,                          # heads, paper recommends 8
    attn_dropout = 0.1,                 # post-attention dropout
    ff_dropout = 0.1,                   # feed forward dropout
    mlp_hidden_mults = (4, 2),          # relative multiples of each hidden dimension of the last mlp to logits
    mlp_act = nn.ReLU(),                # activation for final mlp, defaults to relu, but could be anything else (selu etc)
    continuous_mean_std = cont_mean_std # (optional) - normalize the continuous values before layer norm
)

x_categ = torch.randint(0, 5, (1, 5))     # category values, from 0 - max number of categories, in the order as passed into the constructor above
x_cont = torch.randn(1, 10)               # assume continuous values are already normalized individually

pred = model(x_categ, x_cont) # (1, 1)"""













class CombinedRULModel(nn.Module):
    """
    RUL regressor combining a time-series embedder with a TabTransformer.

    The sensor window is summarised by TimeSeriesEmbedder, the summary is
    layer-normalised, and the result is passed to TabTransformer as its
    continuous features next to the categorical spec codes.
    """

    def __init__(self, num_sensor_features, context_length, categories, continuous_dim, cont_mean_std=None):
        """
        - num_sensor_features: number of raw sensor channels (e.g., 105)
        - context_length: length of each time window (e.g., 70); accepted but not
                          read by this constructor
        - categories: tuple of cardinalities for each categorical spec column (length 8)
        - continuous_dim: dimensionality of the embedder’s output (e.g., 128)
        - cont_mean_std: optional tensor of shape (continuous_dim, 2) forwarded to
                         TabTransformer as `continuous_mean_std`. When None, no
                         statistics are supplied and TabTransformer keeps its default.
        """
        super().__init__()

        # 3.1 TimeSeries Embedder
        self.tf = TimeSeriesEmbedder(
            num_features=num_sensor_features,
            d_model=continuous_dim,
            n_heads=8,
            num_layers=2,
            dropout=0.1
        )

        # 3.2 TabTransformer
        # `cont_mean_std` is passed straight through, so a caller-supplied tensor
        # is honoured instead of being silently dropped; None leaves the
        # library default in place.
        self.tabtf = TabTransformer(
            categories=categories,              # e.g. (num_levels_spec0, ..., num_levels_spec7)
            num_continuous=continuous_dim,      # 128
            dim=continuous_dim,                 # internal TabTransformer dimensionality
            dim_out=1,                          # single-value regression
            depth=6,
            heads=8,
            attn_dropout=0.1,
            ff_dropout=0.1,
            mlp_hidden_mults=(4, 2),
            mlp_act=nn.ReLU(),
            continuous_mean_std=cont_mean_std
        )

        # forward() applies this to the embedder output, so it has to exist;
        # leaving it undefined made the first forward pass raise AttributeError.
        self.layer_norm = nn.LayerNorm(continuous_dim)

    def forward(self, x_cat, x_ts):
        """
        x_cat:  (batch_size, 8)              # each entry is an integer spec index
        x_ts:   (batch_size, context, F)      # raw sensor window, e.g. (256, 70, 105)
        returns: (batch_size, 1)             # predicted RUL scalar
        """
        # 1) Compute the window embedding and normalise it
        all_embs = self.tf(x_ts)                # (batch_size, continuous_dim)
        all_embs = self.layer_norm(all_embs)

        # 2) Feed embeddings + categorical specs to TabTransformer
        pred = self.tabtf(x_cat, all_embs)      # (batch_size, 1)
        return pred




In [None]:
# ---------------------------------------------------
# 4. Instantiate model, loss, optimizer, move to GPU
# ---------------------------------------------------

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One cardinality per spec column, taken from the fitted encoder
category_sizes = tuple(map(len, encoder.categories_))  # (e.g. (3, 3, 5, ...))

NUM_SENSOR_FEATURES = len(sensor_features)   # 105
CONTEXT_LENGTH = 70
EMBED_DIM = 128

# Mean/std statistics of the embeddings could be estimated with a pass over the
# training set; that is skipped here and no statistics are supplied.
cont_mean_std_tensor = None

model = CombinedRULModel(
    num_sensor_features=NUM_SENSOR_FEATURES,
    context_length=CONTEXT_LENGTH,
    categories=category_sizes,
    continuous_dim=EMBED_DIM,
    cont_mean_std=cont_mean_std_tensor,
)
model = model.to(device)

# Regression objective and optimiser
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# ---------------------------------------------------
# 5. Training & Validation Loops
# ---------------------------------------------------

def train_one_epoch():
    """
    Run one optimisation pass over `train_loader` and return the mean loss per sample.

    Works on the module-level `model`, `optimizer`, `criterion` and `device`.
    """
    model.train()
    loss_sum = 0.0
    for batch in train_loader:
        # Batches arrive as (x_cat, x_ts, y); move all three to the model's device
        x_cat, x_ts, y = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        batch_loss = criterion(model(x_cat, x_ts), y)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the average
        # (assumes `criterion` returns a per-batch mean, as nn.MSELoss does by default)
        loss_sum += batch_loss.item() * y.size(0)

    return loss_sum / len(train_loader.dataset)

def validate_one_epoch():
    """
    Evaluate the module-level `model` on `val_loader` and return the mean loss per sample.

    The model is switched to eval mode and gradients are disabled for the pass.
    """
    model.eval()
    loss_sum = 0.0
    with torch.no_grad():
        for batch in val_loader:
            x_cat, x_ts, y = (tensor.to(device) for tensor in batch)
            batch_loss = criterion(model(x_cat, x_ts), y)
            # Same size-weighted accumulation as in training
            loss_sum += batch_loss.item() * y.size(0)

    return loss_sum / len(val_loader.dataset)

# ---------------------------------------------------
# 6. Run Training
# ---------------------------------------------------

NUM_EPOCHS = 20
best_val_loss = float("inf")

for epoch in range(1, NUM_EPOCHS + 1):
    train_mse, val_mse = train_one_epoch(), validate_one_epoch()
    print(f"Epoch {epoch:02d} → Train MSE: {train_mse:.4f} | Val MSE: {val_mse:.4f}")

    # Keep only the weights of the epoch with the lowest validation loss so far;
    # the checkpoint is written relative to the current working directory
    improved = val_mse < best_val_loss
    if improved:
        best_val_loss = val_mse
        torch.save(model.state_dict(), "best_combined_model.pt")

print("Training complete. Best validation MSE:", best_val_loss)

In [None]:
# Dump every parameter of `model` together with its shape and trainable flag.
# NOTE(review): printing `param` itself writes the full tensor contents, which
# makes the output very long for a model of this size.
for param in model.parameters():
    print(param, param.shape, param.requires_grad)

In [None]:
# model = YourModelClass()
# model.load_state_dict(torch.load("model.pth"))  # use the correct path
model.eval()
import matplotlib.pyplot as plt
import seaborn as sns
# Visualize weights
def visualize_weights(model):
    """
    Plot every trainable parameter of `model` whose name contains 'weight'.

    2-D parameters are drawn as a heatmap (one figure each); parameters of any
    other rank are flattened and drawn as a line plot. Each figure is shown
    immediately with plt.show().
    """
    for name, param in model.named_parameters():
        if 'weight' not in name or not param.requires_grad:
            continue
        weights = param.detach().cpu().numpy()

        plt.figure(figsize=(10, 4))
        if weights.ndim == 2:
            sns.heatmap(weights, cmap="viridis", cbar=True)
            plt.title(f'Heatmap of {name}')
            # The heatmap puts array rows on the y axis and columns on the x axis.
            # PyTorch stores nn.Linear weights as (out_features, in_features), so
            # rows are output units and columns are input units.
            plt.xlabel("Input units")
            plt.ylabel("Output units")
        else:
            # Flatten so that parameters with more than two dimensions can be
            # drawn too; 1-D parameters are unaffected.
            plt.plot(weights.ravel())
            plt.title(f'Weights of {name}')
            plt.xlabel("Index")
            plt.ylabel("Weight value")
        plt.tight_layout()
        plt.show()

visualize_weights(model)