In [1]:
import pandas as pd

# Load the dataset from the GitHub repository
url = 'https://raw.githubusercontent.com/gregversteeg/LinearCorex/master/tests/data/test_big5.csv'
df = pd.read_csv(url)

# Display basic information about the dataset
print("Number of instances in the dataset:", df.shape[0])
print("Number of columns in the dataset:", df.shape[1])
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Display additional information
print("\nData Types and Non-Null Counts:")
print(df.info())

Number of instances in the dataset: 2000
Number of columns in the dataset: 50

First 5 rows of the dataset:
   blue_q0  red_q1  green_q2  purple_q3  q4  blue_q5  red_q6  green_q7  \
0        2       0         3          1   4        1       4         1   
1        2       0         1          2   2        1       4         3   
2        3       0         2          1   3        1       4         3   
3        2       0         1          1   1        0       4         1   
4        2       0         1          1   3        0       4         3   

   purple_q8  q9  ...  blue_q40  red_q41  green_q42  purple_q43  q44  \
0          2   2  ...         3        3          3           2    3   
1          3   1  ...         2        3          2           2    3   
2          3   0  ...         4        4          2           1    4   
3          3   1  ...         1        2          2           1    3   
4          2   0  ...         3        4          1           3    4   

   blue_q45  r

In [2]:
# Identify column prefixes for each true factor
factor_columns = {
    'Factor1': [col for col in df.columns if col.startswith('blue')],
    'Factor2': [col for col in df.columns if col.startswith('green')],
    'Factor3': [col for col in df.columns if col.startswith('purple')],
    'Factor4': [col for col in df.columns if col.startswith('red')],
    'Factor5': [col for col in df.columns if col.startswith('q')]
}

# Calculate true factors by summing the respective columns
true_factors = pd.DataFrame()
for factor_name, columns in factor_columns.items():
    true_factors[factor_name] = df[columns].sum(axis=1)

# Display the first few rows of the calculated true factors
print(true_factors.head())

   Factor1  Factor2  Factor3  Factor4  Factor5
0       20       21       21       22       28
1       21       20       21       26       23
2       23       20       17       22       25
3       17       15       11       22       15
4       20       14       24       23       24


In [3]:
df = df / 4.0
# print(df.head())

In [4]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Convert the DataFrame to a NumPy array
data_array = df.to_numpy()

# Convert the data to a PyTorch tensor
data_tensor = torch.tensor(data_array, dtype=torch.float32)

# Create a PyTorch dataset
dataset = TensorDataset(data_tensor)

# Create a DataLoader for the dataset
batch_size = 32  # You can adjust the batch size as needed
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Display the shape of the tensor to verify
print(f"Data tensor shape: {data_tensor.shape}")
print(f"Number of batches: {len(dataloader)}")

Data tensor shape: torch.Size([2000, 50])
Number of batches: 63


In [10]:
# Import necessary libraries
import torch
import torch.nn as nn

# Encoder class
class Encoder(nn.Module):
    def __init__(self, input_dim, output_dim, embedding_dim, hidden_dims=[]):
        super(Encoder, self).__init__()
        # Define the layers of the MLP
        dims = [input_dim] + hidden_dims + [output_dim]
        layers = []
        for i in range(len(dims) - 1):
            layers.append(nn.Linear(dims[i], dims[i + 1]))
            if i < len(dims) - 2:
                layers.append(nn.ReLU())
        self.mlp = nn.Sequential(*layers)

        # Learnable embedding vector e (moved from Decoder to Encoder)
        self.e = nn.Parameter(torch.randn(embedding_dim))
        self.embedding_dim = embedding_dim

    def forward(self, x):
        # Pass the input through the MLP to get Z
        Z = self.mlp(x)  # Shape: (batch_size, output_dim)

        # Convert Z to \hat Z by multiplying each scalar z_i with embedding vector e
        batch_size = Z.size(0)
        output_dim = Z.size(1)
        e_expanded = self.e.unsqueeze(0).unsqueeze(0)      # Shape: (1, 1, embedding_dim)
        Z_expanded = Z.unsqueeze(2)                        # Shape: (batch_size, output_dim, 1)
        hat_Z = Z_expanded * e_expanded                    # Shape: (batch_size, output_dim, embedding_dim)

        return hat_Z

In [11]:
# Decoder class
class Decoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dims=[]):
        super(Decoder, self).__init__()
        self.input_dim = input_dim      # Number of observed variables (n)
        self.embedding_dim = embedding_dim

        # Learnable query embeddings (e1, e2, ..., en)
        self.query_embeddings = nn.Parameter(torch.randn(input_dim, embedding_dim))

        # MultiheadAttention module with 1 head
        self.attention = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=1, batch_first=True)

        # Layer normalization
        self.layer_norm = nn.LayerNorm(embedding_dim)

        # MLP to predict x_i's from embeddings
        dims = [embedding_dim] + hidden_dims + [1]
        layers = []
        for i in range(len(dims) - 1):
            layers.append(nn.Linear(dims[i], dims[i + 1]))
            if i < len(dims) - 2:
                layers.append(nn.ReLU())
        self.mlp = nn.Sequential(*layers)

    def forward(self, hat_Z):
        """
        hat_Z: Tensor of shape (batch_size, output_dim, embedding_dim)
        """
        batch_size = hat_Z.size(0)

        # Prepare query embeddings and expand to batch size
        query_embeddings = self.query_embeddings.unsqueeze(0).expand(batch_size, -1, -1)  # Shape: (batch_size, input_dim, embedding_dim)

        # Apply scaled dot-product attention
        attn_output, attn_weights = self.attention(query_embeddings, hat_Z, hat_Z)        # Output shape: (batch_size, input_dim, embedding_dim)

        # Add residual connection and apply layer normalization
        out = self.layer_norm(attn_output + query_embeddings)                             # Shape: (batch_size, input_dim, embedding_dim)

        # Flatten the embeddings and pass through MLP to predict x_i's
        out_flat = out.reshape(-1, self.embedding_dim)                                    # Shape: (batch_size * input_dim, embedding_dim)
        x_hat_flat = self.mlp(out_flat)                                                   # Shape: (batch_size * input_dim, 1)
        x_hat = x_hat_flat.view(batch_size, self.input_dim)                               # Shape: (batch_size, input_dim)

        return x_hat

In [12]:
# Complete model combining the encoder and decoder
class Model(nn.Module):
    def __init__(self, input_dim, output_dim, embedding_dim, encoder_hidden_dims=[], decoder_hidden_dims=[]):
        super(Model, self).__init__()
        self.encoder = Encoder(input_dim=input_dim, output_dim=output_dim, embedding_dim=embedding_dim, hidden_dims=encoder_hidden_dims)
        self.decoder = Decoder(input_dim=input_dim, embedding_dim=embedding_dim, hidden_dims=decoder_hidden_dims)

    def forward(self, x):
        hat_Z = self.encoder(x)     # Obtain \hat Z from the encoder
        x_hat = self.decoder(hat_Z) # Reconstruct x from \hat Z using the decoder
        return x_hat

In [13]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Assume that the Encoder, Decoder, and Model classes are already defined

# Define dimensions
input_dim = 50        # Number of observed variables (same as the number of x_i's in the dataset)
output_dim = 5        # Output dimension of the encoder (dimension of Z)
embedding_dim = 64    # Embedding dimension for the embeddings e and e_i's
encoder_hidden_dims = [128, 64]  # Hidden dimensions for the encoder
decoder_hidden_dims = [64, 32]   # Hidden dimensions for the decoder

# Instantiate the model
model = Model(
    input_dim=input_dim,
    output_dim=output_dim,
    embedding_dim=embedding_dim,
    encoder_hidden_dims=encoder_hidden_dims,
    decoder_hidden_dims=decoder_hidden_dims
)

# Move the model to the appropriate device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error Loss for reconstruction
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training parameters
num_epochs = 1024           # Number of epochs
batch_size = 32           # Batch size (already set in the DataLoader)
print_every = 1           # How often to print loss (in epochs)

# Load the dataset from the DataLoader (assume it is already defined)
# Example DataLoader code
# df = pd.read_csv(url)
# data_array = df.to_numpy()
# data_tensor = torch.tensor(data_array, dtype=torch.float32)
# dataset = TensorDataset(data_tensor)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch_idx, (batch,) in enumerate(dataloader):
        batch = batch.to(device)  # Move batch to device

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass: Compute predicted x_hat by passing input x through the model
        x_hat = model(batch)

        # Compute the loss
        loss = criterion(x_hat, batch)

        # Backward pass: Compute the gradients
        loss.backward()

        # Optimize the parameters
        optimizer.step()

        # Accumulate loss for reporting
        running_loss += loss.item()

    # Print average loss for the epoch
    if (epoch + 1) % print_every == 0:
        avg_loss = running_loss / len(dataloader)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}')

# Example save the trained model after training
torch.save(model.state_dict(), "trained_model.pth")
print("Training complete and model saved.")

Epoch [1/1024], Loss: 0.1158
Epoch [2/1024], Loss: 0.0828
Epoch [3/1024], Loss: 0.0746
Epoch [4/1024], Loss: 0.0721
Epoch [5/1024], Loss: 0.0702
Epoch [6/1024], Loss: 0.0678
Epoch [7/1024], Loss: 0.0656
Epoch [8/1024], Loss: 0.0646
Epoch [9/1024], Loss: 0.0639
Epoch [10/1024], Loss: 0.0636
Epoch [11/1024], Loss: 0.0633
Epoch [12/1024], Loss: 0.0629
Epoch [13/1024], Loss: 0.0626
Epoch [14/1024], Loss: 0.0623
Epoch [15/1024], Loss: 0.0620
Epoch [16/1024], Loss: 0.0619
Epoch [17/1024], Loss: 0.0617
Epoch [18/1024], Loss: 0.0616
Epoch [19/1024], Loss: 0.0615
Epoch [20/1024], Loss: 0.0616
Epoch [21/1024], Loss: 0.0615
Epoch [22/1024], Loss: 0.0612
Epoch [23/1024], Loss: 0.0611
Epoch [24/1024], Loss: 0.0609
Epoch [25/1024], Loss: 0.0608
Epoch [26/1024], Loss: 0.0605
Epoch [27/1024], Loss: 0.0604
Epoch [28/1024], Loss: 0.0602
Epoch [29/1024], Loss: 0.0601
Epoch [30/1024], Loss: 0.0601
Epoch [31/1024], Loss: 0.0600
Epoch [32/1024], Loss: 0.0606
Epoch [33/1024], Loss: 0.0603
Epoch [34/1024], Lo