In [1]:
# Necessary imports
import numpy as np

def histogram_discretize(target, num_bins=20):
    """
    Bin every row of a 2-D array independently using an equal-width histogram.

    For each row, `np.histogram` supplies `num_bins + 1` bin edges spanning the
    row's own value range. The rightmost edge is dropped before calling
    `np.digitize`, so the row maximum is assigned to the last bin and the
    resulting labels run from 1 to `num_bins`.

    Args:
    - target (np.ndarray): 2-D array; each row is discretized on its own.
    - num_bins (int): Number of histogram bins per row.

    Returns:
    - np.ndarray: Array with the same shape and dtype as `target` holding the
      bin label of every entry.
    """
    # zeros_like keeps the dtype of `target`, so labels are stored in that dtype
    binned = np.zeros_like(target)
    num_rows = target.shape[0]
    for row_idx in range(num_rows):
        row_values = target[row_idx, :]
        # np.histogram returns (counts, edges); only the edges are needed here
        bin_edges = np.histogram(row_values, num_bins)[1]
        # Drop the rightmost edge so the largest value stays inside the last bin
        binned[row_idx, :] = np.digitize(row_values, bin_edges[:-1])

    return binned

# # Test the function with a sample input
# test_array = np.random.rand(5, 100)  # Random array for testing purposes
# discretized_array = histogram_discretize(test_array, num_bins=10)

# discretized_array.shape, discretized_array  # Display the shape and content of the discretized array


In [2]:
import numpy as np
from sklearn.metrics import mutual_info_score

def discrete_mutual_info(mus, ys):
    """
    Build the pairwise mutual-information matrix between codes and factors.

    Args:
    - mus: 2-D array of discrete codes, one code per row.
    - ys: 2-D array of discrete factors, one factor per row.

    Returns:
    - np.ndarray of shape (mus.shape[0], ys.shape[0]); entry [i, j] is
      `mutual_info_score(ys[j, :], mus[i, :])`.
    """
    n_codes, n_factors = mus.shape[0], ys.shape[0]
    mi_matrix = np.zeros([n_codes, n_factors])
    # np.ndindex walks (code, factor) pairs in the same row-major order as two nested loops
    for code_idx, factor_idx in np.ndindex(n_codes, n_factors):
        mi_matrix[code_idx, factor_idx] = mutual_info_score(
            ys[factor_idx, :], mus[code_idx, :]
        )
    return mi_matrix

In [3]:
import numpy as np
from sklearn.metrics import mutual_info_score

def discrete_entropy(ys):
    """
    Compute the discrete entropy of every factor.

    The entropy of a row is obtained as the mutual information of that row
    with itself.

    Args:
    - ys: 2-D array of discrete factors, one factor per row.

    Returns:
    - np.ndarray of shape (ys.shape[0],) with one entropy value per row.
    """
    n_factors = ys.shape[0]
    per_factor = [
        mutual_info_score(ys[idx, :], ys[idx, :]) for idx in range(n_factors)
    ]
    # dtype=float keeps a float64 result even when there are no factors
    return np.array(per_factor, dtype=float)


In [4]:
def compute_mig(mus_train, ys_train, num_bins=20):
    """
    Compute the Mutual Information Gap (MIG) score.

    The latents are histogram-discretized here; the factors are used as given.
    For every factor, the gap between the largest and second-largest mutual
    information over all latents is divided by that factor's entropy, and the
    normalized gaps are averaged over factors.

    Args:
    - mus_train: Latent representations, numpy array of shape (num_latents, num_samples).
    - ys_train: True factors, numpy array of shape (num_factors, num_samples).
    - num_bins: Number of bins used to discretize the latents (default is 20).

    Returns:
    - The mean normalized gap over all factors.
    """
    # Only the latents are binned inside this function
    binned_latents = histogram_discretize(mus_train, num_bins=num_bins)

    # Rows index latents, columns index factors
    mi_by_latent_factor = discrete_mutual_info(binned_latents, ys_train)

    factor_entropy = discrete_entropy(ys_train)

    # Ascending sort down each column, then flip the rows -> descending per factor
    mi_descending = np.sort(mi_by_latent_factor, axis=0)[::-1]
    best_mi = mi_descending[0, :]
    second_best_mi = mi_descending[1, :]

    normalized_gaps = (best_mi - second_best_mi) / factor_entropy
    return np.mean(normalized_gaps)

In [5]:
# Sanity check of compute_mig on seeded random Gaussian data.
np.random.seed(0)  # Fix the global NumPy seed so the sampled arrays are reproducible
z_dim, y_dim, num_samples = 5, 5, 100  # Number of dimensions and samples
z = np.random.randn(z_dim, num_samples)  # Latent representations
y = np.random.randn(y_dim, num_samples)  # True factors
z_discretized = histogram_discretize(z, num_bins=20)
y_discretized = histogram_discretize(y, num_bins=20)

# Compute MIG score
# NOTE(review): both arguments are `z_discretized`, so `y` / `y_discretized` are
# never used in this cell and the score compares the latents with themselves.
# compute_mig also calls histogram_discretize on its first argument, so the
# latents are binned a second time here. If the intent was to score the latents
# against the factors, the call would presumably be
# `compute_mig(z, y_discretized)` -- confirm before changing, since the printed
# score depends on the current call.
mig_score = compute_mig(z_discretized, z_discretized)

# Print the MIG score
print("MIG Score:", mig_score)

MIG Score: 0.5799612610419946


In [6]:
import pandas as pd

# Load the dataset from the GitHub repository
url = 'https://raw.githubusercontent.com/gregversteeg/LinearCorex/master/tests/data/test_big5.csv'
df = pd.read_csv(url)

# Display basic information about the dataset
print("Number of instances in the dataset:", df.shape[0])
print("Number of columns in the dataset:", df.shape[1])
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Display additional information
print("\nData Types and Non-Null Counts:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

print("\nSummary Statistics of the Dataset:")
print(df.describe())

Number of instances in the dataset: 2000
Number of columns in the dataset: 50

First 5 rows of the dataset:
   blue_q0  red_q1  green_q2  purple_q3  q4  blue_q5  red_q6  green_q7  \
0        2       0         3          1   4        1       4         1   
1        2       0         1          2   2        1       4         3   
2        3       0         2          1   3        1       4         3   
3        2       0         1          1   1        0       4         1   
4        2       0         1          1   3        0       4         3   

   purple_q8  q9  ...  blue_q40  red_q41  green_q42  purple_q43  q44  \
0          2   2  ...         3        3          3           2    3   
1          3   1  ...         2        3          2           2    3   
2          3   0  ...         4        4          2           1    4   
3          3   1  ...         1        2          2           1    3   
4          2   0  ...         3        4          1           3    4   

   blue_q45  r

In [7]:
# Boolean mask marking every cell of the dataset that equals -1
is_negative_one = df.eq(-1)

# Names of the columns in which -1 occurs at least once
columns_with_negative_ones = df.columns[is_negative_one.any()].tolist()

# Number of -1 entries in each of those columns (summing the mask counts the True cells)
negative_one_counts = is_negative_one[columns_with_negative_ones].sum()

# Display the results
print("Columns containing -1 values and their counts:")
print(negative_one_counts)


Columns containing -1 values and their counts:
green_q12     1
q14           1
red_q16       1
green_q17     1
purple_q18    1
q19           1
blue_q20      1
red_q21       2
green_q22     1
purple_q23    1
q24           1
blue_q25      2
red_q26       1
green_q27     1
purple_q28    1
q29           1
blue_q30      1
red_q31       1
green_q32     1
purple_q33    2
blue_q35      2
red_q36       2
green_q37     2
purple_q38    2
q39           2
blue_q40      2
red_q41       3
green_q42     3
purple_q43    3
q44           3
blue_q45      1
green_q47     1
q49           1
dtype: int64


In [8]:
import pandas as pd

# Load the dataset from the GitHub repository
url = 'https://raw.githubusercontent.com/gregversteeg/LinearCorex/master/tests/data/test_big5.csv'
df = pd.read_csv(url)

# Column-name prefix that selects the columns belonging to each true factor
factor_prefixes = {
    'Factor1': 'blue',
    'Factor2': 'green',
    'Factor3': 'purple',
    'Factor4': 'red',
    'Factor5': 'q',
}

# Map every factor to the list of dataset columns whose name starts with its prefix
factor_columns = {
    factor_name: [col for col in df.columns if col.startswith(prefix)]
    for factor_name, prefix in factor_prefixes.items()
}

# Each true factor is the row-wise sum of the columns assigned to it
true_factors = pd.DataFrame({
    factor_name: df[columns].sum(axis=1)
    for factor_name, columns in factor_columns.items()
})

# Display the first few rows of the calculated true factors
print(true_factors.head())


   Factor1  Factor2  Factor3  Factor4  Factor5
0       20       21       21       22       28
1       21       20       21       26       23
2       23       20       17       22       25
3       17       15       11       22       15
4       20       14       24       23       24


In [9]:
 import numpy as np

# Convert the DataFrame to a NumPy array and transpose it so that each row is one factor
true_factors_array = true_factors.to_numpy().T  # Shape will be (num_factors, num_samples)

# Calculate the MIG score between the true factors and itself.
# compute_mig histogram-discretizes only its first argument, so the two sides of
# the comparison are not identical arrays even though the same input is passed twice.
mig_score = compute_mig(true_factors_array, true_factors_array)

# Print the MIG score
print("MIG Score between true factors and itself:", mig_score)

MIG Score between true factors and itself: 0.7441793239604465


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define the Autoencoder architecture
class Autoencoder(nn.Module):
    """
    Fully connected autoencoder.

    The encoder maps `input_dim` features through hidden layers of width 32 and
    16 down to `latent_dim` values; the decoder mirrors that path back up to
    `input_dim` and finishes with a Sigmoid, so every reconstructed value lies
    between 0 and 1.

    Args:
    - input_dim (int): Number of input (and reconstructed) features.
    - latent_dim (int): Size of the latent representation.
    """

    def __init__(self, input_dim=50, latent_dim=5):
        super().__init__()

        # Encoder layers are created before decoder layers so that parameter
        # initialization happens in the same order as the layers are listed.
        encoder_layers = [
            nn.Linear(input_dim, 32),   # input -> first hidden layer
            nn.ReLU(),
            nn.Linear(32, 16),          # first -> second hidden layer
            nn.ReLU(),
            nn.Linear(16, latent_dim),  # second hidden layer -> latent code (no activation)
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(latent_dim, 16),  # latent code -> first hidden layer
            nn.ReLU(),
            nn.Linear(16, 32),          # first -> second hidden layer
            nn.ReLU(),
            nn.Linear(32, input_dim),   # second hidden layer -> output
            nn.Sigmoid(),               # squash reconstructed values into the range 0..1
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode `x` to the latent space and return its reconstruction."""
        return self.decoder(self.encoder(x))

# Create an instance of the Autoencoder model with the constructor defaults
# (input_dim=50, latent_dim=5)
autoencoder = Autoencoder()

# Display the architecture (printing the module lists its encoder and decoder layers)
print(autoencoder)


Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=50, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=5, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=5, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=50, bias=True)
    (5): Sigmoid()
  )
)


In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Convert the DataFrame to a NumPy array
# NOTE(review): the values are passed on exactly as loaded, with no scaling. The
# dataset preview shows entries such as 4, and some columns contain -1, while the
# Autoencoder's decoder ends in a Sigmoid whose outputs lie between 0 and 1. If
# this tensor is meant to be the reconstruction target, the inputs presumably
# need rescaling (and the -1 entries handling) first -- confirm.
data_array = df.to_numpy()

# Convert the data to a PyTorch tensor (float32)
data_tensor = torch.tensor(data_array, dtype=torch.float32)

# Create a PyTorch dataset; it wraps a single tensor, so each item is a 1-tuple `(row,)`
dataset = TensorDataset(data_tensor)

# Create a DataLoader for the dataset; shuffle=True reorders the samples on every pass
batch_size = 64  # You can adjust the batch size as needed
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Display the shape of the tensor to verify
print(f"Data tensor shape: {data_tensor.shape}")
print(f"Number of batches: {len(dataloader)}")
