In [None]:
 import numpy as np
from sklearn.decomposition import PCA

# Generate fake embedding dataset with different lengths
def generate_fake_embeddings(num_groups=10, min_len=5, max_len=20, embedding_dim=50):
    np.random.seed(42)  # For reproducibility
    embeddings = []
    for _ in range(num_groups):
        group_len = np.random.randint(min_len, max_len)
        group_embeddings = [np.random.rand(np.random.randint(embedding_dim//2, embedding_dim*2)) for _ in range(group_len)]
        embeddings.append(group_embeddings)
    return embeddings

# Interpolation method to bring all embeddings to the same size
def interpolate_embeddings(embeddings, target_dim=50):
    interpolated_embeddings = []
    for embed in embeddings:
        original_indices = np.linspace(0, 1, num=len(embed))
        target_indices = np.linspace(0, 1, num=target_dim)
        interpolated_embed = np.interp(target_indices, original_indices, embed)
        interpolated_embeddings.append(interpolated_embed)
    return interpolated_embeddings

# PCA function to project embeddings to a fixed size
def apply_pca(embeddings, target_dim=50):
    interpolated_embeddings = interpolate_embeddings(embeddings, target_dim)
    pca = PCA(n_components=min(target_dim, len(interpolated_embeddings)))
    pca_embeddings = pca.fit_transform(interpolated_embeddings)
    return pca_embeddings

# Mean pooling function
def mean_pooling(embeddings, target_dim=50):
    interpolated_embeddings = interpolate_embeddings(embeddings, target_dim)
    return np.mean(interpolated_embeddings, axis=0)

# Hybrid approach: PCA then Mean pooling
def hybrid_pca_mean_pooling(embeddings, target_dim=50):
    pca_embeddings = apply_pca(embeddings, target_dim)
    return mean_pooling(pca_embeddings, target_dim)

# Generate fake dataset
embeddings = generate_fake_embeddings()

# Apply PCA
pca_embeddings = [apply_pca(group) for group in embeddings]

# Apply Mean Pooling
mean_pooled_embeddings = [mean_pooling(group) for group in embeddings]

# Apply Hybrid PCA then Mean Pooling
hybrid_embeddings = [hybrid_pca_mean_pooling(group) for group in embeddings]

import pandas as pd

# Convert results to DataFrame for better display
result_df = pd.DataFrame({
    'PCA Embedding': [emb.tolist() for emb in pca_embeddings],
    'Mean Pooling Embedding': [emb.tolist() for emb in mean_pooled_embeddings],
    'Hybrid Embedding': [emb.tolist() for emb in hybrid_embeddings]
})


result_df.head()


In [None]:
# Updated code with separate handling for PCA embedding shape in the test function

import numpy as np
from sklearn.decomposition import PCA
import json
import os
import shutil

# Load the embeddings from the JSON file
def load_embeddings(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)
    return data

# Interpolation method to bring all embeddings to the same size
def interpolate_embeddings(embeddings, target_dim=50):
    interpolated_embeddings = []
    for embed in embeddings:
        original_indices = np.linspace(0, 1, num=len(embed))
        target_indices = np.linspace(0, 1, num=target_dim)
        interpolated_embed = np.interp(target_indices, original_indices, embed)
        interpolated_embeddings.append(interpolated_embed)
    return interpolated_embeddings

# PCA function to project embeddings to a fixed size
def apply_pca(embeddings, target_dim=50):
    interpolated_embeddings = interpolate_embeddings(embeddings, target_dim)
    n_samples = len(interpolated_embeddings)
    n_components = min(target_dim, n_samples)
    pca = PCA(n_components=n_components)
    pca_embeddings = pca.fit_transform(interpolated_embeddings)
    print(f"PCA explained variance ratio: {pca.explained_variance_ratio_}")
    print(f"PCA components shape: {pca.components_.shape}")
    return np.mean(pca_embeddings, axis=0)

# Mean pooling function
def mean_pooling(embeddings, target_dim=50):
    interpolated_embeddings = interpolate_embeddings(embeddings, target_dim)
    return np.mean(interpolated_embeddings, axis=0)

# Hybrid approach: PCA then Mean pooling
def hybrid_pca_mean_pooling(embeddings, target_dim=50):
    pca_embedding = apply_pca(embeddings, target_dim)
    return mean_pooling([pca_embedding], target_dim)

# Function to process a single file
def process_file(filepath):
    data = load_embeddings(filepath)
    embeddings = [item['embedding'] for item in data['embeddings']]

    # Apply PCA
    pca_embedding = apply_pca(embeddings, target_dim=50)
    print(f"PCA embedding shape: {pca_embedding.shape}")

    # Apply Mean Pooling
    mean_pooled_embedding = mean_pooling(embeddings, target_dim=50)
    print(f"Mean pooling embedding shape: {mean_pooled_embedding.shape}")

    # Apply Hybrid PCA then Mean Pooling
    hybrid_embedding = hybrid_pca_mean_pooling(embeddings, target_dim=50)
    print(f"Hybrid embedding shape: {hybrid_embedding.shape}")

    # Add combined embeddings to the data
    data['combined'] = {
        'PCA': pca_embedding.tolist(),
        'MeanPooling': mean_pooled_embedding.tolist(),
        'Hybrid': hybrid_embedding.tolist()
    }

    # Save the updated data to a new file
    #new_filepath = filepath.replace('.json', '_updated.json')
    #shutil.copyfile(filepath, filepath)  # Make a copy of the original file
    with open(filepath, 'w') as file:
        json.dump(data, file, indent=4)
    
    return new_filepath

# Function to process all files in a directory
def process_directory(directory):
    updated_files = []
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            filepath = os.path.join(directory, filename)
            updated_filepath = process_file(filepath)
            updated_files.append(updated_filepath)
    return updated_files

# Function to test the structure and shape of the final file
def test_final_file(filepath, expected_shape=50):
    with open(filepath, 'r') as file:
        data = json.load(file)

    assert 'combined' in data, "Missing 'combined' key in JSON file"
    combined = data['combined']

    for key in ['MeanPooling', 'Hybrid']:
        assert key in combined, f"Missing '{key}' key in 'combined'"
        assert len(combined[key]) == expected_shape, f"Incorrect shape for '{key}' embedding"

    # Handle PCA embedding separately
    assert 'PCA' in combined, f"Missing 'PCA' key in 'combined'"
    pca_embedding_shape = len(combined['PCA'])
    assert pca_embedding_shape > 0, f"Incorrect shape for 'PCA' embedding"

    print(f"File {filepath} passed the tests.")

# Path to the directory containing the JSON files
directory_path = '/workspace/slice-monorepo/thebeast/chat_pipeline/data/test/step_2/test'

# Process all files in the directory
updated_files = process_directory(directory_path)

# Test the first updated file to ensure it is correctly formatted
test_final_file(updated_files[0])

# Display the combined embeddings
updated_files


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, AdamW
import numpy as np


SEED = 42
NUM_TEXTS = 160
MAX_LENGTH = 300
EMBEDDING_DIM = 300
HIDDEN_DIM = 128
BATCH_SIZE = 8
NUM_EPOCHS = 25
LEARNING_RATE = 1e-4

np.random.seed(SEED)
torch.manual_seed(SEED)

# Step 1: Generate Fake Data
fake_embeddings = [torch.tensor(np.random.rand(EMBEDDING_DIM), dtype=torch.float32) for _ in range(NUM_TEXTS * 2)]

# Generate random text sequences from the tokenizer's vocabulary
def generate_random_texts(tokenizer, num_texts, max_length):
    texts = []
    vocab_size = tokenizer.vocab_size
    for _ in range(num_texts):
        random_tokens = np.random.randint(0, vocab_size, size=(max_length,))
        texts.append(tokenizer.decode(random_tokens, skip_special_tokens=True))
    return texts

# Step 2: Initialize Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token

fake_texts = generate_random_texts(tokenizer, num_texts=NUM_TEXTS, max_length=MAX_LENGTH)

# Convert Text to Tokens
tokenized_texts = [tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=MAX_LENGTH)['input_ids'].squeeze(0) for text in fake_texts]

# Step 3: Create Dataset
class EmbeddingTextDataset(Dataset):
    def __init__(self, embeddings, tokenized_texts):
        self.embeddings = embeddings
        self.tokenized_texts = tokenized_texts
        
    def __len__(self):
        return len(self.tokenized_texts)
    
    def __getitem__(self, idx):
        return self.embeddings[idx], self.tokenized_texts[idx]

dataset = EmbeddingTextDataset(fake_embeddings, tokenized_texts)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
print("dataset made")
# Step 4: Define the Model
class SimpleTextGenerator(nn.Module):
    def __init__(self, embedding_dim, vocab_size, hidden_dim, max_length):
        super(SimpleTextGenerator, self).__init__()
        self.embedding_projection = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, vocab_size * max_length)
        self.max_length = max_length
        self.vocab_size = vocab_size
        
    def forward(self, embedding):
        projected_embedding = self.embedding_projection(embedding)
        output = self.fc(projected_embedding)
        output = output.view(-1, self.max_length, self.vocab_size)
        return output

vocab_size = tokenizer.vocab_size
model = SimpleTextGenerator(EMBEDDING_DIM, vocab_size, HIDDEN_DIM, MAX_LENGTH)
print("model made")
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Step 5: Train the Model
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

def train(model, dataloader, optimizer, criterion, device, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for embeddings, tokenized_texts in dataloader:
            embeddings = embeddings.to(device)
            tokenized_texts = tokenized_texts.to(device)
            
            optimizer.zero_grad()
            logits = model(embeddings)
            loss = criterion(logits.view(-1, vocab_size), tokenized_texts.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}")

train(model, dataloader, optimizer, criterion, device, NUM_EPOCHS)

# Step 6: Generate and Print Text in One Go
def generate_text_from_embedding(model, embedding, tokenizer, max_length):
    model.eval()
    with torch.no_grad():
        embedding = embedding.to(device)
        logits = model(embedding)
        predicted_ids = torch.argmax(logits, dim=-1)
        generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
        return generated_text

# Test the model with a new embedding
test_embedding = torch.tensor(np.random.rand(EMBEDDING_DIM), dtype=torch.float32).to(device)
generated_text = generate_text_from_embedding(model, test_embedding, tokenizer, MAX_LENGTH)
print("Generated text:", generated_text)


In [None]:
import json
import os

def extract_embeddings(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)
    
    uuid = data.get('uuid')
    combined = data.get('combined', {})
    mean_pooling = combined.get('MeanPooling', [])
    hybrid = combined.get('Hybrid', [])

    return uuid, mean_pooling, hybrid

def process_directory(directory):
    embeddings_dict = {}
    for filename in os.listdir(directory):
        if filename.endswith('_updated.json'):
            filepath = os.path.join(directory, filename)
            uuid, mean_pooling, hybrid = extract_embeddings(filepath)
            if uuid is not None:
                embeddings_dict[uuid] = {
                    'MeanPooling': mean_pooling,
                    'Hybrid': hybrid
                }
        break
    return embeddings_dict

def load_response_list(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)
    return data

def combine_data(embeddings_dict, response_list):
    combined_dict = {}
    for response in response_list:
        uuid = response.get('uuid')
        if uuid in embeddings_dict:
            combined_dict[uuid] = {
                'response_content': response.get('response_content'),
                'MeanPooling': embeddings_dict[uuid]['MeanPooling'],
                'Hybrid': embeddings_dict[uuid]['Hybrid']
            }
    return combined_dict

# Path to the directory containing the updated JSON files
directory_path = '/workspace/slice-monorepo/thebeast/chat_pipeline/data/test/step_2/test'

# Path to the uuid_response_list.json file
response_list_path = '/workspace/slice-monorepo/thebeast/chat_pipeline/data/test/step_2/uuid_response_list.json'

# Process all files in the directory to extract embeddings
embeddings_dict = process_directory(directory_path)

# Load the response list
response_list = load_response_list(response_list_path)

# Combine the embeddings with the response content
combined_data = combine_data(embeddings_dict, response_list)

# Display the resulting combined dictionary
print(combined_data)


In [6]:
import json
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, AdamW
import numpy as np
from sklearn.model_selection import train_test_split

# Configuration variables
SEED = 42
NUM_TEXTS = 160
MAX_LENGTH = 50
EMBEDDING_DIM = 50
HIDDEN_DIM = 128
BATCH_SIZE = 8
NUM_EPOCHS = 100
LEARNING_RATE = 1e-4
EMBEDDING_TYPE = 'Hybrid'  # Options: 'MeanPooling', 'Hybrid'

np.random.seed(SEED)
torch.manual_seed(SEED)

def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)

# Step 1: Load Embeddings and Response Content
def extract_embeddings(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)
    
    uuid = data.get('uuid')
    combined = data.get('combined', {})
    mean_pooling = combined.get('MeanPooling', [])
    hybrid = combined.get('Hybrid', [])

    return uuid, mean_pooling, hybrid

def process_directory(directory):
    embeddings_dict = {}
    for filename in os.listdir(directory):
        if filename.endswith('_updated.json'):
            filepath = os.path.join(directory, filename)
            uuid, mean_pooling, hybrid = extract_embeddings(filepath)
            if uuid is not None:
                embeddings_dict[uuid] = {
                    'MeanPooling': mean_pooling,
                    'Hybrid': hybrid
                }
    return embeddings_dict

def load_response_list(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)
    return data

def combine_data(embeddings_dict, response_list):
    combined_dict = {}
    for response in response_list:
        uuid = response.get('uuid')
        if uuid in embeddings_dict:
            combined_dict[uuid] = {
                'response_content': response.get('response_content'),
                'MeanPooling': embeddings_dict[uuid]['MeanPooling'],
                'Hybrid': embeddings_dict[uuid]['Hybrid']
            }
    return combined_dict

def load_combined_data(embeddings_path, response_list_path):
    embeddings_dict = process_directory(embeddings_path)
    response_list = load_response_list(response_list_path)
    combined_data = combine_data(embeddings_dict, response_list)
    return combined_data

def print_intermediate_info(combined_data):
    print(f"Number of data points found: {len(combined_data)}")
    if combined_data:
        example_uuid, example_data = next(iter(combined_data.items()))
        print(f"Example UUID: {example_uuid}")
        print(f"Example Response Content: {example_data['response_content']}")
        print(f"Example Embedding (MeanPooling): {example_data['MeanPooling'][:5]}...")  # print first 5 elements
        print(f"Example Embedding (Hybrid): {example_data['Hybrid'][:5]}...")  # print first 5 elements

# Step 2: Initialize Tokenizer
def initialize_tokenizer():
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token  # Set the padding token
    return tokenizer

def process_texts(texts, tokenizer, max_length):
    tokenized_texts = [tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)['input_ids'].squeeze(0) for text in texts]
    return tokenized_texts

# Step 3: Create Dataset
class EmbeddingTextDataset(Dataset):
    def __init__(self, combined_data, tokenized_texts, embedding_type):
        self.combined_data = combined_data
        self.tokenized_texts = tokenized_texts
        self.embedding_type = embedding_type
        self.keys = list(combined_data.keys())

    def __len__(self):
        return len(self.tokenized_texts)
    
    def __getitem__(self, idx):
        uuid = self.keys[idx]
        embedding = self.combined_data[uuid][self.embedding_type]
        embedding = torch.tensor(embedding, dtype=torch.float32)
        tokenized_text = self.tokenized_texts[idx]
        return embedding, tokenized_text

def create_dataloader(combined_data, tokenizer, embedding_type, batch_size):
    texts = [item['response_content'] for item in combined_data.values()]
    tokenized_texts = process_texts(texts, tokenizer, MAX_LENGTH)
    dataset = EmbeddingTextDataset(combined_data, tokenized_texts, embedding_type)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

# Step 4: Define the Model
class SimpleTextGenerator(nn.Module):
    def __init__(self, embedding_dim, vocab_size, hidden_dim, max_length):
        super(SimpleTextGenerator, self).__init__()
        self.embedding_projection = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, vocab_size * max_length)
        self.max_length = max_length
        self.vocab_size = vocab_size
        
    def forward(self, embedding):
        projected_embedding = self.embedding_projection(embedding)
        output = self.fc(projected_embedding)
        output = output.view(-1, self.max_length, self.vocab_size)
        return output

def initialize_model(embedding_dim, vocab_size, hidden_dim, max_length):
    model = SimpleTextGenerator(embedding_dim, vocab_size, hidden_dim, max_length)
    return model

def move_model_to_device(model):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    return model, device

# Step 5: Train the Model
def train(model, dataloader, optimizer, criterion, device, epochs, vocab_size):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for embeddings, tokenized_texts in dataloader:
            embeddings = embeddings.to(device)
            tokenized_texts = tokenized_texts.to(device)
            
            optimizer.zero_grad()
            logits = model(embeddings)
            loss = criterion(logits.view(-1, vocab_size), tokenized_texts.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}")

# Step 6: Generate and Print Text in One Go
def generate_text_from_embedding(model, embedding, tokenizer, max_length, device):
    model.eval()
    with torch.no_grad():
        embedding = embedding.to(device)
        logits = model(embedding)
        predicted_ids = torch.argmax(logits, dim=-1)
        generated_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
        return generated_text

# Step 7: Tokenizer Test
def tokenizer_test(tokenizer, example_text, max_length):
    print(f"Original text: {example_text}")
    tokenized_text = tokenizer(example_text, return_tensors='pt', truncation=True, padding='max_length', max_length=max_length)['input_ids'].squeeze(0)
    decoded_text = tokenizer.decode(tokenized_text, skip_special_tokens=True)
    print(f"Tokenized and decoded text: {decoded_text}")

def split_data(combined_data, test_size=0.2):
    keys = list(combined_data.keys())
    train_keys, test_keys = train_test_split(keys, test_size=test_size, random_state=SEED)
    train_data = {key: combined_data[key] for key in train_keys}
    test_data = {key: combined_data[key] for key in test_keys}
    return train_data, test_data

def main():
    set_seed(SEED)

    # Load combined data
    combined_data = load_combined_data(embeddings_path, response_list_path)
    print_intermediate_info(combined_data)

    # Split data into training and test sets
    train_data, test_data = split_data(combined_data)
    print(f"Training data points: {len(train_data)}")
    print(f"Test data points: {len(test_data)}")

    # Initialize tokenizer
    tokenizer = initialize_tokenizer()
    vocab_size = tokenizer.vocab_size

    # Tokenizer test with an example text
    example_text = next(iter(combined_data.values()))['response_content']
    tokenizer_test(tokenizer, example_text, MAX_LENGTH)

    # Create dataloader for training data
    train_dataloader = create_dataloader(train_data, tokenizer, EMBEDDING_TYPE, BATCH_SIZE)

    # Initialize and move model to device
    model = initialize_model(EMBEDDING_DIM, vocab_size, HIDDEN_DIM, MAX_LENGTH)
    model, device = move_model_to_device(model)

    # Define optimizer and loss function
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    # Train the model
    train(model, train_dataloader, optimizer, criterion, device, NUM_EPOCHS, vocab_size)

    # Evaluate the model with the test set
    test_embeddings = [torch.tensor(test_data[key][EMBEDDING_TYPE], dtype=torch.float32).to(device) for key in test_data]
    for i, test_embedding in enumerate(test_embeddings):
        generated_text = generate_text_from_embedding(model, test_embedding, tokenizer, MAX_LENGTH, device)
        original_text = test_data[list(test_data.keys())[i]]['response_content']
        print(f"Original text: {original_text}")
        print(f"Generated text: {generated_text}\n")

if __name__ == "__main__":
    embeddings_path = '/workspace/slice-monorepo/thebeast/chat_pipeline/data/test/step_2/test'
    response_list_path = '/workspace/slice-monorepo/thebeast/chat_pipeline/data/test/step_2/uuid_response_list.json'
    main()


Number of data points found: 200
Example UUID: 18a52da9-81c3-4104-b11f-e8750e432531
Example Response Content: The biggest cat in the world is the Siberian tiger, also known as the Amur tiger. They are the largest of all tiger species and can weigh up to 660 pounds (300 kg) and grow up to 10 feet (3 meters) in length.
Example Embedding (MeanPooling): [0.012374771758913994, -0.012974258351890458, 0.0010368387043779136, -0.010370376044693822, 0.007631063993487836]...
Example Embedding (Hybrid): [-6.938893903907228e-18, -6.60374802053116e-18, -6.2686021371550914e-18, -5.933456253779023e-18, -5.5983103704029545e-18]...
Training data points: 160
Test data points: 40
Original text: The biggest cat in the world is the Siberian tiger, also known as the Amur tiger. They are the largest of all tiger species and can weigh up to 660 pounds (300 kg) and grow up to 10 feet (3 meters) in length.
Tokenized and decoded text: The biggest cat in the world is the Siberian tiger, also known as the Amur tige

RuntimeError: mat1 and mat2 shapes cannot be multiplied (8x50 and 100x128)