In [1]:
!pip install -q scikit-learn transformers matplotlib h5py

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [3]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
import numpy as np
import h5py

class BertAutoencoder(nn.Module):
    """Autoencoder that passes a fixed-size vector through BERT and an LSTM.

    The input vector is linearly expanded into ``sequence_length``
    pseudo-token embeddings, fed to a pretrained BERT through
    ``inputs_embeds``, decoded by an LSTM, flattened, and projected back to
    ``embedding_dim``.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        embedding_dim: Size of the input and of the reconstructed output.
        hidden_dim: Width of each pseudo-token given to BERT
            (presumably has to equal the BERT hidden size -- TODO confirm).
        lstm_units: Hidden size of the LSTM decoder.
        sequence_length: Number of pseudo-tokens the input is expanded into.
    """

    def __init__(self, bert_model_name, embedding_dim=1536, hidden_dim=768, lstm_units=256, sequence_length=4):
        super().__init__()
        self.sequence_length = sequence_length
        adapter_width = hidden_dim * sequence_length
        self.embedding_adapter = nn.Linear(embedding_dim, adapter_width)
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.decoder = nn.LSTM(hidden_dim, lstm_units, batch_first=True)
        flattened_width = lstm_units * sequence_length
        self.output_layer = nn.Linear(flattened_width, embedding_dim)

    def forward(self, x):
        """Return the reconstruction of ``x`` (first dimension is the batch)."""
        batch_size = x.size(0)
        # Reshape to (batch_size, sequence_length, hidden_dim)
        pseudo_tokens = self.embedding_adapter(x).view(batch_size, self.sequence_length, -1)
        hidden_states = self.bert(inputs_embeds=pseudo_tokens).last_hidden_state
        decoded, _ = self.decoder(hidden_states)
        # Flatten the per-token LSTM outputs into one vector per sample.
        flattened = decoded.contiguous().view(decoded.size(0), -1)
        return self.output_layer(flattened)


def load_embeddings(file_path):
    """Read the embeddings and their per-row metadata from an HDF5 file.

    Args:
        file_path: Path of an HDF5 file holding the datasets ``embeddings``,
            ``names``, ``turn_numbers``, ``file_paths`` and ``model_names``.

    Returns:
        ``(embeddings, metadata)``. ``embeddings`` is the whole ``embeddings``
        dataset read into memory. ``metadata`` is a dict with the keys
        ``names``, ``turn_numbers``, ``file_paths`` and ``model_names``; the
        three string-valued entries are lists of UTF-8 decoded strings.
    """
    def _decode_all(raw_values):
        # Each stored entry is decoded from UTF-8 bytes.
        return [raw.decode('utf8') for raw in raw_values]

    # Everything is copied out while the file is open; it is closed on exit.
    with h5py.File(file_path, 'r') as h5_file:
        embeddings = h5_file['embeddings'][:]
        raw_names = h5_file['names'][:]
        turn_numbers = h5_file['turn_numbers'][:]
        raw_file_paths = h5_file['file_paths'][:]
        raw_model_names = h5_file['model_names'][:]

    metadata = {
        'names': _decode_all(raw_names),
        'turn_numbers': turn_numbers,
        'file_paths': _decode_all(raw_file_paths),
        'model_names': _decode_all(raw_model_names),
    }
    return embeddings, metadata

def calculate_cosine_similarity(outputs, inputs):
    """Return the mean row-wise cosine similarity of two batches, in percent.

    Args:
        outputs: Tensor compared along dimension 1 with ``inputs``.
        inputs: Tensor of matching shape.

    Returns:
        A Python float: the batch mean of the cosine similarities times 100.
    """
    per_row = nn.functional.cosine_similarity(outputs, inputs, dim=1, eps=1e-6)
    mean_similarity = per_row.mean().item()
    return mean_similarity * 100  # Convert to percentage

# Progressive data increment method
def progressive_training(model, train_dataset, initial_size, increment_ratio, max_epochs, device):
    """Train a reconstruction model on a random subset that grows each epoch.

    The dataset is split 90/10 into a train part and a held-out part. Every
    epoch draws a fresh random subset of the train part, runs one optimisation
    pass over it with MSE loss between the model output and its input,
    evaluates on the held-out part, and then enlarges the subset size by
    ``increment_ratio``. Training ends after ``max_epochs`` epochs or as soon
    as the requested subset size exceeds the train part.

    Args:
        model: Module whose output is compared with its input under MSE.
        train_dataset: Dataset whose items carry the input tensor at index 0.
        initial_size: Subset size of the first epoch; also used as the batch
            size of both data loaders.
        increment_ratio: Fractional growth of the subset size per epoch.
        max_epochs: Upper bound on the number of epochs.
        device: Device every batch is moved to.

    Returns:
        A list with one ``(subset_size, train_loss, val_loss,
        train_similarity, val_similarity)`` tuple per completed epoch.
        ``subset_size`` is the number of samples actually trained on in that
        epoch. The list is empty when ``initial_size`` already exceeds the
        train part.
    """
    total_size = len(train_dataset)
    test_fraction = 0.1
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    metrics = []

    # Separate the test data and ensure it's shuffled
    train_size = int((1 - test_fraction) * total_size)
    test_size = total_size - train_size
    train_dataset, test_dataset = random_split(train_dataset, [train_size, test_size])
    test_loader = DataLoader(test_dataset, batch_size=initial_size, shuffle=False, num_workers=4)

    # Kept fractional so that small increment ratios still accumulate.
    current_size = initial_size

    for epoch in range(max_epochs):
        if current_size > train_size:
            break

        # Number of samples really used this epoch; this is what gets logged.
        subset_size = int(current_size)
        # Plain Python ints are the safest index type for Subset.
        current_train_indices = torch.randperm(train_size)[:subset_size].tolist()
        current_train_dataset = torch.utils.data.Subset(train_dataset, current_train_indices)

        train_loader = DataLoader(current_train_dataset, batch_size=initial_size, shuffle=True, num_workers=4)

        model.train()
        train_loss, train_similarity = 0, 0
        for inputs in train_loader:
            inputs = inputs[0].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, inputs)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_similarity += calculate_cosine_similarity(outputs, inputs)

        train_loss /= len(train_loader)
        train_similarity /= len(train_loader)

        model.eval()
        val_loss, val_similarity = 0, 0
        with torch.no_grad():
            for inputs in test_loader:
                inputs = inputs[0].to(device)
                outputs = model(inputs)
                loss = criterion(outputs, inputs)
                val_loss += loss.item()
                val_similarity += calculate_cosine_similarity(outputs, inputs)

        val_loss /= len(test_loader)
        val_similarity /= len(test_loader)

        metrics.append((subset_size, train_loss, val_loss, train_similarity, val_similarity))

        # Report before growing, so the logged size matches the losses beside it.
        print(f'Epoch {epoch+1}/{max_epochs} | Data size: {subset_size} | Train Loss: {train_loss:.8f} | Val Loss: {val_loss:.8f} | Train Similarity: {train_similarity:.2f}% | Val Similarity: {val_similarity:.2f}%')

        current_size += current_size * increment_ratio

    return metrics

# Plotting function
def plot_metrics(metrics):
    """Plot loss and similarity against training-subset size.

    Args:
        metrics: Sequence of ``(size, train_loss, val_loss, train_similarity,
            val_similarity)`` tuples, one per epoch.

    Returns:
        None. When ``metrics`` is empty a notice is printed and nothing is
        drawn.
    """
    if not metrics:
        # zip(*metrics) would yield nothing and the five-way unpacking below
        # would raise ValueError, e.g. when training stopped before any epoch.
        print("No metrics to plot.")
        return

    sizes, train_losses, val_losses, train_similarities, val_similarities = zip(*metrics)

    plt.figure(figsize=(12, 8))

    # (subplot position, train series, train label, val series, val label, y label, title)
    panels = (
        (1, train_losses, 'Training Loss', val_losses, 'Validation Loss', 'Loss', 'Loss vs Data Size'),
        (2, train_similarities, 'Training Similarity', val_similarities, 'Validation Similarity',
         'Similarity (%)', 'Similarity vs Data Size'),
    )
    for position, train_series, train_label, val_series, val_label, y_label, title in panels:
        plt.subplot(2, 1, position)
        plt.plot(sizes, train_series, label=train_label)
        plt.plot(sizes, val_series, label=val_label)
        plt.xlabel('Data Size')
        plt.ylabel(y_label)
        plt.legend()
        plt.title(title)

    plt.tight_layout()
    plt.show()

# Main function
def main(embeddings_path, initial_size, increment_ratio, max_epochs):
    """Load embeddings, train the BERT autoencoder progressively, save and plot.

    Args:
        embeddings_path: HDF5 file read by ``load_embeddings``.
        initial_size: First subset size, forwarded to ``progressive_training``.
        increment_ratio: Per-epoch growth ratio of the subset size.
        max_epochs: Upper bound on the number of epochs.

    The trained weights are written to ``trained_autoencoder.pth`` in the
    current working directory.
    """
    # The metadata is loaded with the vectors but is not needed for training.
    embeddings, _metadata = load_embeddings(embeddings_path)
    dataset = TensorDataset(torch.tensor(embeddings, dtype=torch.float32))

    cuda_available = torch.cuda.is_available()
    device = torch.device('cuda' if cuda_available else 'cpu')
    autoencoder = BertAutoencoder('bert-base-uncased', embedding_dim=1536).to(device)

    history = progressive_training(autoencoder, dataset, initial_size, increment_ratio, max_epochs, device)

    # Save the model
    torch.save(autoencoder.state_dict(), 'trained_autoencoder.pth')

    plot_metrics(history)

# Entry point: configure the progressive-training run and start it.
if __name__ == "__main__":
    embeddings_path = 'utterance_embeddings.h5'  # Path to the HDF5 file containing embeddings
    initial_size = 12288  # Size of the first training subset; also used as the batch size
    increment_ratio = 0.05  # The subset size grows by this fraction after every epoch
    max_epochs = 5000  # Upper bound; training also stops once the subset size exceeds the train split
    main(embeddings_path, initial_size, increment_ratio, max_epochs)


OSError: Unable to synchronously open file (truncated file: eof = 8595177472, sblock->base_addr = 0, stored_eof = 10849309712)

In [None]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import matplotlib.pyplot as plt
import numpy as np
import h5py
from sklearn.metrics import mean_squared_error

# Define the Autoencoder model
class EmbeddingAutoencoder(nn.Module):
    """Fully connected autoencoder for fixed-size embedding vectors.

    The encoder is Linear -> ReLU -> Linear -> ReLU down to ``embedding_dim``;
    the decoder is Linear -> ReLU -> Linear back up to ``input_dim``.

    Args:
        input_dim: Size of the input and reconstructed vectors.
        hidden_dim: Width of the intermediate layer on both sides.
        embedding_dim: Size of the bottleneck code.
    """

    def __init__(self, input_dim=1536, hidden_dim=512, embedding_dim=256):
        super().__init__()
        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(True),
            nn.Linear(hidden_dim, embedding_dim),
            nn.ReLU(True),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        decoder_layers = [
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(True),
            nn.Linear(hidden_dim, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and decode it back."""
        latent = self.encoder(x)
        return self.decoder(latent)

def load_embeddings(file_path):
    """Load the embedding array and its metadata from an HDF5 file.

    Args:
        file_path: Path of an HDF5 file containing the datasets
            ``embeddings``, ``names``, ``turn_numbers``, ``file_paths`` and
            ``model_names``.

    Returns:
        A tuple ``(embeddings, metadata)``: the full ``embeddings`` dataset,
        and a dict keyed by ``names``, ``turn_numbers``, ``file_paths`` and
        ``model_names`` in which the string entries are UTF-8 decoded lists.
    """
    string_fields = ('names', 'file_paths', 'model_names')

    with h5py.File(file_path, 'r') as store:
        embeddings = store['embeddings'][:]
        names = store['names'][:]
        turn_numbers = store['turn_numbers'][:]
        stored_paths = store['file_paths'][:]
        model_names = store['model_names'][:]

    raw_strings = dict(zip(string_fields, (names, stored_paths, model_names)))
    decoded = {}
    for field, values in raw_strings.items():
        # Entries are decoded from UTF-8 bytes into Python strings.
        decoded[field] = [value.decode('utf8') for value in values]

    metadata = {
        'names': decoded['names'],
        'turn_numbers': turn_numbers,
        'file_paths': decoded['file_paths'],
        'model_names': decoded['model_names'],
    }
    return embeddings, metadata

def calculate_cosine_similarity(outputs, inputs):
    """Mean cosine similarity between matching rows, expressed in percent.

    Args:
        outputs: Tensor compared with ``inputs`` along dimension 1.
        inputs: Tensor of matching shape.

    Returns:
        The average similarity over the batch, multiplied by 100, as a float.
    """
    similarity_fn = nn.CosineSimilarity(dim=1, eps=1e-6)
    batch_mean = similarity_fn(outputs, inputs).mean()
    # Convert to percentage
    return batch_mean.item() * 100

# Progressive data increment method
def progressive_training(model, train_dataset, initial_size, increment_ratio, max_epochs, device):
    """Train a reconstruction model on a random subset that grows each epoch.

    The dataset is split 90/10 into a train part and a held-out part. Every
    epoch draws a fresh random subset of the train part, runs one optimisation
    pass over it with MSE loss between the model output and its input,
    evaluates on the held-out part, and then enlarges the subset size by
    ``increment_ratio``. Training ends after ``max_epochs`` epochs or as soon
    as the requested subset size exceeds the train part.

    Args:
        model: Module whose output is compared with its input under MSE.
        train_dataset: Dataset whose items carry the input tensor at index 0.
        initial_size: Subset size of the first epoch; also used as the batch
            size of both data loaders.
        increment_ratio: Fractional growth of the subset size per epoch.
        max_epochs: Upper bound on the number of epochs.
        device: Device every batch is moved to.

    Returns:
        A list with one ``(subset_size, train_loss, val_loss,
        train_similarity, val_similarity)`` tuple per completed epoch.
        ``subset_size`` is the number of samples actually trained on in that
        epoch. The list is empty when ``initial_size`` already exceeds the
        train part.
    """
    total_size = len(train_dataset)
    test_fraction = 0.1
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    metrics = []

    # Separate the test data and ensure it's shuffled
    train_size = int((1 - test_fraction) * total_size)
    test_size = total_size - train_size
    train_dataset, test_dataset = random_split(train_dataset, [train_size, test_size])
    test_loader = DataLoader(test_dataset, batch_size=initial_size, shuffle=False, num_workers=4)

    # Kept fractional so that small increment ratios still accumulate.
    current_size = initial_size

    for epoch in range(max_epochs):
        if current_size > train_size:
            break

        # Number of samples really used this epoch; this is what gets logged.
        subset_size = int(current_size)
        # Plain Python ints are the safest index type for Subset.
        current_train_indices = torch.randperm(train_size)[:subset_size].tolist()
        current_train_dataset = torch.utils.data.Subset(train_dataset, current_train_indices)

        train_loader = DataLoader(current_train_dataset, batch_size=initial_size, shuffle=True, num_workers=4)

        model.train()
        train_loss, train_similarity = 0, 0
        for inputs in train_loader:
            inputs = inputs[0].to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, inputs)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_similarity += calculate_cosine_similarity(outputs, inputs)

        train_loss /= len(train_loader)
        train_similarity /= len(train_loader)

        model.eval()
        val_loss, val_similarity = 0, 0
        with torch.no_grad():
            for inputs in test_loader:
                inputs = inputs[0].to(device)
                outputs = model(inputs)
                loss = criterion(outputs, inputs)
                val_loss += loss.item()
                val_similarity += calculate_cosine_similarity(outputs, inputs)

        val_loss /= len(test_loader)
        val_similarity /= len(test_loader)

        metrics.append((subset_size, train_loss, val_loss, train_similarity, val_similarity))

        # Report before growing, so the logged size matches the losses beside it.
        print(f'Epoch {epoch+1}/{max_epochs} | Data size: {subset_size} | Train Loss: {train_loss:.8f} | Val Loss: {val_loss:.8f} | Train Similarity: {train_similarity:.2f}% | Val Similarity: {val_similarity:.2f}%')

        current_size += current_size * increment_ratio

    return metrics

# Plotting function
def plot_metrics(metrics):
    """Plot loss and similarity against training-subset size.

    Args:
        metrics: Sequence of ``(size, train_loss, val_loss, train_similarity,
            val_similarity)`` tuples, one per epoch.

    Returns:
        None. When ``metrics`` is empty a notice is printed and nothing is
        drawn.
    """
    if not metrics:
        # With no rows, zip(*metrics) is empty and the five-way unpacking
        # below would raise ValueError (e.g. no epoch could be run).
        print("No metrics to plot.")
        return

    sizes, train_losses, val_losses, train_similarities, val_similarities = zip(*metrics)

    plt.figure(figsize=(12, 8))

    # Top panel: losses.
    plt.subplot(2, 1, 1)
    plt.plot(sizes, train_losses, label='Training Loss')
    plt.plot(sizes, val_losses, label='Validation Loss')
    plt.xlabel('Data Size')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Loss vs Data Size')

    # Bottom panel: cosine similarities in percent.
    plt.subplot(2, 1, 2)
    plt.plot(sizes, train_similarities, label='Training Similarity')
    plt.plot(sizes, val_similarities, label='Validation Similarity')
    plt.xlabel('Data Size')
    plt.ylabel('Similarity (%)')
    plt.legend()
    plt.title('Similarity vs Data Size')

    plt.tight_layout()
    plt.show()

# Main function
def main(embeddings_path, initial_size, increment_ratio, max_epochs):
    """Load embeddings, train the dense autoencoder progressively, save and plot.

    Args:
        embeddings_path: HDF5 file read by ``load_embeddings``.
        initial_size: First subset size, forwarded to ``progressive_training``.
        increment_ratio: Per-epoch growth ratio of the subset size.
        max_epochs: Upper bound on the number of epochs.

    The trained weights are written to ``trained_autoencoder.pth`` in the
    current working directory.
    """
    # Only the vectors are used; the metadata is ignored here.
    vectors, _metadata = load_embeddings(embeddings_path)
    vectors_tensor = torch.tensor(vectors, dtype=torch.float32)
    dataset = TensorDataset(vectors_tensor)

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    autoencoder = EmbeddingAutoencoder().to(device)

    history = progressive_training(autoencoder, dataset, initial_size, increment_ratio, max_epochs, device)

    # Save the model
    torch.save(autoencoder.state_dict(), 'trained_autoencoder.pth')

    plot_metrics(history)

# Entry point: configure the progressive-training run and start it.
if __name__ == "__main__":
    embeddings_path = 'utterance_embeddings.h5'  # Path to the HDF5 file containing embeddings
    initial_size = 1024  # Size of the first training subset; also used as the batch size
    increment_ratio = 0.1  # The subset size grows by this fraction after every epoch
    max_epochs = 50  # Upper bound; training also stops once the subset size exceeds the train split
    main(embeddings_path, initial_size, increment_ratio, max_epochs)


In [None]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from concurrent.futures import ProcessPoolExecutor

# Function to process a JSON file and extract the relevant data
def extract_data(file_path, max_length):
    """Extract one record per (speaker, turn) from a JSON transcript file.

    The file must contain a list of documents, each with a ``TURNS`` list
    whose entries have ``NAMES``, ``UTTERANCES`` and ``NUMBER``. The
    utterances of a turn are joined with spaces and lower-cased.

    Args:
        file_path: Path of the JSON file.
        max_length: Maximum utterance length in characters; turns with longer
            text are skipped. ``None`` disables the filter.

    Returns:
        A list of dicts with the keys ``name``, ``utterance`` and
        ``turn_number``, one per name of every retained turn.
    """
    # Reading bytes lets json detect UTF-8/16/32 itself instead of relying on
    # the platform's default text encoding.
    with open(file_path, 'rb') as file:
        data = json.load(file)

    extracted_data = []
    for document in data:
        for turn in document['TURNS']:
            names = turn['NAMES']
            if not names:
                continue
            # The text and its length check depend only on the turn, so they
            # are computed once rather than once per speaker.
            utterance = ' '.join(turn['UTTERANCES']).lower()
            if max_length is not None and len(utterance) > max_length:
                continue
            turn_number = turn['NUMBER']
            extracted_data.extend(
                {'name': name, 'utterance': utterance, 'turn_number': turn_number}
                for name in names
            )
    return extracted_data

# Wrapper function to enable multiprocessing
def extract_data_wrapper(args):
    """Call ``extract_data`` with its arguments packed in one iterable.

    Exists so that a single-argument callable can be handed to
    ``executor.map``.
    """
    records = extract_data(*args)
    return records

# Load data from JSON files in the specified folder and its subdirectories
def load_data(folder_path, max_length=None, num_workers=4):
    """Collect utterances and speaker names from every JSON file under a folder.

    Args:
        folder_path: Directory searched recursively for ``*.json`` files.
        max_length: Character limit forwarded to ``extract_data``.
        num_workers: Number of worker processes parsing the files.

    Returns:
        ``(all_texts, all_names)``: two parallel lists with the utterance and
        the name of every extracted record.
    """
    file_paths = [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(folder_path)
        for file_name in files
        if file_name.endswith('.json')
    ]

    jobs = [(file_path, max_length) for file_path in file_paths]
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        results = executor.map(extract_data_wrapper, jobs)

    all_texts = []
    all_names = []
    # Results arrive in the same order as the submitted jobs.
    for records in results:
        for record in records:
            all_texts.append(record['utterance'])
            all_names.append(record['name'])

    print(f"Processed {len(file_paths)} files and extracted {len(all_texts)} utterances suitable within length {max_length}.")
    return all_texts, all_names

# Preprocess text data using a tokenizer
def preprocess_texts(texts, tokenizer, max_length=128):
    """Tokenize ``texts`` into padded, truncated tensors.

    Args:
        texts: Texts to tokenize.
        tokenizer: Callable accepting the texts plus the ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keywords and
            returning an object with ``input_ids`` and ``attention_mask``.
        max_length: Maximum length passed to the tokenizer.

    Returns:
        ``(input_ids, attention_mask)``, or ``(None, None)`` when ``texts`` is
        empty.
    """
    if texts:
        encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
        return encodings.input_ids, encodings.attention_mask

    # Nothing to tokenize.
    print("No texts available for tokenization.")
    return None, None

# Define the Autoencoder model using a pretrained BERT model as the encoder
class BertAutoencoder(nn.Module):
    """Token-level autoencoder: pretrained BERT, an LSTM, and a vocabulary head.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        lstm_units: Hidden size of the LSTM decoder.
        max_length: Stored on the instance; not used by ``forward``.
    """

    def __init__(self, bert_model_name, lstm_units=256, max_length=64):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        # Alias of the BERT encoder stack; forward() goes through self.bert.
        self.encoder = self.bert.encoder
        bert_config = self.bert.config
        self.decoder = nn.LSTM(bert_config.hidden_size, lstm_units, batch_first=True)
        self.output_layer = nn.Linear(lstm_units, bert_config.vocab_size)
        self.max_length = max_length

    def forward(self, input_ids, attention_mask):
        """Return vocabulary logits for every input position."""
        hidden_states = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        decoded, _ = self.decoder(hidden_states)
        return self.output_layer(decoded)

def calculate_accuracy(logits, labels, ignore_index=None):
    """Return the fraction of positions whose arg-max prediction equals the label.

    Args:
        logits: Scores with the classes on the last dimension.
        labels: Integer labels shaped like ``logits`` without its last
            dimension.
        ignore_index: Optional label value (for example a padding id) whose
            positions are left out of the average.

    Returns:
        A 0-dim float tensor in ``[0, 1]``; zero when no position is scored.
    """
    _, predicted = torch.max(logits, -1)
    correct = (predicted == labels).float()
    if ignore_index is not None:
        correct = correct[labels != ignore_index]
    if correct.numel() == 0:
        return correct.new_zeros(())
    # Average over every element; dividing by len() only counted the first
    # dimension and gave values above 1 for non-flat input.
    return correct.mean()

# Progressive data increment method
def progressive_training(model, train_dataset, initial_size, increment_ratio, max_epochs, device,
                         batch_size=1024, num_workers=4):
    """Train a token autoencoder on a prefix of the train split that grows each epoch.

    The dataset is split 90/10 into a train part and a held-out part. Every
    epoch trains on the first ``int(current_size)`` items of the train part,
    evaluates on the held-out part, and then enlarges the size by
    ``increment_ratio``. Training ends after ``max_epochs`` epochs or as soon
    as the requested size exceeds the train part.

    Args:
        model: Module called as ``model(inputs, masks)`` and returning logits
            with the vocabulary on the last dimension.
        train_dataset: Dataset yielding ``(inputs, targets, masks)`` items.
        initial_size: Number of samples used in the first epoch.
        increment_ratio: Fractional growth of the sample count per epoch.
        max_epochs: Upper bound on the number of epochs.
        device: Device every batch is moved to.
        batch_size: Batch size of both data loaders.
        num_workers: Worker processes of both data loaders.

    Returns:
        A list with one ``(subset_size, train_loss, val_loss, train_accuracy,
        val_accuracy)`` tuple per completed epoch. ``subset_size`` is the
        number of samples actually trained on; accuracies are percentages
        over the non-padding target positions.
    """
    pad_index = 0  # target id ignored by the loss, and therefore by the accuracy too
    total_size = len(train_dataset)
    test_fraction = 0.1
    criterion = nn.CrossEntropyLoss(ignore_index=pad_index)
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    metrics = []

    def _run_epoch(loader, training):
        """Run one pass over ``loader``; return (mean batch loss, accuracy in %)."""
        model.train(training)
        loss_sum, correct, total = 0.0, 0, 0
        with torch.set_grad_enabled(training):
            for inputs, targets, masks in loader:
                inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)
                if training:
                    optimizer.zero_grad()
                outputs = model(inputs, masks)
                loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))
                if training:
                    loss.backward()
                    optimizer.step()
                loss_sum += loss.item()
                _, predicted = outputs.max(2)
                # Score only the positions the loss is computed on.
                scored = targets != pad_index
                total += scored.sum().item()
                correct += (predicted.eq(targets) & scored).sum().item()
        accuracy = 100.0 * correct / total if total else 0.0
        return loss_sum / len(loader), accuracy

    # Separate the test data and ensure it's shuffled
    train_size = int((1 - test_fraction) * total_size)
    test_size = total_size - train_size
    train_dataset, test_dataset = random_split(train_dataset, [train_size, test_size])
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Kept fractional so that small increment ratios still accumulate.
    current_size = initial_size

    for epoch in range(max_epochs):
        if current_size > train_size:
            break

        # Number of samples really used this epoch; this is what gets logged.
        subset_size = int(current_size)
        current_train_dataset = torch.utils.data.Subset(train_dataset, list(range(subset_size)))
        train_loader = DataLoader(current_train_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

        train_loss, train_accuracy = _run_epoch(train_loader, training=True)
        val_loss, val_accuracy = _run_epoch(test_loader, training=False)

        metrics.append((subset_size, train_loss, val_loss, train_accuracy, val_accuracy))

        # Report before growing, so the logged size matches the losses beside it.
        print(f'Epoch {epoch+1}/{max_epochs} | Data size: {subset_size} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Train Acc: {train_accuracy:.2f}% | Val Acc: {val_accuracy:.2f}%')

        current_size += current_size * increment_ratio

    return metrics

# Plotting function
def plot_metrics(metrics):
    """Plot loss and accuracy against training-subset size.

    Args:
        metrics: Sequence of ``(size, train_loss, val_loss, train_accuracy,
            val_accuracy)`` tuples, one per epoch.

    Returns:
        None. When ``metrics`` is empty a notice is printed and nothing is
        drawn.
    """
    if not metrics:
        # zip(*metrics) would yield nothing and the five-way unpacking below
        # would raise ValueError, e.g. when training stopped before any epoch.
        print("No metrics to plot.")
        return

    sizes, train_losses, val_losses, train_accuracies, val_accuracies = zip(*metrics)

    plt.figure(figsize=(12, 8))

    # (subplot position, train series, train label, val series, val label, y label, title)
    panels = (
        (1, train_losses, 'Training Loss', val_losses, 'Validation Loss', 'Loss', 'Loss vs Data Size'),
        (2, train_accuracies, 'Training Accuracy', val_accuracies, 'Validation Accuracy',
         'Accuracy', 'Accuracy vs Data Size'),
    )
    for position, train_series, train_label, val_series, val_label, y_label, title in panels:
        plt.subplot(2, 1, position)
        plt.plot(sizes, train_series, label=train_label)
        plt.plot(sizes, val_series, label=val_label)
        plt.xlabel('Data Size')
        plt.ylabel(y_label)
        plt.legend()
        plt.title(title)

    plt.tight_layout()
    plt.show()

# Main function
def main(folder_path, max_text_length, initial_size, increment_ratio, max_epochs, num_workers=4):
    """Load utterances, tokenize them, train the BERT autoencoder, save and plot.

    Args:
        folder_path: Directory searched by ``load_data`` for JSON files.
        max_text_length: Character limit forwarded to ``load_data``.
        initial_size: First subset size, forwarded to ``progressive_training``.
        increment_ratio: Per-epoch growth ratio of the subset size.
        max_epochs: Upper bound on the number of epochs.
        num_workers: Worker processes used by ``load_data``.

    Returns early, without training, when no text could be tokenized. The
    trained weights are written to ``trained_autoencoder.pth``.
    """
    # Speaker names are returned as well but are not needed for training.
    texts, _names = load_data(folder_path, max_text_length, num_workers)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    token_ids, attention_masks = preprocess_texts(texts, tokenizer, max_length=128)
    if token_ids is None:  # Check if tokenization failed
        return

    # The token ids serve both as model input and as reconstruction target.
    dataset = TensorDataset(token_ids, token_ids, attention_masks)

    cuda_available = torch.cuda.is_available()
    device = torch.device('cuda' if cuda_available else 'cpu')
    autoencoder = BertAutoencoder('bert-base-uncased').to(device)

    history = progressive_training(autoencoder, dataset, initial_size, increment_ratio, max_epochs, device)

    # Save the model
    torch.save(autoencoder.state_dict(), 'trained_autoencoder.pth')

    plot_metrics(history)

# Entry point: configure the text-autoencoder run and start it.
if __name__ == "__main__":
    folder_path = '/workspace/slice-monorepo/cl_cr3/aligneddata/c=3'  # Update with your actual data path
    max_text_length = 64  # Utterances longer than this many characters are dropped by extract_data
    initial_size = 1024  # Initial training data size
    increment_ratio = 0.01  # The training size grows by this fraction after every epoch
    max_epochs = 5000  # Upper bound; training also stops once the size exceeds the train split
    num_workers = 4  # Worker processes used by load_data to parse the JSON files
    main(folder_path, max_text_length, initial_size, increment_ratio, max_epochs, num_workers)


In [None]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from concurrent.futures import ProcessPoolExecutor

# Function to process a JSON file and extract the relevant data
def extract_data(file_path, max_length):
    """Extract one record per (speaker, turn) from a JSON transcript file.

    The file must contain a list of documents, each with a ``TURNS`` list
    whose entries have ``NAMES``, ``UTTERANCES`` and ``NUMBER``. The
    utterances of a turn are joined with spaces and lower-cased.

    Args:
        file_path: Path of the JSON file.
        max_length: Maximum utterance length in characters; turns with longer
            text are skipped. ``None`` disables the filter.

    Returns:
        A list of dicts with the keys ``name``, ``utterance`` and
        ``turn_number``, one per name of every retained turn.
    """
    # Reading bytes lets json detect UTF-8/16/32 itself instead of relying on
    # the platform's default text encoding.
    with open(file_path, 'rb') as file:
        data = json.load(file)

    extracted_data = []
    for document in data:
        for turn in document['TURNS']:
            names = turn['NAMES']
            if not names:
                continue
            # The text and its length check depend only on the turn, so they
            # are computed once rather than once per speaker.
            utterance = ' '.join(turn['UTTERANCES']).lower()
            if max_length is not None and len(utterance) > max_length:
                continue
            turn_number = turn['NUMBER']
            extracted_data.extend(
                {'name': name, 'utterance': utterance, 'turn_number': turn_number}
                for name in names
            )
    return extracted_data

# Wrapper function to enable multiprocessing
def extract_data_wrapper(args):
    """Unpack ``args`` and forward them to ``extract_data``.

    Gives ``executor.map`` a callable that takes a single argument.
    """
    extracted = extract_data(*args)
    return extracted

# Load data from JSON files in the specified folder and its subdirectories
def load_data(folder_path, max_length=None, num_workers=4):
    """Gather utterances and speaker names from all JSON files below a folder.

    Args:
        folder_path: Directory walked recursively for files ending in
            ``.json``.
        max_length: Character limit forwarded to ``extract_data``.
        num_workers: Size of the process pool that parses the files.

    Returns:
        ``(all_texts, all_names)``: parallel lists holding the utterance and
        the name of each extracted record.
    """
    file_paths = []
    for root, _dirs, files in os.walk(folder_path):
        json_files = (name for name in files if name.endswith('.json'))
        file_paths.extend(os.path.join(root, name) for name in json_files)

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        results = executor.map(extract_data_wrapper, [(file_path, max_length) for file_path in file_paths])

    all_texts, all_names = [], []
    # Results come back in submission order.
    for records in results:
        for record in records:
            all_texts.append(record['utterance'])
            all_names.append(record['name'])

    print(f"Processed {len(file_paths)} files and extracted {len(all_texts)} utterances suitable within length {max_length}.")
    return all_texts, all_names

# Preprocess text data using a tokenizer
def preprocess_texts(texts, tokenizer, max_length=128):
    """Run the tokenizer over ``texts`` with padding and truncation enabled.

    Args:
        texts: Texts to tokenize.
        tokenizer: Callable accepting the texts plus the ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keywords and
            returning an object with ``input_ids`` and ``attention_mask``.
        max_length: Maximum length passed to the tokenizer.

    Returns:
        ``(input_ids, attention_mask)``; ``(None, None)`` if ``texts`` is
        empty.
    """
    nothing_to_do = not texts  # Check if texts is empty
    if nothing_to_do:
        print("No texts available for tokenization.")
        return None, None

    tokenizer_options = {
        'truncation': True,
        'padding': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    encodings = tokenizer(texts, **tokenizer_options)
    return encodings.input_ids, encodings.attention_mask

# Define the Autoencoder model using a pretrained BERT model as the encoder
class BertAutoencoder(nn.Module):
    """Token-level autoencoder: pretrained BERT, an LSTM, and a vocabulary head.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        lstm_units: Hidden size of the LSTM decoder.
        max_length: Stored on the instance; not used by ``forward``.
    """

    def __init__(self, bert_model_name, lstm_units=256, max_length=128):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        # Alias of the BERT encoder stack; forward() goes through self.bert.
        self.encoder = self.bert.encoder
        hidden_size = self.bert.config.hidden_size
        self.decoder = nn.LSTM(hidden_size, lstm_units, batch_first=True)
        vocab_size = self.bert.config.vocab_size
        self.output_layer = nn.Linear(lstm_units, vocab_size)
        self.max_length = max_length

    def forward(self, input_ids, attention_mask):
        """Return vocabulary logits for every input position."""
        bert_result = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        decoded, _ = self.decoder(bert_result.last_hidden_state)
        logits = self.output_layer(decoded)
        return logits

def calculate_accuracy(logits, labels, ignore_index=None):
    """Return the fraction of positions whose arg-max prediction equals the label.

    Args:
        logits: Scores with the classes on the last dimension.
        labels: Integer labels shaped like ``logits`` without its last
            dimension.
        ignore_index: Optional label value (for example a padding id) whose
            positions are left out of the average.

    Returns:
        A 0-dim float tensor in ``[0, 1]``; zero when no position is scored.
    """
    _, predicted = torch.max(logits, -1)
    correct = (predicted == labels).float()
    if ignore_index is not None:
        correct = correct[labels != ignore_index]
    if correct.numel() == 0:
        return correct.new_zeros(())
    # Average over every element; dividing by len() only counted the first
    # dimension and gave values above 1 for non-flat input.
    return correct.mean()

# Progressive data increment method
def progressive_training(model, train_dataset, initial_size, increment_ratio, max_epochs, device,
                         batch_size=512, num_workers=4):
    """Train a token autoencoder on a random subset that grows each epoch.

    The dataset is split 90/10 into a train part and a held-out part. Every
    epoch draws a fresh random subset of the train part, trains on it,
    evaluates on the held-out part, and then enlarges the subset size by
    ``increment_ratio``. Training ends after ``max_epochs`` epochs or as soon
    as the requested size exceeds the train part.

    Args:
        model: Module called as ``model(inputs, masks)`` and returning logits
            with the vocabulary on the last dimension.
        train_dataset: Dataset yielding ``(inputs, targets, masks)`` items.
        initial_size: Number of samples used in the first epoch.
        increment_ratio: Fractional growth of the sample count per epoch.
        max_epochs: Upper bound on the number of epochs.
        device: Device every batch is moved to.
        batch_size: Batch size of both data loaders.
        num_workers: Worker processes of both data loaders.

    Returns:
        A list with one ``(subset_size, train_loss, val_loss, train_accuracy,
        val_accuracy)`` tuple per completed epoch. ``subset_size`` is the
        number of samples actually trained on; accuracies are percentages
        over the non-padding target positions.
    """
    pad_index = 0  # target id ignored by the loss, and therefore by the accuracy too
    total_size = len(train_dataset)
    test_fraction = 0.1
    criterion = nn.CrossEntropyLoss(ignore_index=pad_index)
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    metrics = []

    def _run_epoch(loader, training):
        """Run one pass over ``loader``; return (mean batch loss, accuracy in %)."""
        model.train(training)
        loss_sum, correct, total = 0.0, 0, 0
        with torch.set_grad_enabled(training):
            for inputs, targets, masks in loader:
                inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)
                if training:
                    optimizer.zero_grad()
                outputs = model(inputs, masks)
                loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))
                if training:
                    loss.backward()
                    optimizer.step()
                loss_sum += loss.item()
                _, predicted = outputs.max(-1)
                # Score only the positions the loss is computed on.
                scored = targets != pad_index
                total += scored.sum().item()
                correct += (predicted.eq(targets) & scored).sum().item()
        # Scaled to percent because the log line prints it with a '%' sign.
        accuracy = 100.0 * correct / total if total else 0.0
        return loss_sum / len(loader), accuracy

    # Separate the test data and ensure it's shuffled
    train_size = int((1 - test_fraction) * total_size)
    test_size = total_size - train_size
    train_dataset, test_dataset = random_split(train_dataset, [train_size, test_size])
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Kept fractional so that small increment ratios still accumulate.
    current_size = initial_size

    for epoch in range(max_epochs):
        if current_size > train_size:
            break

        # Number of samples really used this epoch; this is what gets logged.
        subset_size = int(current_size)
        # Plain Python ints are the safest index type for Subset.
        current_train_indices = torch.randperm(train_size)[:subset_size].tolist()
        current_train_dataset = torch.utils.data.Subset(train_dataset, current_train_indices)
        train_loader = DataLoader(current_train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

        train_loss, train_accuracy = _run_epoch(train_loader, training=True)
        val_loss, val_accuracy = _run_epoch(test_loader, training=False)

        metrics.append((subset_size, train_loss, val_loss, train_accuracy, val_accuracy))

        # Report before growing, so the logged size matches the losses beside it.
        print(f'Epoch {epoch+1}/{max_epochs} | Data size: {subset_size} | Train Loss: {train_loss:.8f} | Val Loss: {val_loss:.8f} | Train Acc: {train_accuracy:.2f}% | Val Acc: {val_accuracy:.2f}%')

        current_size += current_size * increment_ratio

    return metrics

# Plotting function
def plot_metrics(metrics):
    """Plot loss and accuracy against training-subset size.

    Args:
        metrics: Sequence of ``(size, train_loss, val_loss, train_accuracy,
            val_accuracy)`` tuples, one per epoch.

    Returns:
        None. When ``metrics`` is empty a notice is printed and nothing is
        drawn.
    """
    if not metrics:
        # With no rows, zip(*metrics) is empty and the five-way unpacking
        # below would raise ValueError (e.g. no epoch could be run).
        print("No metrics to plot.")
        return

    sizes, train_losses, val_losses, train_accuracies, val_accuracies = zip(*metrics)

    plt.figure(figsize=(12, 8))

    # Top panel: losses.
    plt.subplot(2, 1, 1)
    plt.plot(sizes, train_losses, label='Training Loss')
    plt.plot(sizes, val_losses, label='Validation Loss')
    plt.xlabel('Data Size')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Loss vs Data Size')

    # Bottom panel: accuracies.
    plt.subplot(2, 1, 2)
    plt.plot(sizes, train_accuracies, label='Training Accuracy')
    plt.plot(sizes, val_accuracies, label='Validation Accuracy')
    plt.xlabel('Data Size')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.title('Accuracy vs Data Size')

    plt.tight_layout()
    plt.show()

# Main function
def main(folder_path, max_text_length, initial_size, increment_ratio, max_epochs, num_workers=4):
    """Load utterances, tokenize them, train the BERT autoencoder, save and plot.

    Args:
        folder_path: Directory searched by ``load_data`` for JSON files.
        max_text_length: Character limit forwarded to ``load_data``.
        initial_size: First subset size, forwarded to ``progressive_training``.
        increment_ratio: Per-epoch growth ratio of the subset size.
        max_epochs: Upper bound on the number of epochs.
        num_workers: Worker processes used by ``load_data``.

    Returns early, without training, when no text could be tokenized. The
    trained weights are written to ``trained_autoencoder.pth``.
    """
    # Only the utterance texts are used; the speaker names are discarded.
    texts, _names = load_data(folder_path, max_text_length, num_workers)
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    token_ids, attention_masks = preprocess_texts(texts, bert_tokenizer, max_length=128)
    if token_ids is None:  # Check if tokenization failed
        return

    # The token ids serve both as model input and as reconstruction target.
    dataset = TensorDataset(token_ids, token_ids, attention_masks)

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    autoencoder = BertAutoencoder('bert-base-uncased').to(device)

    history = progressive_training(autoencoder, dataset, initial_size, increment_ratio, max_epochs, device)

    # Save the model
    torch.save(autoencoder.state_dict(), 'trained_autoencoder.pth')

    plot_metrics(history)

# Entry point: configure the text-autoencoder run and start it.
if __name__ == "__main__":
    folder_path = '/workspace/slice-monorepo/cl_cr3/aligneddata/c=3'  # Update with your actual data path
    max_text_length = 128  # Utterances longer than this many characters are dropped by extract_data
    initial_size = 1024  # Initial training data size
    increment_ratio = 0.01  # The training size grows by this fraction after every epoch
    max_epochs = 5000  # Upper bound; training also stops once the size exceeds the train split
    num_workers = 4  # Worker processes used by load_data to parse the JSON files
    main(folder_path, max_text_length, initial_size, increment_ratio, max_epochs, num_workers)


In [None]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
import numpy as np
import h5py

class BertAutoencoder(nn.Module):
    """Embedding autoencoder built from an adapter, BERT, an LSTM and a linear head.

    A linear adapter turns each input vector into ``sequence_length``
    pseudo-token embeddings. These go through a pretrained BERT (via
    ``inputs_embeds``) and an LSTM; the flattened LSTM outputs are projected
    back to ``embedding_dim``.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        embedding_dim: Size of the input and of the reconstructed output.
        hidden_dim: Width of each pseudo-token given to BERT
            (presumably has to equal the BERT hidden size -- TODO confirm).
        lstm_units: Hidden size of the LSTM decoder.
        sequence_length: Number of pseudo-tokens per input vector.
    """

    def __init__(self, bert_model_name, embedding_dim=1536, hidden_dim=768, lstm_units=256, sequence_length=4):
        super().__init__()
        self.sequence_length = sequence_length
        self.embedding_adapter = nn.Linear(
            in_features=embedding_dim,
            out_features=hidden_dim * sequence_length,
        )
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.decoder = nn.LSTM(hidden_dim, lstm_units, batch_first=True)
        self.output_layer = nn.Linear(
            in_features=lstm_units * sequence_length,
            out_features=embedding_dim,
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (first dimension is the batch)."""
        adapted = self.embedding_adapter(x)
        # Reshape to (batch_size, sequence_length, hidden_dim)
        pseudo_tokens = adapted.view(adapted.size(0), self.sequence_length, -1)
        bert_result = self.bert(inputs_embeds=pseudo_tokens)
        decoded, _ = self.decoder(bert_result.last_hidden_state)
        # Flatten the per-token LSTM outputs into one vector per sample.
        flattened = decoded.contiguous().view(decoded.size(0), -1)
        reconstruction = self.output_layer(flattened)
        return reconstruction

def load_embeddings(file_path):
    """Read the embeddings and their per-row metadata from an HDF5 file.

    Args:
        file_path: Path of an HDF5 file holding the datasets ``embeddings``,
            ``names``, ``turn_numbers``, ``file_paths`` and ``model_names``.

    Returns:
        ``(embeddings, metadata)``. ``embeddings`` is the whole ``embeddings``
        dataset read into memory. ``metadata`` is a dict with the keys
        ``names``, ``turn_numbers``, ``file_paths`` and ``model_names``; the
        three string-valued entries are lists of UTF-8 decoded strings.
    """
    def _as_text(raw_values):
        # Each stored entry is decoded from UTF-8 bytes.
        return [raw.decode('utf8') for raw in raw_values]

    # All datasets are copied into memory before the file is closed.
    with h5py.File(file_path, 'r') as h5_file:
        embeddings = h5_file['embeddings'][:]
        raw_names = h5_file['names'][:]
        turn_numbers = h5_file['turn_numbers'][:]
        raw_file_paths = h5_file['file_paths'][:]
        raw_model_names = h5_file['model_names'][:]

    metadata = {}
    metadata['names'] = _as_text(raw_names)
    metadata['turn_numbers'] = turn_numbers
    metadata['file_paths'] = _as_text(raw_file_paths)
    metadata['model_names'] = _as_text(raw_model_names)
    return embeddings, metadata

def calculate_cosine_similarity(outputs, inputs):
    """Average the per-row cosine similarity of two batches and scale to percent.

    Args:
        outputs: Tensor compared with ``inputs`` along dimension 1.
        inputs: Tensor of matching shape.

    Returns:
        The batch-mean similarity times 100, as a Python float.
    """
    row_similarities = nn.CosineSimilarity(dim=1, eps=1e-6)(outputs, inputs)
    mean_value = row_similarities.mean().item()
    percentage = mean_value * 100  # Convert to percentage
    return percentage

def standard_training(model, train_dataset, batch_size, max_epochs, device):
    """Train a reconstruction model on a fixed 90/10 train/held-out split.

    Each epoch runs one optimisation pass over the train part with MSE loss
    between the model output and its input, then evaluates on the held-out
    part.

    Args:
        model: Module whose output is compared with its input under MSE.
        train_dataset: Dataset whose items carry the input tensor at index 0.
        batch_size: Batch size of both data loaders.
        max_epochs: Number of epochs to run.
        device: Device every batch is moved to.

    Returns:
        A list with one ``(epoch, train_loss, val_loss, train_similarity,
        val_similarity)`` tuple per epoch; ``epoch`` is zero-based.
    """
    n_total = len(train_dataset)
    holdout_fraction = 0.1
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    history = []

    # Separate the test data and ensure it's shuffled
    n_train = int((1 - holdout_fraction) * n_total)
    n_holdout = n_total - n_train
    train_part, holdout_part = random_split(train_dataset, [n_train, n_holdout])
    test_loader = DataLoader(holdout_part, batch_size=batch_size, shuffle=False, num_workers=4)

    train_loader = DataLoader(train_part, batch_size=batch_size, shuffle=True, num_workers=4)

    for epoch in range(max_epochs):
        # Optimisation pass over the train part.
        model.train()
        train_loss = 0
        train_similarity = 0
        for batch in train_loader:
            inputs = batch[0].to(device)
            optimizer.zero_grad()
            reconstruction = model(inputs)
            batch_loss = criterion(reconstruction, inputs)
            batch_loss.backward()
            optimizer.step()
            train_loss += batch_loss.item()
            train_similarity += calculate_cosine_similarity(reconstruction, inputs)

        n_train_batches = len(train_loader)
        train_loss /= n_train_batches
        train_similarity /= n_train_batches

        # Evaluation pass over the held-out part, without gradients.
        model.eval()
        val_loss = 0
        val_similarity = 0
        with torch.no_grad():
            for batch in test_loader:
                inputs = batch[0].to(device)
                reconstruction = model(inputs)
                batch_loss = criterion(reconstruction, inputs)
                val_loss += batch_loss.item()
                val_similarity += calculate_cosine_similarity(reconstruction, inputs)

        n_val_batches = len(test_loader)
        val_loss /= n_val_batches
        val_similarity /= n_val_batches

        history.append((epoch, train_loss, val_loss, train_similarity, val_similarity))

        print(f'Epoch {epoch+1}/{max_epochs} | Train Loss: {train_loss:.8f} | Val Loss: {val_loss:.8f} | Train Similarity: {train_similarity:.2f}% | Val Similarity: {val_similarity:.2f}%')

    return history

# Plotting function
def plot_metrics(metrics):
    """Plot loss and similarity against the epoch index.

    Args:
        metrics: Sequence of ``(epoch, train_loss, val_loss, train_similarity,
            val_similarity)`` tuples, one per epoch.

    Returns:
        None. When ``metrics`` is empty a notice is printed and nothing is
        drawn.
    """
    if not metrics:
        # zip(*metrics) would yield nothing and the five-way unpacking below
        # would raise ValueError, e.g. when training ran for zero epochs.
        print("No metrics to plot.")
        return

    epochs, train_losses, val_losses, train_similarities, val_similarities = zip(*metrics)

    plt.figure(figsize=(12, 8))

    # (subplot position, train series, train label, val series, val label, y label, title)
    panels = (
        (1, train_losses, 'Training Loss', val_losses, 'Validation Loss', 'Loss', 'Loss vs Epochs'),
        (2, train_similarities, 'Training Similarity', val_similarities, 'Validation Similarity',
         'Similarity (%)', 'Similarity vs Epochs'),
    )
    for position, train_series, train_label, val_series, val_label, y_label, title in panels:
        plt.subplot(2, 1, position)
        plt.plot(epochs, train_series, label=train_label)
        plt.plot(epochs, val_series, label=val_label)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
        plt.title(title)

    plt.tight_layout()
    plt.show()

# Main function
def main(embeddings_path, batch_size, max_epochs):
    """Load embeddings, train the BERT autoencoder on a fixed split, save and plot.

    Args:
        embeddings_path: HDF5 file read by ``load_embeddings``.
        batch_size: Batch size forwarded to ``standard_training``.
        max_epochs: Number of epochs forwarded to ``standard_training``.

    The trained weights are written to ``trained_autoencoder.pth`` in the
    current working directory.
    """
    # The metadata is loaded with the vectors but is not needed for training.
    embeddings, _metadata = load_embeddings(embeddings_path)
    dataset = TensorDataset(torch.tensor(embeddings, dtype=torch.float32))

    cuda_available = torch.cuda.is_available()
    device = torch.device('cuda' if cuda_available else 'cpu')
    autoencoder = BertAutoencoder('bert-base-uncased', embedding_dim=1536).to(device)

    history = standard_training(autoencoder, dataset, batch_size, max_epochs, device)

    # Save the model
    torch.save(autoencoder.state_dict(), 'trained_autoencoder.pth')

    plot_metrics(history)

# Entry point: configure the fixed-split training run and start it.
if __name__ == "__main__":
    embeddings_path = 'utterance_embeddings.h5'  # Path to the HDF5 file containing embeddings
    batch_size = 12288  # Batch size of both the training and the held-out loader
    max_epochs = 50  # Number of epochs to train
    main(embeddings_path, batch_size, max_epochs)
