In [None]:
!pip install -q scikit-learn transformers matplotlib

In [None]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

# Function to process a JSON file and extract the relevant data
def extract_data(file_path):
    """Flatten one dialogue JSON file into per-speaker utterance records.

    The file must hold a list of documents, each with a ``'TURNS'`` list
    whose entries carry ``'NAMES'``, ``'UTTERANCES'`` and ``'NUMBER'``.
    One record is emitted for every name of every turn; all names of a turn
    share the same text, which is the turn's utterances joined by a space.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list of dicts with the keys ``'name'``, ``'utterance'`` and
        ``'turn_number'``, in file order.

    Raises:
        KeyError: If a document or a turn lacks one of the expected keys.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; relying on the platform's default
    # encoding can corrupt or reject non-ASCII text.
    with open(file_path, encoding='utf-8') as file:
        data = json.load(file)

    extracted_data = []
    for document in data:
        for turn in document['TURNS']:
            names = turn['NAMES']
            if not names:
                # A turn without speakers contributes no records.
                continue
            # Build the shared text once per turn, not once per speaker.
            utterance = ' '.join(turn['UTTERANCES'])
            turn_number = turn['NUMBER']
            extracted_data.extend(
                {'name': name, 'utterance': utterance, 'turn_number': turn_number}
                for name in names
            )
    return extracted_data

# Load data from JSON files in the specified folder and its subdirectories
def load_data(folder_path, max_length=None):
    """Collect utterance strings from every ``.json`` file under a folder.

    The folder is walked recursively. Each JSON file is parsed with
    ``extract_data`` and the ``'utterance'`` of every record is kept unless
    it is longer than ``max_length`` characters.

    Args:
        folder_path: Root directory to search.
        max_length: Maximum utterance length in characters (``len`` of the
            string); ``None`` disables the filter.

    Returns:
        A list of the kept utterance strings. A one-line summary is printed
        before returning.
    """
    kept_utterances = []
    n_files = 0
    for current_dir, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            if not filename.endswith('.json'):
                continue
            records = extract_data(os.path.join(current_dir, filename))
            kept_utterances.extend(
                record['utterance']
                for record in records
                if max_length is None or len(record['utterance']) <= max_length
            )
            n_files += 1
    print(f"Processed {n_files} files and extracted {len(kept_utterances)} utterances suitable within length {max_length}.")
    return kept_utterances

# Preprocess text data using a tokenizer
def preprocess_texts(texts, tokenizer, max_length=128):
    """Tokenize a batch of texts into padded id and mask tensors.

    Args:
        texts: Sequence of strings to encode.
        tokenizer: Callable invoked with truncation, padding, ``max_length``
            and ``return_tensors='pt'``; its result must expose ``input_ids``
            and ``attention_mask``.
        max_length: Upper bound forwarded to the tokenizer for truncation.

    Returns:
        ``(input_ids, attention_mask)`` from the tokenizer, or
        ``(None, None)`` when ``texts`` is empty (a notice is printed).
    """
    if texts:
        batch = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt',
        )
        return batch.input_ids, batch.attention_mask

    # Nothing to encode: signal the caller instead of calling the tokenizer.
    print("No texts available for tokenization.")
    return None, None

# Define the Autoencoder model using a pretrained BERT model as the encoder
class BertAutoencoder(nn.Module):
    """Pretrained BERT followed by an LSTM and a per-token vocabulary head.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        lstm_units: Hidden size of the LSTM that consumes BERT's output.
        max_length: Stored on the instance; not read anywhere in this class.
    """

    def __init__(self, bert_model_name, lstm_units=256, max_length=512):
        super().__init__()
        bert = BertModel.from_pretrained(bert_model_name)
        self.bert = bert
        # Second reference to the encoder stack that self.bert already owns;
        # forward() goes through self.bert and does not use it directly.
        self.encoder = bert.encoder
        hidden_size = bert.config.hidden_size
        vocab_size = bert.config.vocab_size
        self.decoder = nn.LSTM(hidden_size, lstm_units, batch_first=True)
        self.output_layer = nn.Linear(lstm_units, vocab_size)
        self.max_length = max_length

    def forward(self, input_ids, attention_mask):
        """Return vocabulary logits for every input position.

        BERT's ``last_hidden_state`` is fed through the LSTM, and each LSTM
        output step is projected to ``vocab_size`` scores.
        """
        hidden_states = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        lstm_out, _final_state = self.decoder(hidden_states)
        return self.output_layer(lstm_out)

def calculate_accuracy(logits, labels, ignore_index=None):
    """Return the fraction of positions whose arg-max class equals the label.

    Predictions and labels are flattened before comparison, so ``labels`` may
    be shaped like ``logits`` without its last axis or already be 1-D. The
    result is always ``correct / total`` over all compared positions (the
    previous version divided by the size of the first axis only, which
    over-counted for batched 2-D labels).

    Args:
        logits: Tensor whose last axis holds the per-class scores.
        labels: Integer tensor with one label per scored position.
        ignore_index: Optional label value to exclude from the average
            (e.g. a padding id). ``None`` counts every position.

    Returns:
        A 0-dim float tensor in ``[0, 1]``; ``0`` when no position is counted.
    """
    predicted = logits.argmax(dim=-1).reshape(-1)
    flat_labels = labels.reshape(-1)
    correct = predicted == flat_labels
    if ignore_index is not None:
        correct = correct[flat_labels != ignore_index]
    if correct.numel() == 0:
        # Avoid a NaN mean when there is nothing to score.
        return torch.zeros((), dtype=torch.float, device=correct.device)
    return correct.float().mean()

# Progressive data increment method
def progressive_training(model, train_dataset, initial_size, increment_ratio, max_epochs, device):
    """Train on a random subset that grows geometrically from epoch to epoch.

    Ten percent of ``train_dataset`` is held out once for validation. In each
    epoch a fresh random subset of the remaining data is drawn, the model is
    trained on it for one pass and then evaluated on the held-out part.
    Afterwards the subset size is multiplied by ``1 + increment_ratio``.
    Training stops after ``max_epochs`` epochs or as soon as the requested
    size exceeds the training split (so an ``initial_size`` larger than the
    split yields an empty result).

    Args:
        model: Module called as ``model(inputs, masks)`` that returns logits
            whose last axis is the class axis.
        train_dataset: Dataset yielding ``(inputs, targets, masks)`` tensors.
        initial_size: Number of samples used in the first epoch (>= 1).
        increment_ratio: Relative growth of the subset size per epoch.
        max_epochs: Upper bound on the number of epochs.
        device: Device the batches are moved to.

    Returns:
        A list with one tuple per completed epoch:
        ``(samples_used, train_loss, val_loss, train_accuracy, val_accuracy)``.
        Losses and accuracies are means over batches; accuracies are
        fractions in ``[0, 1]``.

    Raises:
        ValueError: If ``initial_size`` is smaller than 1.
    """
    if int(initial_size) < 1:
        raise ValueError("initial_size must be at least 1")

    # Targets equal to 0 do not contribute to the loss
    # (presumably the padding token id - confirm against the tokenizer).
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    metrics = []

    # Hold out a fixed, randomly chosen validation split.
    held_out_fraction = 0.1
    total_size = len(train_dataset)
    train_size = int((1 - held_out_fraction) * total_size)
    test_size = total_size - train_size
    train_dataset, test_dataset = random_split(train_dataset, [train_size, test_size])
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Kept as a float so that small ratios still accumulate across epochs;
    # the integer part is the number of samples actually drawn.
    current_size = initial_size

    for epoch in range(max_epochs):
        if current_size > train_size:
            break

        n_samples = int(current_size)
        current_train_indices = torch.randperm(train_size)[:n_samples].tolist()
        current_train_dataset = torch.utils.data.Subset(train_dataset, current_train_indices)
        train_loader = DataLoader(current_train_dataset, batch_size=32, shuffle=True)

        model.train()
        train_loss, train_accuracy = 0.0, 0.0
        for inputs, targets, masks in train_loader:
            inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)
            optimizer.zero_grad()
            outputs = model(inputs, masks)
            # Flatten logits and targets the same way so that loss and
            # accuracy are both computed per token.
            flat_logits = outputs.view(-1, outputs.size(-1))
            flat_targets = targets.view(-1)
            loss = criterion(flat_logits, flat_targets)
            acc = calculate_accuracy(flat_logits, flat_targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_accuracy += acc.item()

        train_loss /= len(train_loader)
        train_accuracy /= len(train_loader)

        model.eval()
        val_loss, val_accuracy = 0.0, 0.0
        with torch.no_grad():
            for inputs, targets, masks in test_loader:
                inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)
                outputs = model(inputs, masks)
                flat_logits = outputs.view(-1, outputs.size(-1))
                flat_targets = targets.view(-1)
                loss = criterion(flat_logits, flat_targets)
                acc = calculate_accuracy(flat_logits, flat_targets)
                val_loss += loss.item()
                val_accuracy += acc.item()

        val_loss /= len(test_loader)
        val_accuracy /= len(test_loader)

        metrics.append((n_samples, train_loss, val_loss, train_accuracy, val_accuracy))

        # Report the size that was used in this epoch, and show the
        # accuracy fractions as real percentages.
        print(f'Epoch {epoch+1}/{max_epochs} | Data size: {n_samples} | Train Loss: {train_loss:.8f} | Val Loss: {val_loss:.8f} | Train Acc: {train_accuracy * 100:.2f}% | Val Acc: {val_accuracy * 100:.2f}%')

        current_size += current_size * increment_ratio

    return metrics

# Plotting function
def plot_metrics(metrics):
    """Plot loss and accuracy against training-set size.

    Args:
        metrics: Sequence of 5-tuples
            ``(size, train_loss, val_loss, train_accuracy, val_accuracy)``.
            When it is empty a notice is printed and nothing is drawn.

    Returns:
        None. A figure with two stacked panels is shown via ``plt.show()``.
    """
    if not metrics:
        # zip(*[]) yields nothing, so the unpacking below would raise.
        print("No metrics to plot.")
        return

    sizes, train_losses, val_losses, train_accuracies, val_accuracies = zip(*metrics)

    plt.figure(figsize=(12, 8))

    # Upper panel: losses.
    plt.subplot(2, 1, 1)
    plt.plot(sizes, train_losses, label='Training Loss')
    plt.plot(sizes, val_losses, label='Validation Loss')
    plt.xlabel('Data Size')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Loss vs Data Size')

    # Lower panel: accuracies.
    plt.subplot(2, 1, 2)
    plt.plot(sizes, train_accuracies, label='Training Accuracy')
    plt.plot(sizes, val_accuracies, label='Validation Accuracy')
    plt.xlabel('Data Size')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.title('Accuracy vs Data Size')

    plt.tight_layout()
    plt.show()

# Main function
def main(folder_path, max_text_length, initial_size, increment_ratio, max_epochs):
    """Run the progressive-training experiment end to end.

    Loads and tokenizes the utterances, trains a ``BertAutoencoder`` with
    ``progressive_training`` using the token ids as both input and target,
    writes the weights to ``trained_autoencoder.pth`` and plots the metrics.
    Returns early, doing nothing else, when there is no text to tokenize.

    Args:
        folder_path: Root folder of the JSON corpus.
        max_text_length: Character limit passed to ``load_data``.
        initial_size: Number of samples for the first epoch.
        increment_ratio: Relative growth of the sample count per epoch.
        max_epochs: Maximum number of epochs.
    """
    bert_name = 'bert-base-uncased'

    texts = load_data(folder_path, max_text_length)
    tokenizer = BertTokenizer.from_pretrained(bert_name)
    input_ids, attention_masks = preprocess_texts(texts, tokenizer)
    if input_ids is None:
        # preprocess_texts had nothing to encode.
        return

    # Inputs double as reconstruction targets.
    dataset = TensorDataset(input_ids, input_ids, attention_masks)

    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)
    model = BertAutoencoder(bert_name).to(device)

    metrics = progressive_training(
        model, dataset, initial_size, increment_ratio, max_epochs, device
    )

    # Persist the trained weights before plotting.
    torch.save(model.state_dict(), 'trained_autoencoder.pth')
    plot_metrics(metrics)

if __name__ == "__main__":
    # Root of the JSON corpus; adjust to your environment.
    folder_path = '/workspace/slice-monorepo/cl_cr3/aligneddata'
    # Longest utterance, in characters, that is kept.
    max_text_length = 150
    # Number of samples used in the first epoch.
    initial_size = 100
    # Relative growth of the sample count after every epoch.
    increment_ratio = 0.005
    # Upper bound on the number of epochs.
    max_epochs = 50
    main(
        folder_path=folder_path,
        max_text_length=max_text_length,
        initial_size=initial_size,
        increment_ratio=increment_ratio,
        max_epochs=max_epochs,
    )


In [None]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

# Function to process a JSON file and extract the relevant data
def extract_data(file_path):
    """Turn one dialogue JSON file into a flat list of speaker records.

    The file must contain a list of documents. Every document has a
    ``'TURNS'`` list, and every turn provides ``'NAMES'``, ``'UTTERANCES'``
    and ``'NUMBER'``. Each name of a turn produces one record; the record
    text is the turn's utterances joined with single spaces.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list of dicts with the keys ``'name'``, ``'utterance'`` and
        ``'turn_number'``, in file order.

    Raises:
        KeyError: If an expected key is missing from a document or turn.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read explicitly as UTF-8 (the JSON standard encoding) instead of the
    # platform default, which may mis-decode non-ASCII utterances.
    with open(file_path, encoding='utf-8') as file:
        data = json.load(file)

    extracted_data = []
    for document in data:
        for turn in document['TURNS']:
            names = turn['NAMES']
            if not names:
                # No speakers means no records for this turn.
                continue
            # Shared by all speakers of the turn, so compute it only once.
            utterance = ' '.join(turn['UTTERANCES'])
            turn_number = turn['NUMBER']
            extracted_data.extend(
                {'name': name, 'utterance': utterance, 'turn_number': turn_number}
                for name in names
            )
    return extracted_data

# Load data from JSON files in the specified folder and its subdirectories
def load_data(folder_path, max_length=None):
    """Gather utterance texts from all ``.json`` files below ``folder_path``.

    Every JSON file found by a recursive walk is parsed with
    ``extract_data``. An utterance is kept when ``max_length`` is ``None``
    or when its character count does not exceed ``max_length``.

    Args:
        folder_path: Directory searched recursively.
        max_length: Optional character limit per utterance.

    Returns:
        The list of kept utterance strings; a summary line is printed first.
    """
    def _fits(text):
        # The limit applies to characters, not to tokens.
        return max_length is None or len(text) <= max_length

    selected = []
    processed = 0
    for directory, _children, names in os.walk(folder_path):
        json_names = (name for name in names if name.endswith('.json'))
        for name in json_names:
            for entry in extract_data(os.path.join(directory, name)):
                if _fits(entry['utterance']):
                    selected.append(entry['utterance'])
            processed += 1
    print(f"Processed {processed} files and extracted {len(selected)} utterances suitable within length {max_length}.")
    return selected

# Preprocess text data using a tokenizer
def preprocess_texts(texts, tokenizer, max_length=128):
    """Encode texts with ``tokenizer`` and return ids plus attention mask.

    Args:
        texts: Strings to encode; an empty value short-circuits the call.
        tokenizer: Callable accepting ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors``; the object it returns
            must have ``input_ids`` and ``attention_mask`` attributes.
        max_length: Truncation length forwarded to the tokenizer.

    Returns:
        ``(input_ids, attention_mask)``, or ``(None, None)`` after printing
        a notice when ``texts`` is empty.
    """
    nothing_to_encode = not texts
    if nothing_to_encode:
        print("No texts available for tokenization.")
        return (None, None)

    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    encoded = tokenizer(texts, **tokenizer_options)
    return encoded.input_ids, encoded.attention_mask

# Define the Autoencoder model using a pretrained BERT model as the encoder
class BertAutoencoder(nn.Module):
    """BERT encoder whose hidden states are decoded back to token logits.

    Args:
        bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
        lstm_units: Hidden size of the decoding LSTM.
        max_length: Kept as an attribute only; unused inside this class.
    """

    def __init__(self, bert_model_name, lstm_units=256, max_length=512):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        # Extra handle on the encoder stack already contained in self.bert.
        self.encoder = self.bert.encoder
        config = self.bert.config
        self.decoder = nn.LSTM(config.hidden_size, lstm_units, batch_first=True)
        self.output_layer = nn.Linear(lstm_units, config.vocab_size)
        self.max_length = max_length

    def forward(self, input_ids, attention_mask):
        """Map token ids to per-position scores over the vocabulary."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_states, _ = self.decoder(encoded.last_hidden_state)
        logits = self.output_layer(sequence_states)
        return logits

def calculate_accuracy(logits, labels, ignore_index=None):
    """Compute the share of positions predicted correctly.

    The arg-max over the last axis of ``logits`` is compared with ``labels``
    after both are flattened, so the result is ``correct / total`` regardless
    of whether the labels arrive as ``(batch, seq)`` or already flat.
    (Dividing by ``len`` of an unflattened tensor, as done before, counted
    correct tokens per sequence instead of a fraction.)

    Args:
        logits: Tensor with class scores on its last axis.
        labels: Integer tensor holding one label per scored position.
        ignore_index: Optional label value left out of the average, such as
            a padding id. ``None`` keeps every position.

    Returns:
        A 0-dim float tensor in ``[0, 1]``; ``0`` if no position is counted.
    """
    flat_labels = labels.reshape(-1)
    hits = logits.argmax(dim=-1).reshape(-1) == flat_labels
    if ignore_index is not None:
        hits = hits[flat_labels != ignore_index]
    if hits.numel() == 0:
        # mean() of an empty tensor would be NaN.
        return torch.zeros((), dtype=torch.float, device=hits.device)
    return hits.float().mean()

# Main function
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Make one pass over ``loader``; train if ``optimizer`` is given, else evaluate.

    Returns ``(mean_loss, mean_accuracy)``, both averaged over batches.
    Accuracy is the per-token fraction reported by ``calculate_accuracy``.
    """
    training = optimizer is not None
    model.train(training)
    total_loss, total_acc = 0.0, 0.0
    with torch.set_grad_enabled(training):
        for inputs, targets, masks in loader:
            inputs, targets, masks = inputs.to(device), targets.to(device), masks.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(inputs, masks)
            # Flatten both tensors identically so that the accuracy is a
            # fraction of tokens rather than a per-sequence count.
            flat_logits = outputs.view(-1, outputs.size(-1))
            flat_targets = targets.view(-1)
            loss = criterion(flat_logits, flat_targets)
            acc = calculate_accuracy(flat_logits, flat_targets)
            if training:
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
            total_acc += acc.item()
    return total_loss / len(loader), total_acc / len(loader)


def _plot_history(train_losses, val_losses, train_accs, val_accs):
    """Show per-epoch loss and accuracy curves in two side-by-side panels."""
    plt.figure(figsize=(10, 5))

    # Plotting training and validation loss
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Training vs Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # Plotting training and validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(train_accs, label='Training Accuracy')
    plt.plot(val_accs, label='Validation Accuracy')
    plt.title('Training vs Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.show()


# Main function
def main(folder_path, max_text_length, num_samples):
    """Train the autoencoder on a random sample of the corpus and plot the curves.

    Loads the utterances, keeps at most ``num_samples`` of them (chosen at
    random without replacement), tokenizes them, splits 90/10 into training
    and validation data, trains for 15 epochs, saves the weights to
    ``trained_autoencoder.pth`` and plots loss and accuracy per epoch.
    Returns early when no text is available.

    Args:
        folder_path: Root folder of the JSON corpus.
        max_text_length: Character limit passed to ``load_data``.
        num_samples: Maximum number of utterances to train on.
    """
    texts = load_data(folder_path, max_text_length)
    if not texts:
        print("Insufficient texts meet the criteria or no texts available.")
        return
    if len(texts) > num_samples:
        # Sample positions rather than the strings themselves: handing the
        # list of strings to NumPy first converts it into one fixed-width
        # unicode array, which is very costly for millions of utterances.
        chosen = np.random.choice(len(texts), num_samples, replace=False)
        texts = [texts[i] for i in chosen]
        print(f"Randomly selected {num_samples} texts for training.")

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    input_ids, attention_masks = preprocess_texts(texts, tokenizer)
    if input_ids is None:  # Check if tokenization failed
        return

    # Splitting the data into training and validation sets
    input_ids_train, input_ids_val, attention_masks_train, attention_masks_val = train_test_split(
        input_ids, attention_masks, test_size=0.1)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BertAutoencoder('bert-base-uncased').to(device)
    # The token ids serve as both the input and the reconstruction target.
    train_data = TensorDataset(input_ids_train, input_ids_train, attention_masks_train)
    val_data = TensorDataset(input_ids_val, input_ids_val, attention_masks_val)
    train_loader = DataLoader(train_data, batch_size=512, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=512, shuffle=False)

    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    # Padding positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    epochs = 15  # Set the number of epochs
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []

    for epoch in range(epochs):
        avg_train_loss, avg_train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        train_losses.append(avg_train_loss)
        train_accs.append(avg_train_acc)

        avg_val_loss, avg_val_acc = _run_epoch(model, val_loader, criterion, device)
        val_losses.append(avg_val_loss)
        val_accs.append(avg_val_acc)

        print(f'Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}, Train Acc = {avg_train_acc:.4f}, Val Acc = {avg_val_acc:.4f}')

    # Save the model
    torch.save(model.state_dict(), 'trained_autoencoder.pth')

    _plot_history(train_losses, val_losses, train_accs, val_accs)

if __name__ == "__main__":
    # Root of the JSON corpus; adjust to your environment.
    folder_path = '/workspace/slice-monorepo/cl_cr3/aligneddata'
    # Longest utterance, in characters, that is kept.
    max_text_length = 150
    # Upper bound on how many utterances are used for training.
    num_samples = 2000000
    main(
        folder_path=folder_path,
        max_text_length=max_text_length,
        num_samples=num_samples,
    )


Processed 1141 files and extracted 2674582 utterances suitable within length 150.
Randomly selected 2000000 texts for training.


In [None]:
!pip install \
--extra-index-url=https://pypi.nvidia.com \
cudf-cu12==24.6.* \
dask-cudf-cu12==24.6.* \
cuml-cu12==24.6.* \
cugraph-cu12==24.6.*

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from cuml.manifold import UMAP as cumlUMAP  # Import GPU UMAP from cuml
import matplotlib.pyplot as plt
import numpy as np

class BertAutoencoder(nn.Module):
    """Autoencoder definition used here only to obtain BERT hidden states.

    The constructor creates the same sub-modules, in the same order, as the
    training definition so that a saved state dict can be loaded; this
    variant exposes ``encode`` and defines no ``forward``.

    Args:
        bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
        lstm_units: Hidden size of the LSTM decoder.
        max_length: Kept as an attribute only; unused inside this class.
    """

    def __init__(self, bert_model_name, lstm_units=256, max_length=512):
        super().__init__()
        bert = BertModel.from_pretrained(bert_model_name)
        self.bert = bert
        # Extra handle on the encoder stack already contained in self.bert.
        self.encoder = bert.encoder
        self.decoder = nn.LSTM(bert.config.hidden_size, lstm_units, batch_first=True)
        self.output_layer = nn.Linear(lstm_units, bert.config.vocab_size)
        self.max_length = max_length

    def encode(self, input_ids, attention_mask):
        """Return BERT's ``last_hidden_state`` for the inputs, without gradients."""
        with torch.no_grad():
            return self.bert(
                input_ids=input_ids, attention_mask=attention_mask
            ).last_hidden_state

def load_model(model_path):
    """Build a ``BertAutoencoder`` and restore its weights from ``model_path``.

    The model is created for ``'bert-base-uncased'`` and placed on CUDA when
    available, otherwise on the CPU; the saved tensors are mapped to that
    same device while loading.

    Args:
        model_path: Path of a state dict written with ``torch.save``.

    Returns:
        The model with the loaded weights (its train/eval mode is not changed).
    """
    if torch.cuda.is_available():
        target = torch.device('cuda')
    else:
        target = torch.device('cpu')

    autoencoder = BertAutoencoder('bert-base-uncased').to(target)
    weights = torch.load(model_path, map_location=target)
    autoencoder.load_state_dict(weights)
    return autoencoder

def visualize_embeddings(embeddings, title='Embeddings'):
    """Scatter-plot the first two columns of ``embeddings``.

    Args:
        embeddings: 2-D array-like indexed as ``embeddings[:, 0]`` and
            ``embeddings[:, 1]``; column 0 is drawn on the x axis and
            column 1 on the y axis.
        title: Title of the figure.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(10, 7))
    plt.scatter(embeddings[:, 0], embeddings[:, 1], c='blue', alpha=0.5)
    plt.title(title)
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    # No colorbar: every point is drawn in the same fixed colour, so there
    # is no scalar data a colour scale could represent.
    plt.show()

def main():
    """Embed a few sample sentences with the trained model and plot them in 2-D.

    The sentences are tokenized, passed through the saved autoencoder's
    ``encode`` method, reduced to the hidden state at position 0 of each
    sequence, projected with GPU UMAP and shown as a scatter plot.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    texts = ["This is a sample text.", "Another example text.", "Deep learning is fun!", "Transformers are powerful."]
    # Tokenize once and read both tensors from the same encoding.
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    model = load_model('trained_autoencoder.pth')
    model.eval()

    # load_model may have placed the model on the GPU; the inputs have to
    # live on the same device as the weights.
    device = next(model.parameters()).device
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    embeddings = model.encode(input_ids, attention_mask)
    embeddings = embeddings[:, 0, :].cpu().numpy()  # Taking the embedding of the [CLS] token

    # Dimensionality reduction with GPU UMAP.
    # A point cannot have more neighbours than there are other points, so
    # the usual 15 is capped for this tiny sample.
    # NOTE(review): confirm how cuML treats n_neighbors >= n_samples.
    n_neighbors = max(2, min(15, len(texts) - 1))
    reducer = cumlUMAP(n_neighbors=n_neighbors, min_dist=0.1, metric='euclidean')
    umap_embeddings = reducer.fit_transform(embeddings)

    # Visualize UMAP embeddings
    visualize_embeddings(umap_embeddings, 'UMAP Embeddings')

# Run the embedding visualization only when this code is executed directly
# (as a script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()
