In [2]:
!pip install -q scikit-learn tensorflow

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [None]:
/workspace/slice-monorepo/thebeast/data/combined

In [3]:
import os
import json
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

# Load data from JSON files in the specified folder
def load_data(folder_path):
    texts = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.json'):
            with open(os.path.join(folder_path, file_name), 'r') as file:
                data = json.load(file)
                if isinstance(data, dict) and 'response_content' in data:
                    texts.append(data['response_content'])
                elif isinstance(data, list):
                    for item in data:
                        if 'response_content' in item:
                            texts.append(item['response_content'])
    return texts

# Tokenize and pad the text data
def preprocess_texts(texts, tokenizer, max_length=512):
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return padded_sequences

class TextTokenizer:
    def __init__(self, oov_token="<OOV>"):
        self.oov_token = oov_token
        self.word_index = {oov_token: 1}
        self.index_word = {1: oov_token}
        self.word_counts = {}
        self.num_words = 2

    def fit_on_texts(self, texts):
        for text in texts:
            for word in text.split():
                if word not in self.word_counts:
                    self.word_counts[word] = 1
                else:
                    self.word_counts[word] += 1

        sorted_words = sorted(self.word_counts.items(), key=lambda item: item[1], reverse=True)
        for word, _ in sorted_words:
            self.word_index[word] = self.num_words
            self.index_word[self.num_words] = word
            self.num_words += 1

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            sequences.append([self.word_index.get(word, self.word_index[self.oov_token]) for word in text.split()])
        return sequences

    def sequences_to_texts(self, sequences):
        texts = []
        for seq in sequences:
            texts.append(' '.join([self.index_word.get(idx, self.oov_token) for idx in seq]))
        return texts

# Padding function
def pad_sequences(sequences, maxlen, padding='post', truncating='post'):
    padded_sequences = np.zeros((len(sequences), maxlen), dtype=int)
    for i, seq in enumerate(sequences):
        if len(seq) > maxlen:
            if truncating == 'pre':
                seq = seq[-maxlen:]
            elif truncating == 'post':
                seq = seq[:maxlen]
        if padding == 'pre':
            padded_sequences[i, -len(seq):] = seq
        elif padding == 'post':
            padded_sequences[i, :len(seq)] = seq
    return padded_sequences

# Define the Autoencoder model
class TextAutoencoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, lstm_units=256, max_length=512):
        super(TextAutoencoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.encoder = nn.LSTM(embedding_dim, lstm_units, batch_first=True)
        self.decoder = nn.LSTM(lstm_units, embedding_dim, batch_first=True)
        self.output_layer = nn.Linear(embedding_dim, vocab_size)
        self.max_length = max_length

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.encoder(embedded)
        repeated_hidden = hidden.repeat(self.max_length, 1, 1).permute(1, 0, 2)
        decoded, _ = self.decoder(repeated_hidden)
        output = self.output_layer(decoded)
        return output

# Progressive data increment method
def progressive_training(model, data, tokenizer, vocab_size, initial_size, increment_ratio, max_epochs, device):
    current_size = initial_size
    total_size = len(data)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    metrics = []

    for epoch in range(max_epochs):
        if current_size > total_size:
            break
        current_data = data[:current_size]
        x_train, x_test, y_train, y_test = train_test_split(current_data, current_data, test_size=0.1)

        x_train = torch.tensor(x_train, dtype=torch.long).to(device)
        y_train = torch.tensor(y_train, dtype=torch.long).to(device)
        x_test = torch.tensor(x_test, dtype=torch.long).to(device)
        y_test = torch.tensor(y_test, dtype=torch.long).to(device)

        train_dataset = TensorDataset(x_train, y_train)
        test_dataset = TensorDataset(x_test, y_test)

        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

        model.train()
        train_loss, correct, total = 0, 0, 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            _, predicted = outputs.max(2)
            total += targets.size(0) * targets.size(1)
            correct += predicted.eq(targets).sum().item()

        train_accuracy = 100. * correct / total

        model.eval()
        val_loss, correct, total = 0, 0, 0
        with torch.no_grad():
            for inputs, targets in test_loader:
                outputs = model(inputs)
                loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
                val_loss += loss.item()
                _, predicted = outputs.max(2)
                total += targets.size(0) * targets.size(1)
                correct += predicted.eq(targets).sum().item()

        val_accuracy = 100. * correct / total
        metrics.append((current_size, train_accuracy, val_accuracy))
        current_size += int(current_size * increment_ratio)

        print(f'Epoch {epoch+1}/{max_epochs} completed with {current_size} samples. Train Acc: {train_accuracy:.2f}%, Val Acc: {val_accuracy:.2f}%')
    
    return metrics

# Main function
def main(folder_path):
    texts = load_data(folder_path)
    tokenizer = TextTokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = preprocess_texts(texts, tokenizer)
    vocab_size = len(tokenizer.word_index) + 1

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TextAutoencoder(vocab_size, embedding_dim=128, lstm_units=256, max_length=512).to(device)

    initial_size = 100
    increment_ratio = 0.01
    max_epochs = 5000

    metrics = progressive_training(model, sequences, tokenizer, vocab_size, initial_size, increment_ratio, max_epochs, device)

    # Save the metrics to a file
    with open('training_metrics.json', 'w') as f:
        json.dump(metrics, f)

if __name__ == "__main__":
    folder_path = '/workspace/slice-monorepo/thebeast/data/combined'  # Replace with the path to your folder containing JSON files
    main(folder_path)


Epoch 1/5000 completed with 101 samples. Train Acc: 0.00%, Val Acc: 0.02%
Epoch 2/5000 completed with 102 samples. Train Acc: 0.02%, Val Acc: 2.34%
Epoch 3/5000 completed with 103 samples. Train Acc: 3.02%, Val Acc: 2.77%
Epoch 4/5000 completed with 104 samples. Train Acc: 3.04%, Val Acc: 0.00%
Epoch 5/5000 completed with 105 samples. Train Acc: 0.05%, Val Acc: 0.04%
Epoch 6/5000 completed with 106 samples. Train Acc: 0.09%, Val Acc: 0.12%
Epoch 7/5000 completed with 107 samples. Train Acc: 0.09%, Val Acc: 0.11%
Epoch 8/5000 completed with 108 samples. Train Acc: 1.05%, Val Acc: 3.68%
Epoch 9/5000 completed with 109 samples. Train Acc: 3.28%, Val Acc: 4.00%
Epoch 10/5000 completed with 110 samples. Train Acc: 3.35%, Val Acc: 3.43%
Epoch 11/5000 completed with 111 samples. Train Acc: 3.34%, Val Acc: 3.55%
Epoch 12/5000 completed with 112 samples. Train Acc: 3.45%, Val Acc: 2.57%
Epoch 13/5000 completed with 113 samples. Train Acc: 3.38%, Val Acc: 3.22%
Epoch 14/5000 completed with 114 s

KeyboardInterrupt: 