In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import re
from torch.utils.data import Dataset, DataLoader
import os
from collections import Counter
import matplotlib.pyplot as plt
from tqdm import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training configuration read by train_model() and the model below.
BATCH_SIZE = 32        # samples per DataLoader batch
SEQ_LENGTH = 50        # tokens per input window (targets are the window shifted by one)
EMBEDDING_DIM = 256    # size of each token embedding vector
HIDDEN_DIM = 512       # LSTM hidden state size
NUM_LAYERS = 2         # number of stacked LSTM layers
DROPOUT = 0.3          # dropout probability (inside the LSTM and before the output layer)
LEARNING_RATE = 0.001  # Adam learning rate
NUM_EPOCHS = 10        # full passes over the dataset
GRAD_CLIP = 5.0        # max gradient norm passed to clip_grad_norm_
MIN_FREQ = 5           # minimum token count for a token to enter the vocabulary

class TextDataset(Dataset):
    """Sliding-window next-token dataset over whitespace-separated text.

    Sample ``i`` is the pair ``(x, y)`` where ``x`` holds the token indices
    ``[i, i + seq_length)`` and ``y`` holds the same window shifted right by
    one token, so ``y[t]`` is the token that follows ``x[t]``.

    Args:
        text: Text that is split on whitespace into tokens.
        seq_length: Number of tokens in every input/target window.
        token_to_idx: Mapping from token to integer index. It must contain
            the key ``'<UNK>'``, which is used for tokens missing from it.

    Raises:
        KeyError: If ``token_to_idx`` has no ``'<UNK>'`` entry.
    """

    def __init__(self, text, seq_length, token_to_idx):
        self.tokens = text.split()
        # Look the fallback index up once instead of once per token.
        unk_idx = token_to_idx['<UNK>']
        self.indices = [token_to_idx.get(token, unk_idx) for token in self.tokens]
        self.seq_length = seq_length

    def __len__(self):
        # A text with no more than seq_length tokens yields no complete
        # (input, target) pair; len() must never return a negative number.
        return max(0, len(self.indices) - self.seq_length)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` LongTensor pair for window ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        # Raising IndexError lets plain iteration stop at the end and keeps
        # out-of-range indices from returning truncated windows.
        if not 0 <= idx < size:
            raise IndexError('TextDataset index out of range')
        window = self.indices[idx:idx + self.seq_length + 1]
        x = torch.tensor(window[:-1], dtype=torch.long)
        y = torch.tensor(window[1:], dtype=torch.long)
        return x, y

class TextPreprocessor:
    """Normalize raw text and build a frequency-filtered token vocabulary."""

    # Reserved entries that always occupy indices 0-3 of the vocabulary.
    SPECIAL_TOKENS = ('<PAD>', '<UNK>', '<SOS>', '<EOS>')

    # Compiled once at class creation; both patterns are applied to the whole text.
    _DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9\s.,!?]')
    _WHITESPACE_RUN = re.compile(r'\s+')

    def clean_text(self, text):
        """Lowercase ``text``, drop disallowed characters, collapse whitespace.

        Only letters, digits, whitespace and the punctuation ``. , ! ?`` are
        kept. Every run of whitespace becomes a single space and the result
        is stripped at both ends.
        """
        text = text.lower()
        text = self._DISALLOWED_CHARS.sub('', text)
        text = self._WHITESPACE_RUN.sub(' ', text)
        return text.strip()

    def build_vocab(self, text, min_freq=5):
        """Build token/index lookup tables from whitespace-separated ``text``.

        Args:
            text: Text that is split on whitespace into tokens.
            min_freq: Minimum number of occurrences a token needs to be
                included in the vocabulary.

        Returns:
            ``(token_to_idx, idx_to_token)``. The special tokens come first,
            followed by the qualifying tokens in order of first appearance.
            Indices are contiguous from 0 to ``len(token_to_idx) - 1``.
        """
        token_counts = Counter(text.split())
        vocab = list(self.SPECIAL_TOKENS)
        reserved = set(vocab)
        # A corpus token spelled like a special token must not be appended
        # again: the duplicate would leave a gap in the index range, making
        # len(token_to_idx) smaller than the largest index + 1.
        vocab += [
            token
            for token, count in token_counts.items()
            if count >= min_freq and token not in reserved
        ]
        token_to_idx = {token: idx for idx, token in enumerate(vocab)}
        idx_to_token = {idx: token for token, idx in token_to_idx.items()}
        return token_to_idx, idx_to_token

class LSTMLanguageModel(nn.Module):
    """Embedding -> multi-layer LSTM -> dropout -> linear projection to vocabulary logits.

    Args:
        vocab_size: Number of entries in the vocabulary (embedding rows and
            output logits).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used between LSTM layers and on the
            LSTM output before the final projection.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Compute next-token logits for a batch of index sequences.

        Args:
            x: Long tensor of token indices, batch dimension first.
            hidden: Optional ``(h, c)`` LSTM state; ``None`` is passed through
                to ``nn.LSTM``, which then starts from a zero state.

        Returns:
            ``(logits, hidden)`` where ``logits`` has one ``vocab_size``-sized
            vector per input position and ``hidden`` is the final LSTM state.
        """
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        lstm_out = self.dropout(lstm_out)
        output = self.fc(lstm_out)
        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zero ``(h, c)`` state for ``batch_size`` sequences.

        The shape, dtype and device come from this model instance rather than
        from module-level constants, so the state always matches the model
        however it was constructed or moved.
        """
        reference = next(self.parameters())
        shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        return (torch.zeros(shape, dtype=reference.dtype, device=reference.device),
                torch.zeros(shape, dtype=reference.dtype, device=reference.device))

def train_model():
    """Train the LSTM language model on the corpus file and save the results.

    Steps: read ``synthetic_text_100MB.txt``, clean it, build the vocabulary,
    train for ``NUM_EPOCHS`` epochs with Adam and gradient-norm clipping,
    write the checkpoint to ``language_model_minimal.pth`` and the per-epoch
    loss curve to ``loss_curve_minimal.png``.

    Raises:
        ValueError: If the cleaned corpus has no more than ``SEQ_LENGTH``
            tokens, i.e. not even one training window can be formed.
    """
    with open('synthetic_text_100MB.txt', 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
    preprocessor = TextPreprocessor()
    cleaned_text = preprocessor.clean_text(text)
    token_to_idx, idx_to_token = preprocessor.build_vocab(cleaned_text, MIN_FREQ)
    vocab_size = len(token_to_idx)

    dataset = TextDataset(cleaned_text, SEQ_LENGTH, token_to_idx)
    if len(dataset.indices) <= SEQ_LENGTH:
        raise ValueError(
            f'Corpus has {len(dataset.indices)} tokens; more than {SEQ_LENGTH} are needed to train.'
        )
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    model = LSTMLanguageModel(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, DROPOUT).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=token_to_idx['<PAD>'])
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    losses = []

    for epoch in range(NUM_EPOCHS):
        model.train()
        total_loss = 0.0
        pb = tqdm(dataloader, desc=f'Epoch {epoch + 1}/{NUM_EPOCHS}')
        for data, targets in pb:
            data, targets = data.to(device), targets.to(device)
            optimizer.zero_grad()
            # Batches are shuffled, independent windows, so every batch starts
            # from a zero LSTM state (hidden=None). Carrying a state sized for
            # BATCH_SIZE across batches would also fail on a final batch that
            # holds fewer than BATCH_SIZE samples.
            output, _ = model(data)
            loss = criterion(output.reshape(-1, vocab_size), targets.reshape(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
            optimizer.step()
            batch_loss = loss.item()
            total_loss += batch_loss
            pb.set_postfix({'loss': f'{batch_loss:.4f}'})
        avg_loss = total_loss / len(dataloader)
        losses.append(avg_loss)
        print(f'Epoch [{epoch+1}/{NUM_EPOCHS}] Average Loss: {avg_loss:.4f}')

    # The checkpoint stores the vocabulary and hyperparameters alongside the
    # weights so the model can be rebuilt without this training code's globals.
    torch.save({
        'model_state_dict': model.state_dict(),
        'token_to_idx': token_to_idx,
        'idx_to_token': idx_to_token,
        'vocab_size': vocab_size,
        'hyperparameters': {
            'embedding_dim': EMBEDDING_DIM,
            'hidden_dim': HIDDEN_DIM,
            'num_layers': NUM_LAYERS,
            'dropout': DROPOUT
        }
    }, 'language_model_minimal.pth')

    plt.plot(losses)
    plt.title('Training Loss Curve')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.savefig('loss_curve_minimal.png')
    plt.show()

# Start training only when this file is executed as the main program, not when imported.
if __name__ == "__main__":
    train_model()


Epoch 1/10:  24%|██▍       | 116961/486806 [42:14<2:11:37, 46.83it/s, loss=0.1938]

In [None]:
import torch
import torch.nn as nn

# Model configuration constants. generate_text() below rebuilds the model from
# the hyperparameters stored in the checkpoint, so the code visible in this
# cell does not read these names.
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 2
DROPOUT = 0.3

class LSTMLanguageModel(nn.Module):
    """Token-level language model: embedding, stacked LSTM, dropout, linear head.

    The submodule attribute names (``embedding``, ``lstm``, ``dropout``,
    ``fc``) determine the ``state_dict`` keys and therefore have to match the
    names used by the model that produced the checkpoint being loaded.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for token indices ``x`` (batch first).

        ``hidden`` is an optional ``(h, c)`` LSTM state that is handed to the
        LSTM unchanged; the state produced by the LSTM is returned next to the
        per-position vocabulary logits.
        """
        sequence_features, next_hidden = self.lstm(self.embedding(x), hidden)
        logits = self.fc(self.dropout(sequence_features))
        return logits, next_hidden

def generate_text(prompt, max_length=100, temperature=0.8, top_k=50,
                  checkpoint_path='language_model_minimal.pth'):
    """Generate a continuation of ``prompt`` with the saved language model.

    Args:
        prompt: Seed text; it is lowercased and split on whitespace. Tokens
            that are not in the checkpoint vocabulary map to ``'<UNK>'``.
        max_length: Number of tokens to sample after the prompt.
        temperature: Positive divisor applied to the logits before sampling;
            smaller values make sampling more deterministic.
        top_k: If greater than 0, sample only among the ``top_k`` highest
            scoring tokens (capped at the vocabulary size). Values ``<= 0``
            disable the filter.
        checkpoint_path: Path of the checkpoint file to load.

    Returns:
        The prompt tokens followed by the sampled tokens, joined by spaces.

    Raises:
        ValueError: If ``prompt`` contains no tokens or ``temperature`` is
            not positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')
    # NOTE: only lowercasing is applied here; the prompt is not passed through
    # any further character filtering before the vocabulary lookup.
    tokens = prompt.lower().split()
    if not tokens:
        raise ValueError('prompt must contain at least one token')

    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    token_to_idx = checkpoint['token_to_idx']
    idx_to_token = checkpoint['idx_to_token']
    vocab_size = checkpoint['vocab_size']
    hp = checkpoint['hyperparameters']

    model = LSTMLanguageModel(vocab_size, hp['embedding_dim'], hp['hidden_dim'], hp['num_layers'], hp['dropout'])
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    unk_idx = token_to_idx['<UNK>']
    input_indices = [token_to_idx.get(token, unk_idx) for token in tokens]
    hidden = None
    generated_tokens = input_indices.copy()

    with torch.no_grad():
        # Prime the state with every prompt token except the last one. The
        # last token is the first input of the sampling loop below, so
        # feeding it here as well would show it to the model twice.
        prefix = input_indices[:-1]
        if prefix:
            _, hidden = model(torch.tensor([prefix], dtype=torch.long), hidden)
        current_token = input_indices[-1]
        for _ in range(max_length):
            input_tensor = torch.tensor([[current_token]], dtype=torch.long)
            output, hidden = model(input_tensor, hidden)
            logits = output[0, -1] / temperature
            if top_k > 0:
                # torch.topk raises when k exceeds the number of elements.
                k = min(top_k, logits.size(-1))
                top_k_values, _ = torch.topk(logits, k)
                # Values are sorted descending, so the last one is the cutoff.
                logits[logits < top_k_values[-1]] = -float('Inf')
            probabilities = torch.softmax(logits, dim=-1)
            current_token = torch.multinomial(probabilities, 1).item()
            generated_tokens.append(current_token)
    generated_text = ' '.join([idx_to_token.get(idx, '<UNK>') for idx in generated_tokens])
    return generated_text

if __name__ == "__main__":
    # Demo: print a sampled continuation for each of a few fixed prompts.
    demo_prompts = (
        "Artificial Intelligence will",
        "The future of technology is",
        "Machine learning algorithms can",
    )
    for seed_text in demo_prompts:
        print(f"\nPrompt: {seed_text}")
        completion = generate_text(seed_text, 60, 0.8, 30)
        print(f"Generated: {completion}")
