In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from datasets import load_dataset

# Fetch the raw WikiText-2 corpus through the Hugging Face `datasets` library
print("Loading dataset...")
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")

# Tokenizer used below to turn raw text into integer token ids
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

class TextDataset(Dataset):
    """Sliding-window next-token dataset built from raw text strings.

    Each text is tokenized on its own with
    ``tokenizer.encode(text, add_special_tokens=False)``. Every run of
    ``sequence_length`` consecutive token ids becomes one input, and the token
    id immediately after that run becomes its target. Windows therefore never
    span two texts, and a text with ``sequence_length`` tokens or fewer
    contributes no samples.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            and returning a list of integer token ids.
        sequence_length: Number of context tokens per sample; must be >= 1.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """

    def __init__(self, texts, tokenizer, sequence_length):
        # A length of 0 would yield empty inputs and a negative length would
        # make the slices below wrap around, so reject both up front.
        if sequence_length < 1:
            raise ValueError(
                f"sequence_length must be a positive integer, got {sequence_length!r}"
            )

        self.tokenizer = tokenizer
        self.sequence_length = sequence_length
        self.inputs = []   # list of token-id windows, each of length sequence_length
        self.targets = []  # token id that follows the window at the same index

        for text in texts:
            tokens = tokenizer.encode(text, add_special_tokens=False)
            # range() is empty when the text is too short, so short or blank
            # texts are skipped without special handling.
            for i in range(len(tokens) - sequence_length):
                self.inputs.append(tokens[i:i + sequence_length])
                self.targets.append(tokens[i + sequence_length])

    def __len__(self):
        """Return the number of (window, next-token) samples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(input_ids, target_id)`` integer tensors."""
        # Explicit dtype keeps the ids usable as embedding indices and class
        # labels regardless of how the tokenizer represents its integers.
        return (
            torch.tensor(self.inputs[idx], dtype=torch.long),
            torch.tensor(self.targets[idx], dtype=torch.long),
        )

# Build the training windows and a shuffled mini-batch loader over them
sequence_length = 10
train_texts = dataset["train"]["text"]
train_dataset = TextDataset(
    texts=train_texts,
    tokenizer=tokenizer,
    sequence_length=sequence_length,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

# Define the LSTM Model
class LSTMModel(nn.Module):
    """Embedding -> stacked LSTM -> linear head that scores the next token.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits.
        embedding_dim: Width of each token embedding.
        hidden_dim: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return vocabulary logits computed from the last time step of ``x``.

        ``x`` holds token ids; the LSTM is batch-first, so the second axis of
        its output is the time axis and only its final position is projected.
        """
        hidden_states, _state = self.lstm(self.embedding(x))
        final_step = hidden_states[:, -1, :]
        return self.fc(final_step)

# Model hyperparameters
vocab_size = tokenizer.vocab_size
embedding_dim = 128
hidden_dim = 256
num_layers = 2

# Seed PyTorch's global RNG before the model is built so weight
# initialisation (and later draws from the global generator) repeat across
# Restart & Run All. NOTE: some GPU kernels may still be non-deterministic.
torch.manual_seed(42)

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)

# Define Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, dataloader, criterion, optimizer, epochs):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    After every epoch the mean per-batch loss is printed as
    ``Epoch <n>, Loss: <mean>``.

    Args:
        model: ``nn.Module`` to optimise; it is switched to training mode.
        dataloader: Iterable yielding ``(inputs, targets)`` tensor pairs. It
            does not need to support ``len()``.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar
            loss tensor.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``dataloader``.

    Returns:
        None.
    """
    # Send batches to wherever the model actually lives instead of guessing
    # from CUDA availability; a CPU model on a CUDA machine would otherwise
    # hit a device mismatch. Fall back to the guess for parameter-less models.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0  # counted manually so loaders without __len__ work
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # An empty loader has no meaningful mean; report NaN instead of
        # raising ZeroDivisionError.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch + 1}, Loss: {mean_loss:.4f}")

# Kick off training
print("Training the model...")
train_model(
    model=model,
    dataloader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=5,
)

# Prediction Function
def predict_next_word(model, tokenizer, text, sequence_length):
    """Predict the token most likely to follow ``text``.

    ``text`` is encoded without special tokens, only its last
    ``sequence_length`` token ids are kept, and the model's highest-scoring
    output index is decoded back to a string. The model is left in eval mode.

    Args:
        model: ``nn.Module`` mapping a ``(1, n)`` tensor of token ids to
            logits with the vocabulary on dim 1.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            and ``decode(list_of_ids)``.
        text: Prompt to continue.
        sequence_length: Maximum number of trailing tokens fed to the model.

    Returns:
        The decoded string for the arg-max token id.

    Raises:
        ValueError: If ``text`` encodes to zero tokens.
    """
    tokens = tokenizer.encode(text, add_special_tokens=False)
    if not tokens:
        # An empty id list would become a float tensor of shape (1, 0) and
        # fail deep inside the model; fail early with a clear message.
        raise ValueError("text produced no tokens; cannot predict a next word")
    tokens = tokens[-sequence_length:]

    # Run on the device the model already occupies; fall back to the CUDA
    # availability guess only for parameter-less models.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()
    # unsqueeze(0) adds the batch dimension the model expects.
    input_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(input_tensor)

    predicted_index = torch.argmax(output, dim=1).item()
    return tokenizer.decode([predicted_index])

# Try the trained model on a hand-written prompt
input_sequence = "The history of artificial intelligence"
predicted_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_sequence,
    sequence_length=sequence_length,
)
print(f"Input: {input_sequence}")
print(f"Predicted next word: {predicted_word}")


Loading dataset...


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (645 > 512). Running this sequence through the model will result in indexing errors


Training the model...
Epoch 1, Loss: 6.1746
Epoch 2, Loss: 5.5581
Epoch 3, Loss: 5.2791
Epoch 4, Loss: 5.0909


In [2]:
!pip install torchtext

Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0
