In [None]:
# !pip install --upgrade datasets fsspec
from datasets import load_dataset

# Load the "wikitext-2-raw-v1" configuration of the WikiText dataset.
ds = load_dataset(path="Salesforce/wikitext", name="wikitext-2-raw-v1")

In [None]:
from collections import Counter

import torch
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader

# Load dataset. Use the same namespaced repository id ("Salesforce/wikitext")
# as the first cell so every cell refers to the dataset by one identifier.
ds = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")

# Build vocabulary from training set: join all non-blank lines, split on
# whitespace and count how often every word occurs.
train_text = " ".join([text for text in ds["train"]["text"] if text.strip() != ""])
words = train_text.split()
word_freq = Counter(words)

# Keep the most frequent words; two slots are reserved for the special tokens,
# which therefore receive ids 0 (<PAD>) and 1 (<UNK>).
vocab_size = 10000
vocab = ["<PAD>", "<UNK>"] + [word for word, _ in word_freq.most_common(vocab_size - 2)]
word_to_index = {word: idx for idx, word in enumerate(vocab)}

pad_token_id = word_to_index["<PAD>"]
unk_token_id = word_to_index["<UNK>"]

# Tokenization function
def tokenize(text):
    """Convert a string into a list of vocabulary ids.

    The text is split on whitespace; every word is looked up in
    ``word_to_index`` and words that are not in the vocabulary are mapped to
    ``unk_token_id``.
    """
    token_ids = []
    for word in text.split():
        token_ids.append(word_to_index.get(word, unk_token_id))
    return token_ids

# Preprocess a dataset split into sequences
def preprocess_split(split, seq_len=128):
    """Tokenize a dataset split and cut it into fixed-length sequences.

    All non-blank entries of ``split["text"]`` are tokenized with ``tokenize``
    and concatenated into one token stream, which is then sliced into
    consecutive, non-overlapping chunks of ``seq_len`` tokens.

    Args:
        split: Object whose ``"text"`` entry is an iterable of strings.
        seq_len: Length of every returned sequence. Defaults to 128, the value
            that was previously hard-coded.

    Returns:
        A list of token-id lists, each exactly ``seq_len`` long. The final
        chunk is right-padded with ``pad_token_id``. An empty list is returned
        when the split contains no tokens.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be a positive integer, got {seq_len}")

    all_tokens = [
        token
        for text in split["text"]
        if text.strip() != ""
        for token in tokenize(text)
    ]
    sequences = [all_tokens[i:i + seq_len] for i in range(0, len(all_tokens), seq_len)]

    # Pad the last sequence. The `sequences and` guard covers a split without
    # any tokens, where indexing [-1] would raise IndexError.
    if sequences and len(sequences[-1]) < seq_len:
        sequences[-1] += [pad_token_id] * (seq_len - len(sequences[-1]))
    return sequences

# Turn each of the three dataset splits into fixed-length token sequences
train_sequences, val_sequences, test_sequences = (
    preprocess_split(ds[split_name]) for split_name in ("train", "validation", "test")
)

# Custom dataset class
class WikiTextDataset(Dataset):
    """Map-style dataset that serves pre-tokenized sequences as tensors."""

    def __init__(self, sequences):
        """Store ``sequences``, an indexable collection of token-id lists."""
        self.sequences = sequences

    def __len__(self):
        """Return how many sequences the dataset holds."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the sequence at position ``idx`` as a ``torch.long`` tensor."""
        token_ids = self.sequences[idx]
        return torch.tensor(token_ids, dtype=torch.long)

# Wrap every split in a dataset, then in a batching loader
batch_size = 32
train_dataset, val_dataset, test_dataset = (
    WikiTextDataset(split_sequences)
    for split_sequences in (train_sequences, val_sequences, test_sequences)
)

# Only the training loader shuffles; validation and test keep their order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# !pip install --upgrade datasets fsspec
from datasets import load_dataset
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import math

# Load dataset. Use the same namespaced repository id ("Salesforce/wikitext")
# as the first cell so every cell refers to the dataset by one identifier.
ds = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")

# Build vocabulary from training set: join all non-blank lines, split on
# whitespace and count how often every word occurs.
train_text = " ".join([text for text in ds["train"]["text"] if text.strip() != ""])
words = train_text.split()
word_freq = Counter(words)

# Keep the most frequent words; two slots are reserved for the special tokens,
# which therefore receive ids 0 (<PAD>) and 1 (<UNK>).
vocab_size = 10000
vocab = ["<PAD>", "<UNK>"] + [word for word, _ in word_freq.most_common(vocab_size - 2)]
word_to_index = {word: idx for idx, word in enumerate(vocab)}

pad_token_id = word_to_index["<PAD>"]
unk_token_id = word_to_index["<UNK>"]

# Tokenization function
def tokenize(text):
    """Convert a string into a list of vocabulary ids.

    The text is split on whitespace; every word is looked up in
    ``word_to_index`` and words that are not in the vocabulary are mapped to
    ``unk_token_id``.
    """
    token_ids = []
    for word in text.split():
        token_ids.append(word_to_index.get(word, unk_token_id))
    return token_ids

# Preprocess a dataset split into sequences
def preprocess_split(split, seq_len=128):
    """Tokenize a dataset split and cut it into fixed-length sequences.

    All non-blank entries of ``split["text"]`` are tokenized with ``tokenize``
    and concatenated into one token stream, which is then sliced into
    consecutive, non-overlapping chunks of ``seq_len`` tokens.

    Args:
        split: Object whose ``"text"`` entry is an iterable of strings.
        seq_len: Length of every returned sequence. Defaults to 128, the value
            that was previously hard-coded.

    Returns:
        A list of token-id lists, each exactly ``seq_len`` long. The final
        chunk is right-padded with ``pad_token_id``. An empty list is returned
        when the split contains no tokens.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be a positive integer, got {seq_len}")

    all_tokens = [
        token
        for text in split["text"]
        if text.strip() != ""
        for token in tokenize(text)
    ]
    sequences = [all_tokens[i:i + seq_len] for i in range(0, len(all_tokens), seq_len)]

    # Pad the last sequence. The `sequences and` guard covers a split without
    # any tokens, where indexing [-1] would raise IndexError.
    if sequences and len(sequences[-1]) < seq_len:
        sequences[-1] += [pad_token_id] * (seq_len - len(sequences[-1]))
    return sequences

# Turn each of the three dataset splits into fixed-length token sequences
train_sequences, val_sequences, test_sequences = (
    preprocess_split(ds[split_name]) for split_name in ("train", "validation", "test")
)

# Custom dataset class
class WikiTextDataset(Dataset):
    """Map-style dataset that serves pre-tokenized sequences as tensors."""

    def __init__(self, sequences):
        """Store ``sequences``, an indexable collection of token-id lists."""
        self.sequences = sequences

    def __len__(self):
        """Return how many sequences the dataset holds."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the sequence at position ``idx`` as a ``torch.long`` tensor."""
        token_ids = self.sequences[idx]
        return torch.tensor(token_ids, dtype=torch.long)

# Wrap every split in a dataset, then in a batching loader
batch_size = 32
train_dataset, val_dataset, test_dataset = (
    WikiTextDataset(split_sequences)
    for split_sequences in (train_sequences, val_sequences, test_sequences)
)

# Only the training loader shuffles; validation and test keep their order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Define PositionalEncoding class
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A table of shape ``(1, max_seq_len, d_model)`` is precomputed and stored as
    the non-trainable buffer ``pe``. Even feature columns hold sines and odd
    feature columns hold cosines of ``position * div_term``.

    Args:
        d_model: Size of the feature dimension. Both even and odd sizes work.
        max_seq_len: Number of positions in the precomputed table.
    """

    def __init__(self, d_model, max_seq_len):
        super().__init__()
        pe = torch.zeros(max_seq_len, d_model)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one cosine column fewer than sine columns,
        # so trim the angles to the number of odd-indexed columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence position and whose last
                dimension has size ``d_model``.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]

# Define MultiHeadAttention class
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with an additive mask.

    Args:
        d_model: Feature size of the input and the output; must be divisible
            by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        # Query, key and value projections, followed by the output projection
        self.WQ = nn.Linear(d_model, d_model)
        self.WK = nn.Linear(d_model, d_model)
        self.WV = nn.Linear(d_model, d_model)
        self.WO = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        n_batch, n_tokens, _ = projected.size()
        return projected.view(n_batch, n_tokens, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, X, mask):
        """Attend over ``X`` (batch, seq, d_model).

        ``mask`` is added to the raw attention scores before the softmax, so it
        must broadcast against a (batch, heads, seq, seq) tensor.
        Returns a tensor with the same shape as ``X``.
        """
        n_batch, n_tokens, _ = X.size()
        queries = self._split_heads(self.WQ(X))
        keys = self._split_heads(self.WK(X))
        values = self._split_heads(self.WV(X))

        attn_scores = (queries @ keys.transpose(-2, -1)) / math.sqrt(self.d_k)
        attn_scores = attn_scores + mask
        attn_weights = self.dropout(torch.softmax(attn_scores, dim=-1))

        # Merge the heads back into one feature dimension before projecting
        context = (attn_weights @ values).transpose(1, 2).contiguous()
        return self.WO(context.view(n_batch, n_tokens, -1))

# Define FeedForward class
class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        hidden = torch.relu(self.linear1(x))
        return self.linear2(self.dropout(hidden))

# Define TransformerLayer class
class TransformerLayer(nn.Module):
    """One pre-norm Transformer block: self-attention, then feed-forward.

    Each sub-layer receives a layer-normalized copy of its input, and its
    output goes through dropout before being added back to the un-normalized
    residual stream.

    Args:
        d_model: Feature size of the residual stream.
        num_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward sub-layer.
        dropout: Dropout probability, shared by both sub-layers and by the
            residual connections.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the block on ``x``; ``mask`` is forwarded to the attention."""
        # Attention sub-layer with residual connection
        x = x + self.dropout(self.attention(self.norm1(x), mask))
        # Feed-forward sub-layer with residual connection
        x = x + self.dropout(self.ffn(self.norm2(x)))
        return x

# Define Transformer class
class Transformer(nn.Module):
    """Causal Transformer language model with tied input/output embeddings.

    Args:
        vocab_size: Number of token ids (embedding rows and output logits).
        d_model: Feature size of the residual stream.
        num_layers: Number of stacked ``TransformerLayer`` blocks.
        num_heads: Attention heads per block.
        d_ff: Hidden size of each feed-forward sub-layer.
        max_seq_len: Number of positions in the positional-encoding table.
        dropout: Dropout probability passed to every block.
        pad_token_id: Token id treated as padding when building the attention
            mask. Defaults to 0, the value that was previously hard-coded.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, max_seq_len, dropout,
                 pad_token_id=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_len)
        self.layers = nn.ModuleList([TransformerLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)
        self.output_layer = nn.Linear(d_model, vocab_size)
        self.output_layer.weight = self.embedding.weight  # Weight tying
        self.pad_token_id = pad_token_id

    def forward(self, input_ids):
        """Compute next-token logits.

        Args:
            input_ids: Integer tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, vocab_size).
        """
        batch_size, seq_len = input_ids.size()
        # Causal mask: True strictly above the diagonal, i.e. for future keys
        causal_mask = torch.triu(torch.ones((seq_len, seq_len), device=input_ids.device), diagonal=1).bool()
        # Padding mask: True for every key position that holds the padding id
        padding_mask = (input_ids == self.pad_token_id)
        # Broadcast to (batch, 1, query, key); a key is blocked if either mask says so
        full_mask = causal_mask[None, None, :, :] | padding_mask[:, None, None, :]
        full_mask = full_mask.float() * -1e9  # Masked positions get a large negative value
        x = self.embedding(input_ids)
        x = self.pos_encoding(x)
        for layer in self.layers:
            x = layer(x, full_mask)
        x = self.norm(x)
        logits = self.output_layer(x)
        return logits

# Choose the compute device: CUDA when a GPU is visible, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the language model. vocab_size comes from the preprocessing step and
# max_seq_len equals the sequence length produced there.
model = Transformer(
    vocab_size=vocab_size,
    max_seq_len=128,
    d_model=256,
    d_ff=1024,
    num_heads=8,
    num_layers=6,
    dropout=0.1,
)
model = model.to(device)

# Adam optimizer over all model parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-4)

# Train for 3 epochs (kept short for a quick demonstration) and report the
# mean per-batch loss after every epoch
for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch.to(device)
        optimizer.zero_grad()

        # Next-token prediction: the output at position t is scored against the
        # token at position t + 1, so drop the last prediction and the first token.
        logits = model(input_ids)[:, :-1, :]
        targets = input_ids[:, 1:]
        mask = targets != pad_token_id  # True for real (non-padding) targets

        # Per-token loss; reshape copies when the sliced tensor is not contiguous
        loss = torch.nn.functional.cross_entropy(
            logits.reshape(-1, vocab_size),
            targets.reshape(-1),
            reduction='none',
        )
        # Average over the non-padding tokens only
        loss = (loss * mask.reshape(-1)).sum() / mask.sum()

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss:.4f}")

In [None]:
# Evaluate on test set.
# Perplexity is exp(mean negative log-likelihood per token), so accumulate the
# summed loss and the number of real tokens over the whole test set. Averaging
# per-batch means would give the short, padded final batch the same weight as
# a full batch.
model.eval()
test_loss = 0.0   # summed negative log-likelihood over all non-padding targets
test_tokens = 0   # number of non-padding targets seen
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch.to(device)
        logits = model(input_ids)[:, :-1, :]  # Remove last prediction to match target length
        targets = input_ids[:, 1:]  # Shifted target for next-token prediction
        mask = (targets != pad_token_id)  # Mask for padding
        loss = torch.nn.functional.cross_entropy(
            logits.reshape(-1, vocab_size),
            targets.reshape(-1),
            ignore_index=pad_token_id,  # padding targets contribute nothing to the sum
            reduction='sum'
        )
        test_loss += loss.item()
        test_tokens += mask.sum().item()
if test_tokens == 0:
    raise ValueError("Test set contains no non-padding target tokens")
avg_test_loss = test_loss / test_tokens
perplexity = math.exp(avg_test_loss)
print(f"Test Loss: {avg_test_loss:.4f}, Test Perplexity: {perplexity:.4f}")

In [None]:
# Reverse vocabulary for decoding (ID to word)
index_to_word = {idx: word for word, idx in word_to_index.items()}

def generate_text(model, prompt_ids, max_length=50, sampling="greedy", top_k=50,
                  stop_token_id=None, max_context=None):
    """Autoregressively extend ``prompt_ids`` with tokens predicted by ``model``.

    Args:
        model: Trained language model; called with a (1, seq) tensor of token
            ids and expected to return logits of shape (1, seq, vocab).
        prompt_ids: Non-empty list/tuple or 1-D tensor of token ids.
        max_length: Maximum number of tokens to generate.
        sampling: "greedy" (arg-max) or "top_k" (sample among the ``top_k``
            most likely tokens).
        top_k: Number of candidates for "top_k" sampling; clamped to the
            vocabulary size.
        stop_token_id: Generation stops right after this id has been produced.
            Defaults to the module-level ``pad_token_id``.
        max_context: Maximum number of most recent tokens fed to the model.
            Defaults to the length of ``model.pos_encoding.pe`` when the model
            has one, otherwise the context is not cropped.

    Returns:
        List of token ids: the prompt followed by the generated tokens
        (including the stop token if it was produced).

    Raises:
        ValueError: For an unknown ``sampling`` value, an empty prompt, or a
            non-positive ``top_k`` / ``max_context``.
    """
    if sampling not in ("greedy", "top_k"):
        raise ValueError(f"Unknown sampling strategy {sampling!r}; expected 'greedy' or 'top_k'")
    if sampling == "top_k" and top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")
    if stop_token_id is None:
        stop_token_id = pad_token_id

    model.eval()

    # Run on whatever device the model's parameters live on
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    if isinstance(prompt_ids, torch.Tensor):
        generated = prompt_ids.detach().reshape(-1).tolist()
    else:
        generated = list(prompt_ids)
    if not generated:
        raise ValueError("prompt_ids must contain at least one token")

    # The positional-encoding table bounds how many tokens the model accepts;
    # without cropping, a long prompt plus generated tokens would overflow it.
    if max_context is None:
        pos_table = getattr(getattr(model, "pos_encoding", None), "pe", None)
        if pos_table is not None:
            max_context = pos_table.size(1)
    if max_context is not None and max_context < 1:
        raise ValueError(f"max_context must be at least 1, got {max_context}")

    with torch.no_grad():
        for _ in range(max_length):
            context = generated if max_context is None else generated[-max_context:]
            input_ids = torch.tensor([context], dtype=torch.long, device=model_device)
            next_logits = model(input_ids)[:, -1, :]  # Logits for the last position

            if sampling == "greedy":
                next_id = int(torch.argmax(next_logits, dim=-1).item())
            else:
                k = min(top_k, next_logits.size(-1))
                top_logits, top_indices = torch.topk(next_logits, k, dim=-1)
                # Sample a rank among the k candidates, then map it back to a token id
                choice = torch.multinomial(torch.softmax(top_logits, dim=-1), 1)
                next_id = int(top_indices.gather(-1, choice).item())

            generated.append(next_id)
            if next_id == stop_token_id:
                break

    return generated

# Example: continue a short prompt using top-k sampling
prompt_text = "The history of"
# Tokenize, keeping at most 128 tokens (the model's max_seq_len)
prompt_ids = tokenize(prompt_text)[:128]
generated_ids = generate_text(model, prompt_ids, sampling="top_k", max_length=50)

# Map every generated ID back to its word; IDs without an entry become "<UNK>"
generated_text = " ".join(
    index_to_word.get(token_id, "<UNK>") for token_id in generated_ids
)
print("Generated Text:", generated_text)