In [26]:
import torch  # Import PyTorch for tensor operations and neural network building
import torch.nn as nn  # Import neural network modules from PyTorch
from torch.nn import functional as F  # Import functional operations like softmax, cross_entropy
import matplotlib.pyplot as plt  # Import matplotlib for plotting (not used in this code)
# %matplotlib inline  # Enable inline plotting for Jupyter notebooks (commented out to avoid UsageError in non-Jupyter environments)
import re  # Import regular expressions for word tokenization
import os  # Import os for directory and file handling
import pathlib  # Import pathlib for path manipulation
import math # Import python math library

# Hyperparameters, grouped by what they control.

# --- Batching ---
# Number of independent sequences processed in each forward/backward pass.
batch_size = 32
# Maximum context length, measured in words, used to predict the next word.
block_size = 16

# --- Training schedule ---
# Total number of optimizer steps.
max_iters = 500
# Train/val loss is estimated and printed every `eval_interval` steps.
eval_interval = 250
# Number of batches averaged per split when estimating the loss.
eval_iters = 100
# Step size handed to the optimizer.
learning_rate = 1e-3

# --- Model shape ---
# Width of the token and position embedding vectors.
n_embd = 192
# Attention heads per transformer block.
n_head = 3
# Number of stacked transformer blocks.
n_layer = 3
# Dropout probability used throughout the model.
dropout = 0.2

# --- Runtime ---
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Fix the PyTorch RNG seed so runs are repeatable.
torch.manual_seed(1337)

# Path to folder containing text files (replace with your folder path).
# NOTE: this is a machine-specific absolute path; point it at your own corpus directory.
folder_path = r"C:\Users\ptoma\Desktop\Math 579 CSULB\txt Files"  # Specify directory containing .txt files

# List to store all file contents
all_texts = []  # One entry per successfully read file, in load order

# os.listdir() returns entries in arbitrary order, which would make the concatenated
# corpus (and therefore the positional train/val split) differ between machines/runs.
# Sort case-insensitively, with the raw name as a tie-breaker, for a stable order.
txt_filenames = sorted(
    (name for name in os.listdir(folder_path) if name.endswith(".txt")),
    key=lambda name: (name.lower(), name),
)

# Loop over files in the folder
for filename in txt_filenames:
    file_path = os.path.join(folder_path, filename)  # Construct full path to file
    try:
        try:
            with open(file_path, "r", encoding="utf-8") as file:  # Attempt to open file with UTF-8 encoding
                content = file.read()  # Read entire file content
        except UnicodeDecodeError:  # Handle UTF-8 decoding errors
            print(f"Encoding error in {filename}; trying latin1")  # Log encoding error
            # The retry sits inside the outer try so an error here is reported like any
            # other read failure instead of aborting the whole loop.
            with open(file_path, "r", encoding="latin1") as file:  # Retry with Latin-1 encoding
                content = file.read()  # Read file content
    except Exception as e:  # Handle other potential errors
        print(f"Error reading {filename}: {e}")  # Log error and move on to the next file
        continue
    all_texts.append(content)  # Append content to list
    print(f"Read {filename}: {len(content)} characters")  # Logged for both the UTF-8 and the latin1 path

# Combine all file contents into a single text string
text = "\n".join(all_texts)  # Join all texts with newline separators
print(f"Total files read: {len(all_texts)}")  # Print total number of files read
if all_texts:  # Check if any files were read
    print(f"Sample from first file: {all_texts[0][:100]}...")  # Print first 100 characters of first file

# Tokenize text into words
def tokenize(text):
    """Return the lowercase word tokens found in ``text``.

    The input is lowercased first; a token is then any maximal run of
    ``\\w`` characters delimited by word boundaries, so punctuation and
    whitespace act as separators and are dropped.

    Args:
        text: String to split.

    Returns:
        List of token strings in order of appearance (empty if none match).
    """
    lowered = text.lower()  # Case-fold before matching so tokens are case-insensitive
    return re.findall(r'\b\w+\b', lowered)

# Word stream for the whole corpus.
words = tokenize(text)

# Vocabulary = every distinct word plus the '<UNK>' placeholder, in sorted order.
vocab = sorted(set(words) | {'<UNK>'})
vocab_size = len(vocab)  # Number of distinct words, '<UNK>' included
print(f"Vocabulary size: {vocab_size}")
print(f"Sample words: {vocab[:10]}")

# Lookup tables between words and their position in the sorted vocabulary.
stoi = dict(zip(vocab, range(vocab_size)))  # word -> index
itos = dict(enumerate(vocab))  # index -> word
def encode(s):
    """Turn a string into a list of vocabulary indices.

    The string is split with ``tokenize`` and every token is looked up in
    ``stoi``; tokens missing from ``stoi`` map to the index of ``'<UNK>'``.

    Args:
        s: Text to encode.

    Returns:
        List of integer indices, one per token.
    """
    def to_index(token):
        # Fall back to the <UNK> index for out-of-vocabulary tokens.
        return stoi.get(token, stoi['<UNK>'])

    return list(map(to_index, tokenize(s)))

def decode(l):
    """Turn a sequence of vocabulary indices back into text.

    Each index is looked up in ``itos`` and the resulting words are joined
    with single spaces.

    Args:
        l: Iterable of integer indices.

    Returns:
        The space-separated words as one string.
    """
    return ' '.join(itos[i] for i in l)

# Test encoding/decoding
print(encode("hii there"))  # Encode a short string; words missing from the vocabulary map to <UNK>
print(decode(encode("hii there")))  # Round-trip the indices back to text

# Splitting the data into training and validation
# `words` is already the tokenized corpus and the vocabulary was built from it, so every
# word has an entry in `stoi`. Index it directly instead of re-joining the corpus into
# one string and running the tokenizer over it a second time.
data = torch.tensor([stoi[w] for w in words], dtype=torch.long)  # Whole corpus as word indices

n = int(0.9*len(data))  # Calculate index to split data (90% for training, 10% for validation)
train_data = data[:n]  # Split data into training set (first 90%)
val_data = data[n:]  # Split data into validation set (last 10%)

# Batches are sampled as windows of block_size inputs plus one shifted target, so each
# split needs more than block_size tokens. Fail here with a clear message rather than
# with an opaque torch.randint error during training.
if min(len(train_data), len(val_data)) <= block_size:
    raise ValueError(
        f"Corpus too small: each split needs more than {block_size} tokens "
        f"(train={len(train_data)}, val={len(val_data)})"
    )

# Data loading
def get_batch(split):
    """Sample a random batch of (input, target) windows.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        Tuple ``(x, y)`` of tensors, each of shape ``(batch_size, block_size)``
        and moved to ``device``. ``y`` is the window starting one position
        after the corresponding ``x`` window.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # One random start offset per sequence; the upper bound leaves room for the
    # extra element needed by the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

# Computing the loss
@torch.no_grad()  # No gradients are needed while measuring the loss
def estimate_loss():
    """Average the model loss over ``eval_iters`` random batches per split.

    The global ``model`` is switched to eval mode for the measurement and
    switched to train mode before returning.

    Returns:
        Dict with keys ``'train'`` and ``'val'``; each value is a scalar
        tensor holding the mean loss for that split.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = []
        for _ in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)  # Logits are not needed here
            batch_losses.append(batch_loss.item())
        averages[split] = torch.tensor(batch_losses).mean()
    model.train()
    return averages

class Head(nn.Module):  # Define single attention head class
    """One head of causal (masked) self-attention.

    Projects the input to keys, queries and values of width ``head_size``,
    computes scaled dot-product attention where each position may only attend
    to itself and earlier positions, and returns the attention-weighted values.

    Args:
        head_size: Width of the key/query/value projections.
    """

    def __init__(self, head_size):  # Initialize head with given head_size
        super().__init__()  # Call parent class (nn.Module) initializer
        self.key = nn.Linear(n_embd, head_size, bias=False)  # Linear layer for key computation
        self.query = nn.Linear(n_embd, head_size, bias=False)  # Linear layer for query computation
        self.value = nn.Linear(n_embd, head_size, bias=False)  # Linear layer for value computation
        # Lower-triangular mask; registered as a buffer so it follows the module across devices
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)  # Dropout applied to the attention weights

    def forward(self, x):  # Define forward pass for the attention head
        """Apply masked self-attention.

        Args:
            x: Tensor of shape (B, T, n_embd); T must not exceed ``block_size``
                because the mask is sliced from a (block_size, block_size) buffer.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape  # Only the sequence length is needed, to slice the mask
        k = self.key(x)  # Compute keys: (B, T, head_size)
        q = self.query(x)  # Compute queries: (B, T, head_size)
        # Scale by 1/sqrt(head_size), the dimension the q.k dot product sums over.
        # Scaling by the embedding width instead (n_head times larger) shrinks the
        # scores more than intended and flattens the softmax.
        scale = k.shape[-1] ** -0.5
        wei = q @ k.transpose(-2, -1) * scale  # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # Mask future tokens for causal attention
        wei = F.softmax(wei, dim=-1)  # Apply softmax to get attention weights
        wei = self.dropout(wei)  # Apply dropout to attention weights
        # Perform the weighted aggregation of the values
        v = self.value(x)  # Compute values: (B, T, head_size)
        out = wei @ v  # Aggregate values: (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return out  # Return attention output

class MultiHeadAttention(nn.Module):  # Define multi-head attention class
    """Several self-attention heads run in parallel, then projected back to ``n_embd``.

    Args:
        num_heads: Number of ``Head`` modules to create.
        head_size: Output width of each head.
    """

    def __init__(self, num_heads, head_size):  # Initialize with number of heads and head size
        super().__init__()  # Call parent class initializer
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])  # Create list of attention heads
        # The concatenated head outputs are num_heads * head_size wide. That equals n_embd
        # only when n_embd is divisible by the number of heads, so size the projection
        # from the actual concatenated width rather than assuming n_embd.
        self.projection = nn.Linear(num_heads * head_size, n_embd)  # Project concatenated heads to n_embd
        self.dropout = nn.Dropout(dropout)  # Dropout layer for regularization

    def forward(self, x):  # Define forward pass for multi-head attention
        """Run every head on ``x`` and merge the results.

        Args:
            x: Input tensor passed unchanged to each head.

        Returns:
            Tensor whose last dimension is ``n_embd``.
        """
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # Concatenate outputs from all heads along the last dimension
        out = self.dropout(self.projection(out))  # Project concatenated output and apply dropout
        return out  # Return multi-head attention output

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x width, ReLU, project back, dropout.

    Args:
        n_embd: Input and output width of the network.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd  # Inner layer is four times wider than the input
        layers = [
            nn.Linear(n_embd, hidden_width),  # Expansion
            nn.ReLU(),                        # Non-linearity
            nn.Linear(hidden_width, n_embd),  # Projection back to the input width
            nn.Dropout(dropout),              # Regularization on the output
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x`` and return the result."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each with a residual.

    Layer normalization is applied to the input of each sub-layer, and the
    sub-layer output is added back onto its un-normalized input.

    Args:
        n_embd: Embedding width.
        n_head: Number of attention heads; each head gets ``n_embd // n_head`` channels.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)  # Attention sub-layer
        self.ffwd = FeedForward(n_embd)  # Feed-forward sub-layer
        self.ln1 = nn.LayerNorm(n_embd)  # Normalizes the attention input
        self.ln2 = nn.LayerNorm(n_embd)  # Normalizes the feed-forward input

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))

class WordLevelLanguageModel(nn.Module):  # Define word-level language model class
    """Decoder-only transformer that predicts the next word index.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` transformer blocks and a final LayerNorm, then mapped to
    ``vocab_size`` logits per position.
    """

    def __init__(self):  # Initialize the model
        super().__init__()  # Call parent class initializer
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)  # Embedding layer for word indices
        self.position_embedding_table = nn.Embedding(block_size, n_embd)  # Embedding layer for position indices
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])  # Stack of transformer blocks
        self.ln_f = nn.LayerNorm(n_embd)  # Final layer normalization
        self.lm_head = nn.Linear(n_embd, vocab_size)  # Linear layer to predict next word logits

    def forward(self, idx, targets=None):  # Define forward pass for the model
        """Compute next-word logits and, if targets are given, the loss.

        Args:
            idx: Long tensor of word indices, shape (B, T) with T <= block_size
                (the position table only has ``block_size`` rows).
            targets: Optional long tensor of shape (B, T) with the expected next words.

        Returns:
            ``(logits, loss)``. Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is the cross-entropy.
        """
        B, T = idx.shape  # Extract batch size (B) and sequence length (T)
        tok_emb = self.token_embedding_table(idx)  # Get token embeddings: (B, T, n_embd)
        # Build the position indices on the same device as the input rather than on the
        # notebook-level `device`, so the model keeps working if it or its inputs are
        # moved after construction.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # Get position embeddings: (T, n_embd)
        x = tok_emb + pos_emb  # Combine token and position embeddings: (B, T, n_embd)
        x = self.blocks(x)  # Pass through transformer blocks: (B, T, n_embd)
        x = self.ln_f(x)  # Apply final layer normalization: (B, T, n_embd)
        logits = self.lm_head(x)  # Compute logits for next word: (B, T, vocab_size)

        if targets is None:  # If no targets provided (e.g., during generation)
            loss = None  # No loss to compute
        else:  # If targets provided (e.g., during training)
            B, T, C = logits.shape  # Extract logits shape
            logits = logits.view(B*T, C)  # Reshape logits for loss computation: (B*T, vocab_size)
            targets = targets.view(B*T)  # Reshape targets: (B*T,)
            loss = F.cross_entropy(logits, targets)  # Compute cross-entropy loss
        return logits, loss  # Return logits and loss (if computed)

    def generate(self, idx, max_new_tokens):  # Define method to generate new tokens
        """Autoregressively sample ``max_new_tokens`` words after the context ``idx``.

        Sampling runs in eval mode (dropout disabled) and without autograd, so
        no computation graph is built for the generated steps. The module's
        previous train/eval flag is re-applied afterwards via ``self.train(flag)``.

        Args:
            idx: Long tensor of shape (B, T) holding the starting context.
            max_new_tokens: Number of words to append.

        Returns:
            Long tensor of shape (B, T + max_new_tokens).
        """
        was_training = self.training
        self.eval()  # Dropout must be off while sampling
        try:
            with torch.no_grad():  # Inference only: skip gradient bookkeeping
                for _ in range(max_new_tokens):  # Generate up to max_new_tokens
                    idx_cond = idx[:, -block_size:]  # Crop context to last block_size tokens
                    logits, _ = self(idx_cond)  # Get logits for next token
                    logits = logits[:, -1, :]  # Focus on logits for the last time step: (B, vocab_size)
                    probs = F.softmax(logits, dim=-1)  # Convert logits to probabilities
                    idx_next = torch.multinomial(probs, num_samples=1)  # Sample next token index: (B, 1)
                    idx = torch.cat((idx, idx_next), dim=1)  # Append sampled token to sequence
        finally:
            self.train(was_training)  # Put the module back in the mode the caller had
        return idx  # Return generated sequence

model = WordLevelLanguageModel()  # Instantiate the language model
m = model.to(device)  # Move model to the specified device (CPU/GPU); `m` is the same module object

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)  # Initialize AdamW optimizer with model parameters

# Optimization
# The loop variable is named `step` rather than `iter`: a module-level `iter` would shadow
# the builtin for every later cell in the same kernel.
for step in range(max_iters):  # Loop over training iterations
    if step % eval_interval == 0 or step == max_iters - 1:  # Evaluate periodically and on the final step
        losses = estimate_loss()  # Compute average train and validation losses
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")  # Print losses

    xb, yb = get_batch('train')  # Get a batch of training data
    logits, loss = model(xb, yb)  # Forward pass to compute logits and loss
    optimizer.zero_grad(set_to_none=True)  # Clear previous gradients
    loss.backward()  # Compute gradients via backpropagation
    optimizer.step()  # Update model parameters using gradients


# Baseline for comparison: cross-entropy of a uniform guess over the vocabulary
lossU = math.log(vocab_size)
print(f"Uniform probability loss (lossU): {lossU:.4f}")

# Generate from the model
# Switch to eval mode so dropout is disabled while sampling, and skip autograd so no graph
# is built across the generated steps. The model stays in eval mode afterwards; call
# model.train() before resuming training.
model.eval()
context = torch.zeros((1,1), dtype=torch.long, device=device)  # Start from a single token with index 0
with torch.no_grad():
    print(decode(m.generate(context, max_new_tokens=5000)[0].tolist()))  # Generate and decode 5000 new tokens

Read A Doll's House.txt: 161514 characters
Read A Room With a View.txt: 394369 characters
Read Alice's Adventure in Wonderland4.txt: 163914 characters
Read Cranford.txt: 408695 characters
Read Crime and Punishment.txt: 1154426 characters
Read Frankenstein.txt: 438804 characters
Read IranianNames.txt: 2039 characters
Read IranianNames02.txt: 14654 characters
Read Little Women.txt: 1090382 characters
Read Middlemarch.txt: 1799369 characters
Read Moby Dick.txt: 1238224 characters
Read names.txt: 228145 characters
Read Pride and Prejudice.txt: 748124 characters
Read Romeo and Juliet.txt: 161775 characters
Read Simple Sabotage Field Manual.txt: 73479 characters
Read The Blue Castle.txt: 407867 characters
Read The Complete Works of William Shakespeare.txt: 1604604 characters
Read The Enchanted April.txt: 458899 characters
Read The Great Gatsby.txt: 290075 characters
Read The Importance of Being Earnest.txt: 136733 characters
Read The Picture of Dorian Grey.txt: 448620 characters
Read The Yel