# CHAR-RNN Training on Trump Speeches

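This notebook trains a character-level RNN (GRU) language model on a dataset of Trump speech transcriptions using PyTorch, then samples text from the trained model at several temperatures.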

In [11]:
import sys
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

print(f"PyTorch version: {torch.__version__}")

# Backend preference order: CUDA GPU, then Apple-silicon MPS, then plain CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

PyTorch version: 2.8.0
Using device: mps


## 1. Load the Dataset

In [12]:
# Read the speech transcriptions table from disk
df = pd.read_parquet('../../data/transcriptions.parquet')

# Column whose text is used as the training corpus
text_column = 'clean-v1-with-stopwords'

# Merge every non-null entry into one space-separated string, then keep only
# the first 1M characters for faster training/testing
transcripts = df[text_column].dropna().astype(str)
full_text = " ".join(transcripts)[:1000000]

print(f"Total text length: {len(full_text)} characters")
print(f"Sample: {full_text[:80]}")

Total text length: 1000000 characters
Sample: well that be good timing be not it we have to get that right we have to get that


## 2. Preprocessing & Vectorization

In [23]:
# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(full_text))
vocab_size = len(chars)

# Lookup tables in both directions (integer id <-> character)
idx2char = dict(enumerate(chars))
char2idx = {ch: idx for idx, ch in idx2char.items()}

# Integer-encode the whole corpus, one id per character
encoded = np.array([char2idx[ch] for ch in full_text])

print(f"Number of distinct characters: {vocab_size}")

Number of distinct characters: 55


## 3. Dataset Creation Helper

In [14]:
class CharDataset(Dataset):
    """Sliding-window dataset for next-character prediction.

    Item ``idx`` is the pair ``(x, y)`` where ``x`` holds the ids
    ``data[idx : idx + seq_length]`` and ``y`` holds the same window shifted
    one position to the right, so ``y[t]`` is the id that follows ``x[t]``.

    Args:
        data: 1-D sequence of integer character ids (e.g. a NumPy array).
        seq_length: Number of ids in each input window; must be >= 1.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    def __init__(self, data, seq_length):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        """Return the number of complete windows (0 when the data is too short)."""
        # Every item consumes seq_length + 1 consecutive ids, so the last valid
        # start index is len(data) - seq_length - 1. Clamp at zero because a
        # negative __len__ makes len() raise.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` LongTensor pair for window ``idx``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if not 0 <= idx < size:
            # Without this check an out-of-range index would silently yield a
            # truncated window instead of signalling the end of the dataset.
            raise IndexError(f"index {idx} out of range for {size} windows")

        chunk = self.data[idx:idx + self.seq_length + 1]
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y

# Windowing / batching hyper-parameters
seq_length = 100
batch_size = 128  # larger batches for efficiency

# Contiguous split: first 90% train, next 5% validation, remainder test
dataset_size = len(encoded)
train_size = int(dataset_size * 0.9)
valid_size = int(dataset_size * 0.05)
valid_end = train_size + valid_size

train_data, valid_data, test_data = (
    encoded[:train_size],
    encoded[train_size:valid_end],
    encoded[valid_end:],
)

print("Building datasets...")
train_dataset, valid_dataset, test_dataset = (
    CharDataset(split, seq_length) for split in (train_data, valid_data, test_data)
)

# Only the training loader shuffles; evaluation loaders keep corpus order
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=0
)
valid_loader = DataLoader(
    valid_dataset, batch_size=batch_size, shuffle=False, num_workers=0
)
test_loader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, num_workers=0
)

Building datasets...


## 4. Build and Train the Model

In [15]:
class CharRNN(nn.Module):
    """Character-level language model: embedding -> GRU -> linear projection.

    Args:
        vocab_size: Number of distinct character ids (embedding rows and
            output logits per time step).
        embed_dim: Size of each character embedding vector.
        hidden_dim: Size of the GRU hidden state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # batch_first=True: sequences are laid out as (batch, time, features)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Run the model over a batch of id sequences.

        Args:
            x: Batch of integer character ids, one row per sequence.
            hidden: Optional GRU state to continue from; ``None`` lets the GRU
                start from its default initial state.

        Returns:
            Tuple ``(logits, hidden)``: per-time-step scores over the
            vocabulary and the GRU state after the last time step.
        """
        embedded = self.embedding(x)
        features, hidden = self.gru(embedded, hidden)
        return self.fc(features), hidden

# Seed the global RNG before the model is created so that weight
# initialisation and the training loader's shuffle order repeat across
# Restart & Run All. (Some GPU/MPS kernels may still be non-deterministic.)
SEED = 42
torch.manual_seed(SEED)

print("Building model...")
model = CharRNN(vocab_size, embed_dim=16, hidden_dim=128).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.NAdam(model.parameters())


def _sequence_loss(inputs, targets):
    """Return the mean cross-entropy of the model's next-character predictions."""
    logits, _ = model(inputs)
    # CrossEntropyLoss wants (N*L, C) logits against (N*L,) class ids
    return criterion(logits.reshape(-1, vocab_size), targets.reshape(-1))


def _mean_or_nan(total, count):
    """Average ``total`` over ``count`` batches; NaN when there were none."""
    return total / count if count else float("nan")


print("Starting training...")
num_epochs = 1  # Kept low for demonstration

for epoch in range(num_epochs):
    # --- Training pass ---
    model.train()
    running_loss = 0.0
    for i, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        loss = _sequence_loss(x, y)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 1000 == 0:
            print(f"Epoch {epoch+1}, Step {i}, Loss: {loss.item():.4f}")

    # --- Validation pass (no gradient tracking, no parameter updates) ---
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            val_loss += _sequence_loss(x, y).item()

    # Averages are per batch; an empty loader reports NaN instead of raising
    # ZeroDivisionError.
    avg_train_loss = _mean_or_nan(running_loss, len(train_loader))
    avg_val_loss = _mean_or_nan(val_loss, len(valid_loader))
    print(f"Epoch {epoch+1} Complete. Avg Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Save checkpoint (overwritten after every epoch)
    torch.save(model.state_dict(), "char_rnn_trump.pth")

Building model...
Starting training...
Epoch 1, Step 0, Loss: 4.0157
Epoch 1, Step 1000, Loss: 1.3066
Epoch 1, Step 2000, Loss: 1.1754
Epoch 1, Step 3000, Loss: 1.1556
Epoch 1, Step 4000, Loss: 1.1733
Epoch 1, Step 5000, Loss: 1.1214
Epoch 1, Step 6000, Loss: 1.1201
Epoch 1, Step 7000, Loss: 1.1105
Epoch 1 Complete. Avg Train Loss: 1.2130, Val Loss: 1.1538


## 5. Inference & Text Generation

In [29]:
def generate_text(model, start_string, num_generate=100, temperature=1.0):
    """Sample a continuation of ``start_string`` from a character-level model.

    The prompt is fed through the model once to prime its recurrent state;
    after that each sampled character is fed back in as the next input.
    The model is switched to eval mode and left in eval mode.

    Args:
        model: Module called as ``model(ids, hidden)`` and returning
            ``(logits, hidden)`` with logits of shape (batch, time, vocab).
        start_string: Prompt text; every character must be a key of the
            module-level ``char2idx`` mapping.
        num_generate: Number of characters to sample.
        temperature: Divisor applied to the logits before the softmax. Values
            below 1 sharpen the distribution, values above 1 flatten it, and
            0 selects the most likely character at every step (greedy).

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``temperature`` is negative, or if ``start_string`` is
            empty while characters are requested.
        KeyError: If ``start_string`` contains a character not in ``char2idx``.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be >= 0, got {temperature}")
    if num_generate > 0 and not start_string:
        # The model needs at least one input step to produce a prediction.
        raise ValueError("start_string must not be empty")

    model.eval()

    # Build inputs on the device the model actually lives on rather than
    # relying on a global; fall back to CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Convert start string to numbers, shaped (1, len(start_string))
    input_ids = [char2idx[ch] for ch in start_string]
    input_eval = torch.tensor(input_ids, dtype=torch.long, device=target_device).unsqueeze(0)

    text_generated = []
    hidden = None

    with torch.no_grad():
        for _ in range(num_generate):
            output, hidden = model(input_eval, hidden)

            # Only the prediction for the last time step matters
            logits = output[:, -1, :]

            if temperature == 0:
                # Greedy decoding: dividing by zero would yield NaN probabilities
                predicted_id = torch.argmax(logits, dim=-1).item()
            else:
                probs = torch.softmax(logits / temperature, dim=-1)
                predicted_id = torch.multinomial(probs, num_samples=1).item()

            # Pass the predicted character as the next (single-step) input
            input_eval = torch.tensor([[predicted_id]], device=target_device)

            text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

# --- Run Generation ---
# To sample from saved weights instead of the in-memory model, load them first,
# e.g. model.load_state_dict(torch.load("char_rnn_trump.pth")) -- the file
# written by the training loop.
seed_text = "we will make america"

print("Prediction test (we will make america...):")
print(generate_text(model, seed_text, num_generate=100))

# Same prompt at increasing temperatures: low values stay close to the most
# likely characters, high values sample more freely.
temperature_labels = ((0.2, "Conservative"), (1.0, "Balanced"), (2.0, "Chaotic"))
for temp, label in temperature_labels:
    print(f"\n--- Temperature {temp} ({label}) ---")
    print(generate_text(model, seed_text, num_generate=100, temperature=temp))

Prediction test (we will make america...):
we will make america be go ahoot you believe in for a hot but it 's probably run from they thank you the thank you japag

--- Temperature 0.2 (Conservative) ---
we will make america be they be go to be a great job they be go to be a great job and I say that 's what 's go to be a g

--- Temperature 1.0 (Balanced) ---
we will make america no not do I do not even fight that would victure it on get them but where different mean agricia th

--- Temperature 2.0 (Chaotic) ---
we will make america holaken Ninum techarby nicemy coutoriverson technoersulvasfate who to I he zignse withsete belive l
