In [None]:
# Install exact, pinned versions of torch and torchtext (quiet output).
!pip install --quiet torch==2.0.1 torchtext==0.15.2
# Fetch the spaCy pipeline "en_core_web_sm", the model name passed to
# get_tokenizer("spacy", ...) in the next cell.
!python -m spacy download en_core_web_sm


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m116.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m102.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.4/168.4 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import GloVe
from torchtext.data.utils import get_tokenizer
import re

# === Step 1: Tokenization Setup ===
# Word-level splitting is delegated to spaCy's "en_core_web_sm" pipeline.
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

# Reserved tokens; each *_IDX constant below is the position of the matching
# entry in SPECIAL_TOKENS.
SPECIAL_TOKENS = ['<pad>', '<unk>', '<sos>', '<eos>']
PAD_IDX, UNK_IDX, SOS_IDX, EOS_IDX = range(4)

# === Step 2: Load and Tokenize Jokes ===
from google.colab import drive
drive.mount('/content/drive')

# Archive extraction step (currently disabled):
#!unzip "/content/drive/MyDrive/ECS 189G/stage_4_data.zip" -d /content/

# Text file that is read line by line further down.
DATA_PATH = "/content/stage_4_data/text_generation/data"

def clean_line(line):
    """Normalize one raw line of text before tokenization.

    Lowercases the text, removes ``http(s)://`` and ``www.`` URLs, collapses
    every run of whitespace into a single space and trims both ends.

    Args:
        line: Raw text line (str).

    Returns:
        The cleaned string; it never starts or ends with whitespace.
    """
    line = line.lower()
    line = re.sub(r'https?://\S+|www\.\S+', '', line)
    line = re.sub(r'\s+', ' ', line)
    # Trim last: deleting a URL at either end of the line leaves a stray
    # space behind, which stripping up front would not catch.
    return line.strip()

# Name the encoding explicitly: without it, open() falls back to a
# platform-dependent default and the same file can decode differently (or
# fail) on another machine.
# NOTE(review): assumes the data file is UTF-8 — confirm against the dataset.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    raw_lines = f.readlines()

# Every line of the file is treated as one joke. Only lines that tokenize to
# 4-40 tokens are kept, and each kept joke is wrapped in <sos> ... <eos>.
tokenized_jokes = []
for line in raw_lines:
    line = clean_line(line)
    tokens = tokenizer(line)
    if 4 <= len(tokens) <= 40:
        tokenized_jokes.append(['<sos>'] + tokens + ['<eos>'])

# === Step 3: Build Vocab (GloVe + Special Tokens) ===
glove_base = GloVe(name="6B", dim=300)
offset = len(SPECIAL_TOKENS)

# One small random vector (standard normal scaled by 0.1) per special token.
special_vectors = torch.randn(offset, glove_base.dim) * 0.1

# Special tokens take ids 0..offset-1; every GloVe word keeps its GloVe index
# shifted up by `offset`.
vocab_stoi = dict(zip(SPECIAL_TOKENS, range(offset)))
vocab_stoi.update(
    (word, glove_idx + offset) for word, glove_idx in glove_base.stoi.items()
)

# Invert stoi into a dense id -> token list.
vocab_itos = [None] * len(vocab_stoi)
for word, idx in vocab_stoi.items():
    vocab_itos[idx] = word

# Special-token rows first, then the GloVe rows, matching the id layout above.
glove_vectors = torch.cat((special_vectors, glove_base.vectors), dim=0)

class GloveVocab:
    """Plain container bundling a token<->id mapping with its vectors.

    Attributes:
        stoi: the object passed as ``stoi`` (token -> id lookup).
        itos: the object passed as ``itos`` (id -> token lookup).
        vectors: the object passed as ``vectors``.
        dim: ``vectors.shape[1]``.
    """

    def __init__(self, stoi, itos, vectors):
        embedding_width = vectors.shape[1]
        self.dim = embedding_width
        self.vectors = vectors
        self.itos = itos
        self.stoi = stoi

vocab = GloveVocab(stoi=vocab_stoi, itos=vocab_itos, vectors=glove_vectors)

# === Step 4: Encode Dataset ===
# Replace every token by its id; tokens missing from the vocabulary map to
# UNK_IDX.
encoded_jokes = []
for joke in tokenized_jokes:
    encoded_jokes.append([vocab.stoi.get(tok, UNK_IDX) for tok in joke])

class JokeDataset(Dataset):
    """Dataset over encoded sequences that yields shifted (input, target) pairs."""

    def __init__(self, data):
        # Indexable collection of token-id sequences.
        self.data = data

    def __len__(self):
        """Number of stored sequences."""
        return len(self.data)

    def __getitem__(self, i):
        """Return sequence ``i`` as two long tensors offset by one position.

        The first tensor drops the last id, the second drops the first id, so
        position ``t`` of the target is the id that follows position ``t`` of
        the input.
        """
        ids = torch.tensor(self.data[i], dtype=torch.long)
        model_input = ids[:-1]
        shifted_target = ids[1:]
        return model_input, shifted_target

def collate_fn(batch):
    """Pad a list of (input, target) pairs into two batch-first tensors.

    Args:
        batch: sequence of ``(input_seq, target_seq)`` tensor pairs.

    Returns:
        ``(inputs, targets)``; within each tensor the shorter sequences are
        right-padded with ``PAD_IDX`` up to the longest one.
    """
    def _pad(seqs):
        return pad_sequence(seqs, batch_first=True, padding_value=PAD_IDX)

    input_seqs, target_seqs = zip(*batch)
    return _pad(input_seqs), _pad(target_seqs)

# Shuffled mini-batches of 32 jokes, padded per batch by collate_fn.
loader = DataLoader(
    dataset=JokeDataset(encoded_jokes),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)

# === Step 5: Model ===
class JokeLSTM(nn.Module):
    """LSTM language model on top of a pretrained, fine-tunable embedding.

    Args:
        embedding_weights: 2-D tensor ``(vocab_size, emb_dim)`` used to
            initialise the embedding layer. It is copied, so the caller's
            tensor is never modified by training.
        hidden_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout passed to ``nn.LSTM`` (applied between layers).
    """

    def __init__(self, embedding_weights, hidden_dim=256, num_layers=2, dropout=0.3):
        super().__init__()
        vocab_size, emb_dim = embedding_weights.shape
        # from_pretrained uses the tensor it receives as the layer's weight
        # without copying, and detach() alone still shares storage. Because
        # the embedding is trainable (freeze=False), clone first so optimizer
        # steps do not overwrite the caller's matrix in place.
        # The copy costs one extra matrix of the same size.
        self.embedding = nn.Embedding.from_pretrained(
            embedding_weights.detach().clone(),
            freeze=False,
            padding_idx=PAD_IDX,
        )
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Map token ids ``(batch, seq_len)`` to logits ``(batch, seq_len, vocab_size)``."""
        emb = self.embedding(x)
        out, _ = self.lstm(emb)
        return self.fc(out)

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = JokeLSTM(vocab.vectors)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Target positions equal to PAD_IDX are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# === Step 6: Training Loop ===
def train_model(model, loader, epochs=200):
    """Run the optimisation loop and print the mean batch loss per epoch.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        model: module called on each input batch; put into train mode first.
        loader: iterable of ``(inputs, targets)`` batches that supports ``len()``.
        epochs: number of passes over ``loader``.
    """
    model.train()
    for epoch in range(1, epochs + 1):
        total_loss = 0
        for batch in loader:
            src, tgt = batch
            src = src.to(device)
            tgt = tgt.to(device)

            optimizer.zero_grad()
            logits = model(src)
            # Collapse all leading dimensions so every position is one
            # classification example for the criterion.
            flat_logits = logits.view(-1, logits.size(-1))
            flat_targets = tgt.view(-1)
            batch_loss = criterion(flat_logits, flat_targets)
            batch_loss.backward()
            # Cap the global gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += batch_loss.item()
        print(f"Epoch {epoch}/{epochs} | Loss: {total_loss / len(loader):.4f}")

# Train with the function's default number of epochs.
train_model(model=model, loader=loader)

# === Step 7: Joke Generation ===
def generate_joke(model, prompt, max_len=40, temperature=1.0, top_k=30):
    """Continue ``prompt`` by sampling one token at a time from ``model``.

    Uses the module-level ``tokenizer``, ``vocab``, ``device`` and
    ``clean_line``.

    Args:
        model: callable mapping a ``(1, seq_len)`` tensor of token ids to
            ``(1, seq_len, vocab_size)`` logits; ``model.eval()`` is called
            before sampling.
        prompt: text to start from. It is normalised with ``clean_line``,
            the same preprocessing the training lines go through.
        max_len: maximum number of tokens to sample.
        temperature: the logits are divided by this value before the softmax.
        top_k: sample only among the ``top_k`` most probable tokens. A falsy
            value samples from the full distribution; a value larger than
            the vocabulary is clamped to the vocabulary size.

    Returns:
        The prompt tokens followed by the sampled tokens, joined with single
        spaces. Sampling stops early at ``'<eos>'``, which is not included.
    """
    model.eval()
    tokens = tokenizer(clean_line(prompt))
    idxs = [SOS_IDX] + [vocab.stoi.get(tok, UNK_IDX) for tok in tokens]
    input_tensor = torch.tensor(idxs, dtype=torch.long, device=device).unsqueeze(0)
    generated = list(tokens)

    with torch.no_grad():
        for _ in range(max_len):
            # Only the prediction at the last position is used for sampling.
            logits = model(input_tensor)[0, -1] / temperature
            probs = torch.softmax(logits, dim=-1)
            if top_k:
                # topk raises when k exceeds the number of entries.
                k = min(top_k, probs.size(-1))
                top_probs, top_idx = torch.topk(probs, k)
                # multinomial accepts unnormalised weights, so the truncated
                # probabilities need no renormalisation.
                next_token = top_idx[torch.multinomial(top_probs, 1)].item()
            else:
                next_token = torch.multinomial(probs, 1).item()

            next_word = vocab.itos[next_token]
            if next_word == '<eos>':
                break
            generated.append(next_word)
            # Create the new id directly on the target device.
            next_step = torch.tensor([[next_token]], dtype=torch.long, device=device)
            input_tensor = torch.cat([input_tensor, next_step], dim=1)

    return ' '.join(generated)

# === Step 8: Try It Out ===
# One sample per prompt, using generate_joke's default sampling settings.
print("\nGENERATED JOKES:")
for prompt in ("what do you call", "why did the chicken", "how do you stop a robot"):
    print(generate_joke(model, prompt))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


100%|█████████▉| 399999/400000 [00:42<00:00, 9340.04it/s]


Epoch 1/200 | Loss: 8.4462
Epoch 2/200 | Loss: 6.0401
Epoch 3/200 | Loss: 5.7326
Epoch 4/200 | Loss: 5.6131
Epoch 5/200 | Loss: 5.5376
Epoch 6/200 | Loss: 5.4741
Epoch 7/200 | Loss: 5.4158
Epoch 8/200 | Loss: 5.3381
Epoch 9/200 | Loss: 5.2585
Epoch 10/200 | Loss: 5.1681
Epoch 11/200 | Loss: 5.0893
Epoch 12/200 | Loss: 5.0134
Epoch 13/200 | Loss: 4.9541
Epoch 14/200 | Loss: 4.8945
Epoch 15/200 | Loss: 4.8404
Epoch 16/200 | Loss: 4.7724
Epoch 17/200 | Loss: 4.7094
Epoch 18/200 | Loss: 4.6478
Epoch 19/200 | Loss: 4.5819
Epoch 20/200 | Loss: 4.5067
Epoch 21/200 | Loss: 4.4521
Epoch 22/200 | Loss: 4.3907
Epoch 23/200 | Loss: 4.3295
Epoch 24/200 | Loss: 4.2769
Epoch 25/200 | Loss: 4.2195
Epoch 26/200 | Loss: 4.1614
Epoch 27/200 | Loss: 4.1099
Epoch 28/200 | Loss: 4.0632
Epoch 29/200 | Loss: 4.0199
Epoch 30/200 | Loss: 3.9703
Epoch 31/200 | Loss: 3.9262
Epoch 32/200 | Loss: 3.8689
Epoch 33/200 | Loss: 3.8349
Epoch 34/200 | Loss: 3.7902
Epoch 35/200 | Loss: 3.7452
Epoch 36/200 | Loss: 3.7050
E

In [None]:
# === Extended Prompt List for Joke Generation ===
# Prompts are grouped by theme; `joke_prompts` is the flat list of all of
# them, in the order the groups and their entries are written here.
_PROMPT_GROUPS = {
    "classic joke starters": [
        "what do you call a",
        "why did the chicken",
        "why did the",
        "what did the",
        "how do you",
        "how do you make a",
        "how do you stop a",
        "what's the difference between",
        "what's worse than",
    ],
    "question-based setups": [
        "why don't",
        "why can't",
        "why do",
        "what happens when",
        "what do you get when",
        "what do you get if",
    ],
    "character-based setups": [
        "a man walks into",
        "two guys walk into",
        "the bartender says",
        "the doctor said",
        "my wife said",
    ],
    "animal jokes": [
        "what do you call a cow",
        "what do you call a dog",
        "what do you call a fish",
        "why don't elephants",
        "what did the cat",
    ],
    "profession jokes": [
        "why don't scientists",
        "what did the lawyer",
        "the teacher asked",
        "why do programmers",
    ],
    "food jokes": [
        "what did the grape",
        "why did the banana",
        "what's a skeleton's favorite",
    ],
    "tech/modern jokes": [
        "why do robots",
        "what did the computer",
        "why don't phones",
    ],
    "incomplete phrases to test completion": [
        "knock knock",
        "i told my wife",
        "my doctor said",
        "yesterday i went to",
    ],
}

joke_prompts = [
    prompt
    for group_prompts in _PROMPT_GROUPS.values()
    for prompt in group_prompts
]

# Show the total and the first five entries as a quick sanity check
print(f"Loaded {len(joke_prompts)} prompts for testing.")
for i in range(5):
    print(f"- {joke_prompts[i]}")


Loaded 39 prompts for testing.
- what do you call a
- why did the chicken
- why did the
- what did the
- how do you


In [None]:
import random

# How many samples to draw for each prompt
n_generations = 2

# Sampling settings shared by every generation in this cell
sampling_kwargs = dict(max_len=35, temperature=0.7, top_k=40)

# Walk through all loaded prompts and print the numbered samples for each
for prompt in joke_prompts:
    print(f"\nPrompt: '{prompt}'")
    for i in range(n_generations):
        joke = generate_joke(model, prompt, **sampling_kwargs)
        print(f"  {i+1}: {joke}")



Prompt: 'what do you call a'
  1: what do you call a midget psychic who just escaped from prison ? a small medium at large "
  2: what do you call a cow with one leg ? steak . "

Prompt: 'why did the chicken'
  1: why did the chicken cross the road ? to get away from gordon ramsey "
  2: why did the chicken cross the road ? to get to the moron 's house . * knock knock * <unk> <unk> ? * the chicken ... * "

Prompt: 'why did the'
  1: why did the fish go when it needed a filling ? well , it was two - tired "
  2: why did the boy take a pencil and paper to bed ? he was told to draw the curtains before going to sleep . "

Prompt: 'what did the'
  1: what did the bicycle fall over ? because it was two - tired "
  2: what did the chicken lay dyslexia ? "

Prompt: 'how do you'
  1: how do you catch a bra ? you set a booby trap . "
  2: how do you call a group of people standing in the arctic circle ? a finnish line . "

Prompt: 'how do you make a'
  1: how do you make a squid laugh ? ten tic