In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd

In [5]:
# Load the jokes dataset (the input file was renamed to data.csv) and preview
# the first rows as the cell's rich output.
jokes_csv_path = '../stage_4_data/text_generation/data.csv'
df = pd.read_csv(jokes_csv_path)
df.head()

Unnamed: 0,ID,Joke
0,1,What did the bartender say to the jumper cable...
1,2,Don't you hate jokes about German sausage? The...
2,3,Two artists had an art contest... It ended in ...
3,4,Why did the chicken cross the playground? To g...
4,5,What gun do you use to hunt a moose? A moosecut!


In [7]:
class RNN(nn.Module):
    """Sequence classifier: embedding -> single nn.RNN -> linear head.

    Args:
        vocab_size: Number of distinct token ids accepted by the embedding.
        embedding_dim: Width of each token embedding.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of logits produced for each input sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: tensors are laid out as (batch, time, features).
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits computed from the RNN output at the final time step.

        Args:
            x: Integer tensor of token ids, indexed as (batch, time).

        Returns:
            Tensor of shape (batch, output_size).
        """
        embedded = self.embedding(x)
        per_step_states, _ = self.rnn(embedded)
        # Only the last time step feeds the classifier head.
        final_state = per_step_states[:, -1, :]
        return self.fc(final_state)

# Build one training corpus by joining every joke with a single space.
text = " ".join(df['Joke'].astype(str))

# Character-level vocabulary. Sorting makes the id assignment deterministic.
chars = sorted(set(text))
int_to_char = dict(enumerate(chars))
char_to_int = {ch: idx for idx, ch in int_to_char.items()}

# Model and training hyperparameters.
vocab_size = len(chars)
embedding_dim = 64         # tunable
hidden_size = 128          # tunable
output_size = vocab_size   # one logit per vocabulary character
learning_rate = 0.001
sequence_length = 100      # number of characters in each input sequence
epochs = 10                # number of passes over the training set
# Prepare training data: slide a window of `sequence_length` characters over
# the corpus one character at a time; the character immediately after each
# window is its prediction target.
encoded_text = [char_to_int[ch] for ch in text]
dataX = [
    encoded_text[start:start + sequence_length]
    for start in range(len(encoded_text) - sequence_length)
]
dataY = encoded_text[sequence_length:]

n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

# Ordered 80/20 split: the first 80% of the windows train, the rest validate.
# NOTE(review): windows advance one character at a time, so the validation
# windows just after split_idx share most of their characters with the last
# training windows.
split_idx = int(n_patterns * 0.8)
train_dataX, test_dataX = dataX[:split_idx], dataX[split_idx:]
train_dataY, test_dataY = dataY[:split_idx], dataY[split_idx:]

# Token ids must be int64 for nn.Embedding inputs and CrossEntropyLoss targets.
X_train, Y_train, X_test, Y_test = (
    torch.tensor(part, dtype=torch.long)
    for part in (train_dataX, train_dataY, test_dataX, test_dataY)
)

# Seed PyTorch's global RNG before anything random happens so that weight
# initialisation and training-batch shuffling are repeatable after a kernel
# restart.
seed = 42  # tunable
torch.manual_seed(seed)

batch_size = 64  # tunable; shared by the training and validation loaders

# Instantiate the model
model = RNN(vocab_size, embedding_dim, hidden_size, output_size)

# Training batches are reshuffled every epoch.
train_data = torch.utils.data.TensorDataset(X_train, Y_train)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Validation batches keep their original order.
val_data = torch.utils.data.TensorDataset(X_test, Y_test)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=False)

# CrossEntropyLoss takes the raw logits returned by the model together with
# integer class targets.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train for `epochs` passes, reporting the per-sample average loss on the
# training and validation sets after each pass.
for epoch in range(epochs):
    # Training phase
    model.train()
    train_loss_epoch = 0.0  # sum of per-sample losses seen this epoch
    train_samples = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        # `criterion` returns the batch mean; weight it by the batch size so a
        # short final batch does not count as much as a full one.
        batch_count = batch_y.size(0)
        train_loss_epoch += loss.item() * batch_count
        train_samples += batch_count
    # An empty loader yields nan instead of raising ZeroDivisionError.
    avg_train_loss = train_loss_epoch / train_samples if train_samples else float('nan')
    print(f'Epoch [{epoch+1}/{epochs}], Training Loss: {avg_train_loss:.4f}')

    # Validation phase
    model.eval()
    val_loss = 0.0  # sum of per-sample losses over the validation set
    val_samples = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for batch_X_val, batch_y_val in val_loader:
            # batch_X_val, batch_y_val = batch_X_val.to(device), batch_y_val.to(device) # Uncomment if using GPU
            outputs_val = model(batch_X_val)
            loss_val = criterion(outputs_val, batch_y_val)
            batch_count_val = batch_y_val.size(0)
            val_loss += loss_val.item() * batch_count_val
            val_samples += batch_count_val

    avg_val_loss = val_loss / val_samples if val_samples else float('nan')
    print(f'Epoch [{epoch+1}/{epochs}], Validation Loss: {avg_val_loss:.4f}')


print("Training complete.")

Total Patterns:  129504
Epoch [1/10], Training Loss: 2.3369
Epoch [1/10], Validation Loss: 2.0856
Epoch [2/10], Training Loss: 2.0199
Epoch [2/10], Validation Loss: 1.9706
Epoch [3/10], Training Loss: 1.9126
Epoch [3/10], Validation Loss: 1.9059
Epoch [4/10], Training Loss: 1.8466
Epoch [4/10], Validation Loss: 1.8802
Epoch [5/10], Training Loss: 1.7979
Epoch [5/10], Validation Loss: 1.8412
Epoch [6/10], Training Loss: 1.7601
Epoch [6/10], Validation Loss: 1.8221
Epoch [7/10], Training Loss: 1.7279
Epoch [7/10], Validation Loss: 1.8147
Epoch [8/10], Training Loss: 1.7025
Epoch [8/10], Validation Loss: 1.8129
Epoch [9/10], Training Loss: 1.6790
Epoch [9/10], Validation Loss: 1.8035
Epoch [10/10], Training Loss: 1.6583
Epoch [10/10], Validation Loss: 1.8010
Training complete.


In [13]:
# Generate text by repeatedly sampling the next character from the trained
# model and feeding it back in as context.
model.eval()  # Set model to evaluation mode

# Parameters for generation
start_string = "What do you"  # Or any other seed text
num_chars_to_generate = 200
temperature = 0.8  # Higher temperature results in more random, lower in more predictable text
if temperature <= 0:
    # The logits are divided by the temperature below.
    raise ValueError("temperature must be > 0")

# Convert start string to integers, dropping characters not in the vocabulary.
pattern = [char_to_int[char] for char in start_string if char in char_to_int]
if not pattern:
    print(f"Error: Seed string '{start_string}' contains no known characters or is too short after filtering.")
    # Fallback to a default known pattern if the seed is problematic
    first_key = next(iter(char_to_int))  # Get the first character from our vocab
    pattern = [char_to_int[first_key]] * min(sequence_length, 5)  # Use a short sequence of a known char
    start_string = "".join([int_to_char[p] for p in pattern])
    print(f"Using fallback seed: '{start_string}'")


generated_text = list(start_string)  # Use a list to append characters

print(f"Seed: \"{start_string}\"")
print("Generated text: ")
print("--------------------------")
print(start_string, end="")

with torch.no_grad():  # No need to track gradients
    for step in range(num_chars_to_generate):
        # Feed at most the last `sequence_length` characters, the window size
        # used to build the training sequences.
        current_sequence_input = pattern[-sequence_length:]

        # Prepare input tensor with a batch dimension of 1
        input_tensor = torch.tensor([current_sequence_input], dtype=torch.long)
        # input_tensor = input_tensor.to(device) # Uncomment if using GPU

        # Get model output (logits)
        output = model(input_tensor)

        # Temperature-scaled softmax: the same distribution as normalising
        # exp(logits / temperature), but it cannot overflow for large logits.
        output_dist = F.softmax(output.view(-1) / temperature, dim=0)
        # Sample from the distribution. For deterministic (greedy) decoding
        # use output.view(-1).argmax() instead.
        top_i = torch.multinomial(output_dist, 1)[0]

        # Get the character
        char_index = top_i.item()
        if char_index in int_to_char:
            char = int_to_char[char_index]
            generated_text.append(char)
            pattern.append(char_index)
            print(char, end="", flush=True)
        else:
            print(f"\nWarning: Predicted index {char_index} not in int_to_char map. Stopping generation.")
            break

        # Let the context grow up to `sequence_length` and only then slide.
        # Dropping one element per step unconditionally would pin the context
        # at the seed's length for the whole generation.
        pattern = pattern[-sequence_length:]

print("\n--------------------------")
print("\nGeneration complete.")

# The complete output is also available as "".join(generated_text).

Seed: "What do you"
Generated text: 
--------------------------
What do you call a cuitual to I was always can anterest for the race wicroan toollege Marnopant firlizan banes asked too vot own of Sairt the shower to blo meall in the find asks a turn say to the busts a Olamy 
--------------------------

Generation complete.
