In [8]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from tqdm import tqdm

In [9]:
# Load dataset
df = pd.read_csv("data/1000_leetspeak.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Original   9999 non-null   object
 1   LeetSpeak  9999 non-null   object
dtypes: object(2)
memory usage: 156.4+ KB


In [10]:
# Initialize tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# change to cuda for GPU acceleration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [4]:
# torch.backends.cuda.is_available()

AttributeError: module 'torch.backends.cuda' has no attribute 'is_available'

In [11]:
class LeetSpeakDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=32):
        self.data = dataframe.to_dict(orient='records')  # Convert to list of dicts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        original = str(sample['Original'])
        leetspeak = str(sample['LeetSpeak'])

        source = self.tokenizer(
            "convert: " + original,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        target = self.tokenizer(
            leetspeak,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            "input_ids": source["input_ids"].squeeze(),
            "attention_mask": source["attention_mask"].squeeze(),
            "labels": target["input_ids"].squeeze()
        }

In [12]:
train_size = int(0.8 * len(df))
test_size = len(df) - train_size
indices = list(range(len(df)))
train_indices, test_indices = indices[:train_size], indices[train_size:]

full_dataset = LeetSpeakDataset(df, tokenizer)
train_dataset = Subset(full_dataset, train_indices)
test_dataset = Subset(full_dataset, test_indices)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)




In [None]:
EPOCHS = 5
model.train()
for epoch in range(EPOCHS):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

# Save model
model.save_pretrained("leetspeak_model")
tokenizer.save_pretrained("leetspeak_model")


Epoch 1: 100%|██████████| 1000/1000 [01:21<00:00, 12.22it/s, loss=0.61]
Epoch 2:  88%|████████▊ | 884/1000 [01:19<00:13,  8.66it/s, loss=0.52] 

In [None]:
def generate_leetspeak(text):
    model.eval()
    input_text = "convert: " + text
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

    with torch.no_grad():
        output_ids = model.generate(input_ids)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example inference
print(generate_leetspeak("hello"))