<a href="https://colab.research.google.com/github/saivardhanvemula/DlAssignment2/blob/main/DlAssignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Question 1

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split


In [3]:
TRAIN_PATH = '/content/bn.translit.sampled.train.tsv'
TEST_PATH = '/content/bn.translit.sampled.test.tsv'

train_data = pd.read_csv(TRAIN_PATH, sep='\t', header=None, names=['target', 'input', 'count'])
test_data = pd.read_csv(TEST_PATH, sep='\t', header=None, names=['target', 'input', 'count'])

train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=1)


In [4]:
def create_vocab(sequences):
    all_chars = sorted(set(char for seq in sequences.dropna() for char in seq))
    base_vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
    base_vocab.update({ch: idx+3 for idx, ch in enumerate(all_chars)})
    return base_vocab

input_stoi = create_vocab(train_data['input'])
output_stoi = create_vocab(train_data['target'])
itos_output = {i: ch for ch, i in output_stoi.items()}


In [5]:
class TranslitDataset(Dataset):
    def __init__(self, dataframe, in_vocab, out_vocab):
        self.data = dataframe
        self.in_vocab = in_vocab
        self.out_vocab = out_vocab

    def _encode(self, sequence, vocab, sos_eos=False):
        sequence = str(sequence)
        ids = [vocab[ch] for ch in sequence]
        if sos_eos:
            ids = [vocab['<sos>']] + ids + [vocab['<eos>']]
        return torch.tensor(ids)

    def __getitem__(self, idx):
        entry = self.data.iloc[idx]
        input_seq = self._encode(entry['input'], self.in_vocab)
        target_seq = self._encode(entry['target'], self.out_vocab, sos_eos=True)
        return input_seq, target_seq

    def __len__(self):
        return len(self.data)


In [6]:
def pad_collate(batch):
    src, tgt = zip(*batch)
    src_pad = nn.utils.rnn.pad_sequence(src, batch_first=True, padding_value=0)
    tgt_pad = nn.utils.rnn.pad_sequence(tgt, batch_first=True, padding_value=0)
    return src_pad, tgt_pad


In [7]:
train_dl = DataLoader(TranslitDataset(train_data, input_stoi, output_stoi), batch_size=64, shuffle=True, collate_fn=pad_collate)
val_dl = DataLoader(TranslitDataset(val_data, input_stoi, output_stoi), batch_size=64, collate_fn=pad_collate)
test_dl = DataLoader(TranslitDataset(test_data, input_stoi, output_stoi), batch_size=1, collate_fn=pad_collate)


In [8]:
class EncoderDecoder(nn.Module):
    def __init__(self, in_dim, out_dim, emb_dim=64, hid_dim=128, rnn_kind='GRU'):
        super().__init__()
        self.enc_emb = nn.Embedding(in_dim, emb_dim)
        self.dec_emb = nn.Embedding(out_dim, emb_dim)
        RNN = {'GRU': nn.GRU, 'LSTM': nn.LSTM, 'RNN': nn.RNN}[rnn_kind]
        self.encoder = RNN(emb_dim, hid_dim, batch_first=True)
        self.decoder = RNN(emb_dim, hid_dim, batch_first=True)
        self.output_layer = nn.Linear(hid_dim, out_dim)

    def forward(self, src, tgt):
        src_embed = self.enc_emb(src)
        _, hidden = self.encoder(src_embed)
        tgt_embed = self.dec_emb(tgt[:, :-1])
        dec_out, _ = self.decoder(tgt_embed, hidden)
        return self.output_layer(dec_out)


In [9]:
def run_train(model, loader, optim, criterion, device):
    model.train()
    total = 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        pred = model(x, y)
        loss = criterion(pred.view(-1, pred.size(-1)), y[:, 1:].contiguous().view(-1))
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += loss.item()
    return total / len(loader)

def run_eval(model, loader, criterion, device):
    model.eval()
    total = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            pred = model(x, y)
            loss = criterion(pred.view(-1, pred.size(-1)), y[:, 1:].contiguous().view(-1))
            total += loss.item()
    return total / len(loader)


In [10]:
def greedy_decode(model, dataset, device, max_len=30):
    model.eval()
    results = []
    with torch.no_grad():
        for src, _ in dataset:
            src = src.unsqueeze(0).to(device)
            enc_out, hidden = model.encoder(model.enc_emb(src))
            dec_input = torch.tensor([[output_stoi['<sos>']]], device=device)
            decoded = []
            for _ in range(max_len):
                out, hidden = model.decoder(model.dec_emb(dec_input), hidden)
                prob = model.output_layer(out.squeeze(1))
                pred_id = prob.argmax(-1).item()
                if pred_id == output_stoi['<eos>']:
                    break
                decoded.append(itos_output[pred_id])
                dec_input = torch.tensor([[pred_id]], device=device)
            results.append("".join(decoded))
    return results


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EncoderDecoder(len(input_stoi), len(output_stoi)).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=0)


In [20]:
for ep in range(1, 21):
    tr_loss = run_train(model, train_dl, opt, criterion, device)
    vl_loss = run_eval(model, val_dl, criterion, device)
    print(f"Epoch {ep}: Train Loss = {tr_loss:.4f} | Val Loss = {vl_loss:.4f}")


Epoch 1: Train Loss = 0.1133 | Val Loss = 0.2865
Epoch 2: Train Loss = 0.1123 | Val Loss = 0.2797
Epoch 3: Train Loss = 0.1106 | Val Loss = 0.2775
Epoch 4: Train Loss = 0.1104 | Val Loss = 0.2783
Epoch 5: Train Loss = 0.1086 | Val Loss = 0.2797
Epoch 6: Train Loss = 0.1100 | Val Loss = 0.2828
Epoch 7: Train Loss = 0.1084 | Val Loss = 0.2814
Epoch 8: Train Loss = 0.1087 | Val Loss = 0.2800
Epoch 9: Train Loss = 0.1065 | Val Loss = 0.2844
Epoch 10: Train Loss = 0.1073 | Val Loss = 0.2817
Epoch 11: Train Loss = 0.1077 | Val Loss = 0.2788
Epoch 12: Train Loss = 0.1065 | Val Loss = 0.2819
Epoch 13: Train Loss = 0.1055 | Val Loss = 0.2791
Epoch 14: Train Loss = 0.1043 | Val Loss = 0.2749
Epoch 15: Train Loss = 0.1046 | Val Loss = 0.2797
Epoch 16: Train Loss = 0.1043 | Val Loss = 0.2769
Epoch 17: Train Loss = 0.1040 | Val Loss = 0.2848
Epoch 18: Train Loss = 0.1023 | Val Loss = 0.2839
Epoch 19: Train Loss = 0.1049 | Val Loss = 0.2794
Epoch 20: Train Loss = 0.1011 | Val Loss = 0.2804


In [21]:
test_preds = greedy_decode(model, TranslitDataset(test_data, input_stoi, output_stoi), device)
ground_truths = test_data['target'].tolist()
acc = sum(p.strip() == t.strip() for p, t in zip(test_preds, ground_truths)) / len(ground_truths)
print(f"\nFinal Test Accuracy: {acc * 100:.2f}%")



Final Test Accuracy: 9.07%


In [22]:
# Create dataset and dataloader for test set
test_dataset = TranslitDataset(test_data, input_stoi, output_stoi)
test_dl = DataLoader(test_dataset, batch_size=1, collate_fn=pad_collate)

# Decode predictions on test set
test_preds = greedy_decode(model, test_dataset, device)

# Actual targets
ground_truths = test_data['target'].tolist()

# Accuracy calculation
correct = sum(pred.strip() == tgt.strip() for pred, tgt in zip(test_preds, ground_truths))
accuracy = correct / len(ground_truths)

print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 9.07%


In [23]:
import random

print("\nSample Predictions:")
for i in random.sample(range(len(test_preds)), 10):
    print(f"Input: {test_data.iloc[i]['input']} | Predicted: {test_preds[i]} | Target: {test_data.iloc[i]['target']}")



Sample Predictions:
Input: gomoti | Predicted: গোমাতি | Target: গোমতি
Input: torunder | Predicted: তরণুদের | Target: তরুণদের
Input: haiben | Predicted: হাইবেন | Target: হইবেন
Input: architeectural | Predicted: আর্কিটেকচার | Target: আর্কিটেকচারাল
Input: myathu | Predicted: গঠুঃ | Target: ম্যাথু
Input: hoolmar | Predicted: হোল্মার | Target: হোমার
Input: guesht | Predicted: গণিত্য | Target: গেস্ট
Input: kaander | Predicted: কান্দের | Target: কাণ্ডের
Input: nishchit | Predicted: নিশিক | Target: নিশ্চিত
Input: matritwa | Predicted: মাতৃত্ব | Target: মাতৃত্ব


Question 2

In [None]:
!pip install -q kaggle transformers datasets


In [2]:
import kagglehub
import tensorflow as tf
from transformers import GPT2TokenizerFast, TFGPT2LMHeadModel, create_optimizer
import time
import numpy as np


In [3]:
# # Upload your kaggle.json file here (if using Google Colab)
# from google.colab import files
# files.upload()  # Upload the kaggle.json file here

# # Set the environment variable for Kaggle authentication
# import os
# os.environ['KAGGLE_CONFIG_DIR'] = '/content'  # If uploaded to /content in Colab


In [17]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("paultimothymooney/poetry")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/poetry


In [18]:
# Load the lyrics file for the chosen artist (example: Michael Jackson)
file_path = f"{path}/michael-jackson.txt"  # Update to your file location

def load_file():
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

text = load_file()


In [19]:
# Initialize the tokenizer
model_id = "gpt2-medium"
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)


In [20]:
# Define constants
MAX_LENGTH = 256
BATCH_SIZE = 2

# Tokenize the entire corpus
tokenized_data = tokenizer(
    text,
    truncation=True,
    max_length=MAX_LENGTH,
    return_overflowing_tokens=True,
    return_length=True
)

# Check tokenized data for debugging
print(tokenized_data)


{'input_ids': [[198, 198, 58, 4561, 4233, 37219, 47715, 198, 1639, 1683, 765, 1223, 220, 198, 5562, 345, 760, 345, 6584, 470, 423, 220, 198, 464, 517, 345, 760, 345, 6584, 470, 423, 340, 11, 220, 198, 464, 517, 345, 765, 340, 220, 198, 1870, 788, 530, 1110, 345, 651, 340, 11, 220, 198, 1026, 338, 523, 922, 1165, 220, 198, 1537, 340, 338, 655, 588, 616, 2576, 220, 198, 2215, 673, 338, 1088, 502, 220, 198, 40, 655, 1254, 523, 922, 11, 523, 922, 220, 198, 1537, 826, 783, 314, 655, 1254, 4692, 11, 523, 4692, 220, 198, 11028, 866, 284, 616, 11945, 220, 198, 6, 42323, 267, 1219, 986, 220, 198, 32, 259, 470, 645, 34488, 618, 673, 338, 3750, 220, 198, 1026, 338, 407, 5814, 618, 673, 338, 1497, 220, 198, 32, 259, 470, 645, 34488, 618, 673, 338, 3750, 220, 198, 1870, 673, 338, 1464, 3750, 1165, 890, 220, 198, 7149, 2435, 673, 2925, 1497, 220, 198, 198, 42337, 428, 640, 810, 673, 338, 3750, 220, 198, 42337, 611, 673, 338, 3750, 284, 2652, 220, 198, 32, 259, 470, 645, 34488, 618, 673, 338, 3750, 2

In [25]:
# Chunk the input text to avoid exceeding memory
text_chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
def preprocess(example):
    try:
        outputs = tokenizer(
            example,
            truncation=True,
            max_length=MAX_LENGTH,
            return_overflowing_tokens=True,
            return_length=True,
        )

        input_batch = []
        for input_ids, length in zip(outputs["input_ids"], outputs["length"]):
            if length == MAX_LENGTH:
                input_batch.append(input_ids)

        if not input_batch:
            return {"input_ids": [], "labels": []}

        # Pad if batch is smaller than BATCH_SIZE
        while len(input_batch) < BATCH_SIZE:
            input_batch.append(input_batch[-1])  # pad with last valid sequence

        input_batch = input_batch[:BATCH_SIZE]  # ensure correct batch size

    except Exception as e:
        return {"input_ids": [], "labels": []}

    return {
        "input_ids": input_batch,
        "labels": input_batch  # language modeling: input_ids == labels
    }
def gen():
    for chunk in text_chunks:  # you must chunk long text into small parts
        yield preprocess(chunk)

tf_train_dataset = tf.data.Dataset.from_generator(
    gen,
    output_signature={
        "input_ids": tf.TensorSpec(shape=(BATCH_SIZE, MAX_LENGTH), dtype=tf.int32),
        "labels": tf.TensorSpec(shape=(BATCH_SIZE, MAX_LENGTH), dtype=tf.int32),
    }
)


In [26]:
def gen():
    for chunk in text_chunks:
        processed = preprocess(chunk)
        if processed["input_ids"]:
            yield {
                "input_ids": tf.constant(processed["input_ids"], dtype=tf.int32),
                "labels": tf.constant(processed["labels"], dtype=tf.int32)
            }

# Build TensorFlow dataset
tf_train_dataset = tf.data.Dataset.from_generator(
    gen,
    output_signature={
        "input_ids": tf.TensorSpec(shape=(BATCH_SIZE, MAX_LENGTH), dtype=tf.int32),
        "labels": tf.TensorSpec(shape=(BATCH_SIZE, MAX_LENGTH), dtype=tf.int32),
    }
)


In [27]:
# Calculate training steps
num_train_steps = len(text_chunks) * BATCH_SIZE
print(f"Number of training steps: {num_train_steps}")

# Load model
model = TFGPT2LMHeadModel.from_pretrained(model_id)

# Create optimizer
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=1000,
    num_train_steps=num_train_steps
)

Number of training steps: 504


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [28]:
# Load the GPT-2 model
# model = TFGPT2LMHeadModel.from_pretrained(model_id)

# # Create optimizer and schedule
# optimizer, schedule = create_optimizer(
#     init_lr=5e-5,
#     num_warmup_steps=1_000,
#     num_train_steps=num_train_steps
# )

# Compile the model
model.compile(optimizer=optimizer)


In [29]:
# Train the model
model.fit(tf_train_dataset, epochs=3)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x785231549bd0>

In [30]:
model.save_pretrained("./fine_tuned_gpt2_poetry")
tokenizer.save_pretrained("./fine_tuned_gpt2_poetry")


('./fine_tuned_gpt2_poetry/tokenizer_config.json',
 './fine_tuned_gpt2_poetry/special_tokens_map.json',
 './fine_tuned_gpt2_poetry/vocab.json',
 './fine_tuned_gpt2_poetry/merges.txt',
 './fine_tuned_gpt2_poetry/added_tokens.json',
 './fine_tuned_gpt2_poetry/tokenizer.json')

In [31]:
# Generate text
input_text = "true love shouldn't be this complicated"
input_ids = tokenizer(input_text, return_tensors="tf")["input_ids"]

start_time = time.time()
output_temp = model.generate(
    input_ids,
    max_length=256,
    do_sample=True,
    temperature=1.0,
    top_k=50
)
print(tokenizer.decode(output_temp[0], skip_special_tokens=True))
print(f"Generation Time: {time.time() - start_time:.2f} seconds")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


true love shouldn't be this complicated
This one, girl
Tell me when
Will you come with me
Your eyes will never fade
My soul is filled with love and
Nothing can control me, let's make it, it's my only choice I tell ya

[Chorus:]
When I'm lonely, I lie down
There's nothing I can do

[Michael]
How 'bout some love
When it's cold
It's my last wish to live
Even when my brother's dead
I'll sing for the things I can't see
Just know that love can fill the world
And that time together is the right time

I've got no life on earth
Just for the thrill, the thrill
That this fantasy is alive
I'll give it my very last shot

[Verse 2:]


How did you get on the side of the road
When there's nothing more to lose


You know what I like
I've never been alone
I can talk about anything without feeling lonely
But I'll always feel down
Don't be afraid,
Feel the pain like my father always did
Oh, don't cry, just wait
And pray
She was
Generation Time: 105.16 seconds
