## Importing Libraries

In [37]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from sklearn.model_selection import train_test_split

## Loading Dataset

In [38]:
df = pd.read_csv('filtered_ghazals.csv')

In [39]:
print(f"Dataset shape: {df.shape}")
print("\nColumns:", df.columns.tolist())
df.head()

Dataset shape: (643, 2)

Columns: ['Poet', 'Poetry Text']


Unnamed: 0,Poet,Poetry Text
0,Unknown,kahnā ġhalat ġhalat to chhupānā sahīh sahīh qā...
1,Unknown,mast nazroñ se allāh bachā.e mah-jamāloñ se al...
2,Unknown,phirūñ DhūñDtā mai-kada tauba tauba mujhe āj-k...
3,Unknown,sahar qarīb hai tāroñ kā haal kyā hogā ab inti...
4,Unknown,merī āñkhoñ ko baḳhshe haiñ aañsū dil ko dāġh-...


## Tokenization using Sentence Piece Tokenizer

In [40]:
# Create word-based vocabulary
def create_word_vocab(texts, poet_names):
    # Split all texts into words, preserving newlines
    all_words = []
    for text in texts:
        # Split by whitespace while preserving \n
        words = []
        for line in text.split('\n'):
            words.extend(line.split())
            words.append('\n')  # Add newline as a token
        all_words.extend(words[:-1])  # Remove last extra \n

    # Add each word in poet names to vocabulary
    for poet in poet_names:
        all_words.extend(poet.split())  # Split multi-word poet names

    unique_words = sorted(list(set(all_words)))
    special_tokens = ['<pad>', '<unk>', '<poet>', '</poet>']
    vocab = special_tokens + unique_words

    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for idx, word in enumerate(vocab)}

    return word2idx, idx2word, len(vocab)


In [41]:
# Function to tokenize text with poet context
def tokenize_with_poet(text, poet_name, word2idx):
    poet_context = f"<poet> {poet_name} </poet>"
    words = []
    # Split text by lines and add \n tokens
    for line in text.split('\n'):
        words.extend(line.split())
        words.append('\n')
    words = words[:-1]  # Remove last extra \n

    full_text = poet_context.split() + words
    return [word2idx.get(word, word2idx['<unk>']) for word in full_text]

# Create vocabulary
poetry_texts = df['Poetry Text'].tolist()
poets = df['Poet'].unique().tolist()
word2idx, idx2word, vocab_size = create_word_vocab(poetry_texts, poets)

# Create tokenized dataset
tokenized_data = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    tokens = tokenize_with_poet(row['Poetry Text'], row['Poet'], word2idx)
    tokenized_data.append(tokens)


100%|██████████| 643/643 [00:00<00:00, 8460.29it/s]


In [42]:
# Convert to PyTorch tensors and pad sequences
token_lengths = [len(x) for x in tokenized_data]
max_len = max(token_lengths)

padded_data = []
for tokens in tokenized_data:
    padded = tokens + [word2idx['<pad>']] * (max_len - len(tokens))
    padded_data.append(padded)

tensor_data = torch.LongTensor(padded_data)

print(f"Tokenized data shape: {tensor_data.shape}")
print(f"Vocabulary size: {vocab_size}")


Tokenized data shape: torch.Size([643, 929])
Vocabulary size: 12035


## GRU

In [43]:
class GRU(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout=0.1):
        super().__init__()

        # Define class attributes
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout_rate = dropout

        # Create the layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        embed = self.embedding(x)
        output, hidden = self.gru(embed, hidden)
        output = self.dropout(output)
        prediction = self.fc(output)
        return prediction, hidden

In [44]:
# Create PyTorch Dataset
class PoetryDataset(torch.utils.data.Dataset):
    def __init__(self, data, sequence_length):
        self.data = data
        self.sequence_length = sequence_length
        self.sequences = []

        # Create sequences in correct shape
        for i in range(len(data)):
            # Use only the first sequence_length tokens if longer
            if len(data[i]) > sequence_length:
                seq = data[i][:sequence_length]
                target = data[i][1:sequence_length+1]
                self.sequences.append((seq, target))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence, target = self.sequences[idx]
        return torch.LongTensor(sequence), torch.LongTensor(target)


In [45]:
# Configuration dictionary
config = {
    # Model parameters
    'vocab_size': vocab_size,
    'embedding_dim': 256,
    'hidden_dim': 512,
    'num_layers': 2,
    'dropout': 0.1,

    # Training parameters
    'sequence_length': 100,
    'batch_size': 32,
    'epochs': 50,
    'learning_rate': 0.001,

    # Other settings
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    'save_path': 'poetry_generator.pth'
}

In [46]:
# Initialize model using config
model = GRU(
    vocab_size=config['vocab_size'],
    embedding_dim=config['embedding_dim'],
    hidden_dim=config['hidden_dim'],
    num_layers=config['num_layers'],
    dropout=config['dropout']
).to(config['device'])


In [47]:
# Convert tensor to numpy array and split into train and test
data_array = tensor_data.numpy()
train_data, test_data = train_test_split(data_array,test_size=0.1,random_state=42
)

# Create only training dataset
train_dataset = PoetryDataset(train_data, config['sequence_length'])

# Create only training dataloader
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=config['batch_size'],
    shuffle=True
)

In [48]:
optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<pad>'])

In [49]:

# Training loop
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0

    for batch_idx, (data, target) in enumerate(tqdm(dataloader)):
        # Ensure input is 2D: [batch_size, sequence_length]
        data = data.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        output, _ = model(data)  # GRU will handle making it 3D internally

        # Reshape output and target for loss calculation
        output = output.view(-1, output.size(-1))
        target = target.view(-1)

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [50]:
# Simpler training loop
print("Starting training...")
for epoch in range(config['epochs']):
    train_loss = train_epoch(model, train_dataloader, optimizer, criterion, config['device'])
    print(f'Epoch: {epoch+1}/{config["epochs"]}, Loss: {train_loss:.4f}')

    # Save the model on best loss
    if epoch == 0 or train_loss < best_loss:
        best_loss = train_loss
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'config': config
        }, config['save_path'])

print("Training completed and best model saved!")

Starting training...


100%|██████████| 19/19 [00:01<00:00, 15.59it/s]


Epoch: 1/50, Loss: 7.5167


100%|██████████| 19/19 [00:01<00:00, 16.77it/s]


Epoch: 2/50, Loss: 6.5543


100%|██████████| 19/19 [00:01<00:00, 16.66it/s]


Epoch: 3/50, Loss: 6.4333


100%|██████████| 19/19 [00:01<00:00, 16.68it/s]


Epoch: 4/50, Loss: 6.4574


100%|██████████| 19/19 [00:01<00:00, 16.79it/s]


Epoch: 5/50, Loss: 6.3784


100%|██████████| 19/19 [00:01<00:00, 16.70it/s]


Epoch: 6/50, Loss: 6.3373


100%|██████████| 19/19 [00:01<00:00, 16.68it/s]


Epoch: 7/50, Loss: 6.2647


100%|██████████| 19/19 [00:01<00:00, 16.47it/s]


Epoch: 8/50, Loss: 6.1541


100%|██████████| 19/19 [00:01<00:00, 16.43it/s]


Epoch: 9/50, Loss: 6.0519


100%|██████████| 19/19 [00:01<00:00, 16.51it/s]


Epoch: 10/50, Loss: 5.9710


100%|██████████| 19/19 [00:01<00:00, 16.46it/s]


Epoch: 11/50, Loss: 5.8128


100%|██████████| 19/19 [00:01<00:00, 16.21it/s]


Epoch: 12/50, Loss: 5.6953


100%|██████████| 19/19 [00:01<00:00, 16.30it/s]


Epoch: 13/50, Loss: 5.5746


100%|██████████| 19/19 [00:01<00:00, 16.34it/s]


Epoch: 14/50, Loss: 5.4105


100%|██████████| 19/19 [00:01<00:00, 16.27it/s]


Epoch: 15/50, Loss: 5.2894


100%|██████████| 19/19 [00:01<00:00, 16.22it/s]


Epoch: 16/50, Loss: 5.2107


100%|██████████| 19/19 [00:01<00:00, 16.17it/s]


Epoch: 17/50, Loss: 5.1337


100%|██████████| 19/19 [00:01<00:00, 16.07it/s]


Epoch: 18/50, Loss: 5.0519


100%|██████████| 19/19 [00:01<00:00, 16.04it/s]


Epoch: 19/50, Loss: 4.9602


100%|██████████| 19/19 [00:01<00:00, 15.97it/s]


Epoch: 20/50, Loss: 4.8849


100%|██████████| 19/19 [00:01<00:00, 16.01it/s]


Epoch: 21/50, Loss: 4.7782


100%|██████████| 19/19 [00:01<00:00, 16.03it/s]


Epoch: 22/50, Loss: 4.7036


100%|██████████| 19/19 [00:01<00:00, 16.05it/s]


Epoch: 23/50, Loss: 4.6168


100%|██████████| 19/19 [00:01<00:00, 15.99it/s]


Epoch: 24/50, Loss: 4.4920


100%|██████████| 19/19 [00:01<00:00, 15.96it/s]


Epoch: 25/50, Loss: 4.4360


100%|██████████| 19/19 [00:01<00:00, 16.03it/s]


Epoch: 26/50, Loss: 4.3030


100%|██████████| 19/19 [00:01<00:00, 15.98it/s]


Epoch: 27/50, Loss: 4.1987


100%|██████████| 19/19 [00:01<00:00, 15.94it/s]


Epoch: 28/50, Loss: 4.2109


100%|██████████| 19/19 [00:01<00:00, 16.14it/s]


Epoch: 29/50, Loss: 4.0438


100%|██████████| 19/19 [00:01<00:00, 16.16it/s]


Epoch: 30/50, Loss: 4.0092


100%|██████████| 19/19 [00:01<00:00, 16.12it/s]


Epoch: 31/50, Loss: 3.8673


100%|██████████| 19/19 [00:01<00:00, 16.23it/s]


Epoch: 32/50, Loss: 3.7999


100%|██████████| 19/19 [00:01<00:00, 16.30it/s]


Epoch: 33/50, Loss: 3.6789


100%|██████████| 19/19 [00:01<00:00, 16.29it/s]


Epoch: 34/50, Loss: 3.6122


100%|██████████| 19/19 [00:01<00:00, 16.25it/s]


Epoch: 35/50, Loss: 3.4761


100%|██████████| 19/19 [00:01<00:00, 16.26it/s]


Epoch: 36/50, Loss: 3.4151


100%|██████████| 19/19 [00:01<00:00, 16.46it/s]


Epoch: 37/50, Loss: 3.3398


100%|██████████| 19/19 [00:01<00:00, 16.51it/s]


Epoch: 38/50, Loss: 3.2345


100%|██████████| 19/19 [00:01<00:00, 16.37it/s]


Epoch: 39/50, Loss: 3.1119


100%|██████████| 19/19 [00:01<00:00, 16.43it/s]


Epoch: 40/50, Loss: 3.0215


100%|██████████| 19/19 [00:01<00:00, 16.44it/s]


Epoch: 41/50, Loss: 2.8934


100%|██████████| 19/19 [00:01<00:00, 16.43it/s]


Epoch: 42/50, Loss: 2.7875


100%|██████████| 19/19 [00:01<00:00, 16.54it/s]


Epoch: 43/50, Loss: 2.6309


100%|██████████| 19/19 [00:01<00:00, 16.32it/s]


Epoch: 44/50, Loss: 2.4955


100%|██████████| 19/19 [00:01<00:00, 16.40it/s]


Epoch: 45/50, Loss: 2.4728


100%|██████████| 19/19 [00:01<00:00, 16.46it/s]


Epoch: 46/50, Loss: 2.3632


100%|██████████| 19/19 [00:01<00:00, 16.55it/s]


Epoch: 47/50, Loss: 2.2337


100%|██████████| 19/19 [00:01<00:00, 16.41it/s]


Epoch: 48/50, Loss: 2.0731


100%|██████████| 19/19 [00:01<00:00, 16.44it/s]


Epoch: 49/50, Loss: 2.0203


100%|██████████| 19/19 [00:01<00:00, 16.46it/s]


Epoch: 50/50, Loss: 1.9028
Training completed and best model saved!


In [64]:
def generate_poetry(model, word2idx, idx2word, poet_name, seed_text=None, max_length=100, temperature=0.8, repetition_penalty=1.2, device='cuda'):
    model.eval()

    # Start with poet context
    context = f"<poet> {poet_name} </poet>"
    poet_context_length = len(context.split())

    if seed_text:
        context += " " + seed_text

    # Convert to tokens
    tokens = [word2idx.get(word, word2idx['<unk>']) for word in context.split()]
    initial_input = torch.LongTensor(tokens).unsqueeze(0).to(device)

    generated = tokens
    with torch.no_grad():
        hidden = None
        curr_input = initial_input

        for _ in range(max_length):
            output, hidden = model(curr_input, hidden)
            next_token_logits = output[0, -1, :] / temperature

            # Apply repetition penalty
            for idx in set(generated):
                next_token_logits[idx] /= repetition_penalty

            probs = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            if next_token.item() == word2idx['<pad>']:
                break

            generated.append(next_token.item())
            curr_input = next_token.view(1, 1)

    # Convert indices back to words
    result = []
    for idx in generated[poet_context_length:]:
        word = idx2word[idx]
        if word == '\n':
            result.append('\n')
        elif word not in ['<poet>', '</poet>']:
            result.append(word)

    generated_text = ' '.join(result).replace(' \n', '\n')
    return generated_text.strip()


In [65]:
# Load saved model
checkpoint = torch.load('poetry_generator.pth')
model.load_state_dict(checkpoint['model_state_dict'])

# Generate poetry
# Generate poetry
poets = [
    "Mirza Ghalib",
    "Dagh Dehlvi",
    "Bashir Badr",
    "Ahmad Faraz",
    "Faiz Ahmad Faiz",
    "Shahryar",
    "Jaun Eliya",
    "Hasrat Mohani",
    "Ahmad Mushtaq",
    "Shakeel Badayuni"
]
seed_texts = [
    "dil",
    "ishq",
    "mohabbat"
]

for poet in poets:
    for seed in seed_texts:
        print(f"\nPoet: {poet}")
        print(f"Seed: {seed}")
        generated = generate_poetry(
            model=model,
            word2idx=word2idx,
            idx2word=idx2word,
            poet_name=poet,
            seed_text=seed,
            temperature=0.1,
            repetition_penalty=1.2  # Add repetition penalty
        )
        print(f"Generated:\n{generated}\n")
        print("-"*50)

  checkpoint = torch.load('poetry_generator.pth')



Poet: Mirza Ghalib
Seed: dil
Generated:
dil hī to hai na sang-o-ḳhisht dard se bhī ik tamāshā huā kyā hai
 tū aur ārā.ish-e-ḳham-e-kākul ishq kī davā ho jaa.e
 yuuñ un ko na samjheñge ki mudda.ā thā jo ai shauq! kyā hai but-e-ā.ina-sīmā mire aage
 phir dekhiye andāz-e-gul-afshānī-e-guftār rakh de ham bin kyuuñ na mai gar hotā hai
 ye parī-chehra log kaise haiñ ġhamza o ishva o adā kyā hai
 sabza o gul kahāñ se aa.e haiñ abr kyā chiiz hai havā kyā hai
 ham ne maanā ki taġhāful na karoge lekin is qadar dushman-e-arbāb-e-vafā ho ga.e hote
 ab tak ghar

--------------------------------------------------

Poet: Mirza Ghalib
Seed: ishq
Generated:
ishq kahāñ hī sahī dil hī to hai ki visāl-e-yār hotā na bane kyā huā koī
 mere ban ke liye aa jaa.e hai us ko magar ai jazba-e-dil tire ḳhat pe e'tibār huuñ agar mire aage
 ġhair phirtā hai kuchh aisī ki bin aa.e na de vo bhale haiñ ki bin kahe na ho aur ham-zaban bhī koī nahīñ hotā
 ye kya le kar koi na karoge ham karte haiñ is qadar kyuuñ na sakū

In [66]:
# Calculate perplexity on test dataset
def calculate_perplexity(model, data, word2idx, device):
    model.eval()
    total_loss = 0
    total_tokens = 0

    # Create test dataset and dataloader
    test_dataset = PoetryDataset(data, config['sequence_length'])
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config['batch_size'],
        shuffle=False
    )

    criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<pad>'], reduction='sum')

    with torch.no_grad():
        for data, target in tqdm(test_dataloader, desc="Calculating perplexity"):
            data = data.to(device)
            target = target.to(device)

            output, _ = model(data)
            output = output.view(-1, output.size(-1))
            target = target.view(-1)

            # Calculate loss
            loss = criterion(output, target)

            # Count non-padding tokens
            non_pad_mask = target.ne(word2idx['<pad>'])
            num_tokens = non_pad_mask.long().sum().item()

            total_loss += loss.item()
            total_tokens += num_tokens

    # Calculate perplexity
    avg_loss = total_loss / total_tokens
    perplexity = np.exp(avg_loss)

    return perplexity

# Calculate and print perplexity on test dataset
print("\nCalculating perplexity on test dataset...")
perplexity = calculate_perplexity(model, test_data, word2idx, config['device'])
print(f"Test Perplexity: {perplexity:.2f}")


Calculating perplexity on test dataset...


Calculating perplexity: 100%|██████████| 3/3 [00:00<00:00, 39.69it/s]

Test Perplexity: 44.93



