In [6]:
# ------------------ IMPORTS ------------------
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense
from tensorflow.keras.utils import to_categorical
import numpy as np

# ------------------ STEP 1: Data Preparation ------------------
# Original Urdu Poetry Lines (Each line is a list of tokens)
GhazalsU = [
    ['زخم', 'کھاتے', 'ہیں', 'اور', 'مسکراتے', 'ہیں', 'ہم'],
    ['حوصلہ', 'اپنا', 'خود', 'آزماتے', 'ہیں', 'ہم'],
    ['آ', 'لگا', 'ہے', 'کنارے', 'سفینہ', 'مگر'],
    ['شور', 'تو', 'عادتاً', 'ہی', 'مچاتے', 'ہیں', 'ہم'],
    ['ہم', 'جو', 'ڈوبیں', 'تو', 'کوئی', 'نہ', 'پھر', 'بچ', 'سکے'],
    ['ایسا', 'ساگر', 'میں', 'طوفاں', 'اٹھاتے', 'ہیں', 'ہم'],
    ['چور', 'کر', 'بھی', 'چکے', 'دل', 'کے', 'شیشے', 'کو', 'وہ'],
    ['اپنی', 'ہمت', 'ہے', 'پھر', 'چوٹ', 'کھاتے', 'ہیں', 'ہم'],
    ['بے', 'رخی', 'سے', 'جو', 'دل', 'توڑ', 'دیتے', 'ہیں', 'جوشؔ'],
    ['ان', 'کے', 'ہی', 'پیار', 'کے', 'گیت', 'گاتے', 'ہیں', 'ہم']
]

# Combine into sentences
urdu_lines_forward = [' '.join(line) for line in GhazalsU]
urdu_lines_backward = [' '.join(line[::-1]) for line in GhazalsU]  # reverse each line

# ------------------ STEP 2: Tokenizer ------------------
tokenizer = Tokenizer()
tokenizer.fit_on_texts(urdu_lines_forward + urdu_lines_backward)  # fit on both sets

vocab_size = len(tokenizer.word_index) + 1  # Total vocabulary

# ------------------ STEP 3: Convert to Sequences ------------------
forward_sequences = tokenizer.texts_to_sequences(urdu_lines_forward)
backward_sequences = tokenizer.texts_to_sequences(urdu_lines_backward)

# ------------------ STEP 4: N-Gram Sequence Generator ------------------
def generate_sequences(sequences):
    input_seqs = []
    for seq in sequences:
        for i in range(1, len(seq)):
            input_seqs.append(seq[:i+1])
    return input_seqs

forward_input_sequences = generate_sequences(forward_sequences)
backward_input_sequences = generate_sequences(backward_sequences)

# ------------------ STEP 5: Padding ------------------
max_len = max(max(len(seq) for seq in forward_input_sequences),
              max(len(seq) for seq in backward_input_sequences))

forward_input_sequences = pad_sequences(forward_input_sequences, maxlen=max_len, padding='pre')
backward_input_sequences = pad_sequences(backward_input_sequences, maxlen=max_len, padding='pre')

# ------------------ STEP 6: Split Inputs & Targets ------------------
X_f, Y_f = forward_input_sequences[:, :-1], forward_input_sequences[:, -1]
X_b, Y_b = backward_input_sequences[:, :-1], backward_input_sequences[:, -1]

# One-hot encode the labels
Y_f = to_categorical(Y_f, num_classes=vocab_size)
Y_b = to_categorical(Y_b, num_classes=vocab_size)

print(f"✅ Forward X: {X_f.shape}, Y: {Y_f.shape}")
print(f"🔁 Backward X: {X_b.shape}, Y: {Y_b.shape}")


✅ Forward X: (67, 8), Y: (67, 56)
🔁 Backward X: (67, 8), Y: (67, 56)


In [8]:
import pandas as pd
import re

# Load Excel file
df = pd.read_excel("ALL_POETRY_MAPPED.xlsx")

# Clean Urdu poetry line
def clean_urdu_line(text):
    if not isinstance(text, str):
        return ""
    text = text.replace("‌", "")  # Remove invisible Unicode character (ZWNJ etc.)
    text = re.sub(r'[،۔؟!"“”‘’؛\.-]', '', text)  # Remove Urdu-specific punctuation
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)  # Keep only Urdu characters and spaces
    return text.strip()

# Apply cleaning to Urdu and Roman Urdu columns
df["GhazalsU"] = df["GhazalsU"].astype(str).apply(clean_urdu_line)
df["GhazalsR"] = df["GhazalsR"].astype(str).apply(clean_urdu_line)

# Optional: Save cleaned data
df.to_csv("Cleaned_Urdu_Poetry.csv", index=False, encoding='utf-8-sig')

print("✅ Cleaned sample:")
print(df[["GhazalsU", "GhazalsR"]].head())


✅ Cleaned sample:
                                            GhazalsU GhazalsR
0  زخم کھاتے ہیں اور مسکراتے ہیں ہم حوصلہ اپنا خو...         
1  گزرے جو اپنے یاروں کی صحبت میں چار دن ایسا لگا...         
2  اتنا احسان تو ہم پر وہ خدارا کرتے اپنے ہاتھوں ...         
3  آہ بھی حرف دعا ہو جیسے اک دکھی دل کی صدا ہو جی...         
4  ہر ملاقات میں لگتے ہیں وہ بیگانے سے فائدہ کیا ...         


In [9]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import random

# Step 1: Convert poetry column to list of lines
poetry_lines = df["GhazalsU"].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x)).tolist()


# Step 2: Tokenizer (limit vocab size)

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(poetry_lines)

# Step 3: Create input sequences (shorter line length)
input_sequences = []
max_line_length = 40  # Lowered for faster processing

for line in poetry_lines:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, min(len(token_list), max_line_length)):
        input_sequences.append(token_list[:i+1])

# Step 4: Shuffle and limit total number of sequences
MAX_SEQUENCES = 30000
if len(input_sequences) > MAX_SEQUENCES:
    input_sequences = random.sample(input_sequences, MAX_SEQUENCES)

# Step 5: Pad sequences
max_seq_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# Step 6: Create X and y
X, y = input_sequences[:, :-1], input_sequences[:, -1]
vocab_size = len(tokenizer.word_index) + 1
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

print(f"✅ Total sequences: {len(input_sequences)}")
print(f"🟢 Vocabulary size: {vocab_size}")
print(f"📏 Max sequence length: {max_seq_len}")


✅ Total sequences: 30000
🟢 Vocabulary size: 25568
📏 Max sequence length: 40


In [9]:
# Save tokenizer
import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

# Use the same vocab size as defined earlier
vocab_size = len(tokenizer.word_index) + 1

model_forward = Sequential([
    Embedding(input_dim=vocab_size, output_dim=32, input_length=X.shape[1]),
    GRU(128, return_sequences=False),
    Dense(vocab_size, activation='softmax')
])

model_forward.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model_forward.summary()




In [None]:
# Reverse token sequences
X_reversed = np.array([x[::-1] for x in X])

# Use the same vocab size cap as in previous blocks
vocab_size = len(tokenizer.word_index) + 1

model_backward = Sequential([
    Embedding(input_dim=vocab_size, output_dim=32, input_length=X.shape[1]),
    GRU(128, return_sequences=False),
    Dense(vocab_size, activation='softmax')
])

model_backward.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model_backward.summary()


In [None]:
# Use a smaller batch size to avoid crashing
BATCH_SIZE = 32  # You can try 8 or 32 based on your GPU/TPU

# Train forward model
model_forward.fit(X, y, epochs=50, batch_size=BATCH_SIZE)

# Train backward model
model_backward.fit(X_reversed, y, epochs=50, batch_size=BATCH_SIZE)


Epoch 1/2
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 69ms/step - accuracy: 0.0327 - loss: 6.3188
Epoch 2/2
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 68ms/step - accuracy: 0.0403 - loss: 6.1589
Epoch 1/2
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 72ms/step - accuracy: 0.0325 - loss: 8.3918
Epoch 2/2
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 81ms/step - accuracy: 0.0334 - loss: 6.4897


<keras.src.callbacks.history.History at 0x224af9e7280>

In [15]:
model_forward.save("forward_model.h5")
model_backward.save("backward_model.h5")






In [1]:
from tensorflow.keras.models import load_model

# Load saved models
model_forward = load_model("forward_model.h5")
model_backward = load_model("backward_model.h5")




In [10]:
def generate_poetry(center_word, num_words=5):
    # Check if center word exists in the tokenizer's vocabulary
    if center_word not in tokenizer.word_index:
        print(f"❌ '{center_word}' not found in vocabulary.")
        return

    index_word = {v: k for k, v in tokenizer.word_index.items()}

    def generate_forward_sequence(seed_text, num_words):
        generated = seed_text
        for _ in range(num_words):
            token_list = tokenizer.texts_to_sequences([generated])[0]
            token_list = pad_sequences([token_list], maxlen=X.shape[1], padding='pre')
            prediction = model_forward.predict(token_list, verbose=0)
            predicted_index = np.argmax(prediction)
            next_word = index_word.get(predicted_index, '')
            if not next_word:
                break
            generated += ' ' + next_word
        return generated.split()

    def generate_backward_sequence(seed_text, num_words):
        generated = seed_text
        for _ in range(num_words):
            token_list = tokenizer.texts_to_sequences([generated])[0][::-1]  # reverse token list
            token_list = pad_sequences([token_list], maxlen=X.shape[1], padding='pre')
            prediction = model_backward.predict(token_list, verbose=0)
            predicted_index = np.argmax(prediction)
            next_word = index_word.get(predicted_index, '')
            if not next_word:
                break
            generated = next_word + ' ' + generated  # prepend
        return generated.split()

    # Generate forward and backward sequences
    forward_words = generate_forward_sequence(center_word, num_words)
    backward_words = generate_backward_sequence(center_word, num_words)

    # Remove duplicated center word from backward and reverse the sequence
    backward_words = backward_words[:-1]
    backward_words.reverse()

    # Combine the results
    final_poem = ' '.join(backward_words + [center_word] + forward_words[1:])
    print("📝 Generated Poetry:\n", final_poem)


In [11]:
generate_poetry("حساب", num_words=5)


📝 Generated Poetry:
 اے خوش کیا کی کیا حساب دن عشق سے رسوائیاں کرے


In [79]:
model_backward.summary()

In [80]:
model_forward.summary()

In [29]:
[line for line in poetry_lines if "دل" in line]


["['زخم', 'کھاتے', 'ہیں', 'اور', 'مسکراتے', 'ہیں', 'ہم', 'حوصلہ', 'اپنا', 'خود', 'آزماتے', 'ہیں', 'ہم', 'آ', 'لگا', 'ہے', 'کنارے', 'سفینہ', 'مگر', 'شور', 'تو', 'عادتاً', 'ہی', 'مچاتے', 'ہیں', 'ہم', 'ہم', 'جو', 'ڈوبیں', 'تو', 'کوئی', 'نہ', 'پھر', 'بچ', 'سکے', 'ایسا', 'ساگر', 'میں', 'طوفاں', 'اٹھاتے', 'ہیں', 'ہم', 'چور', 'کر', 'بھی', 'چکے', 'دل', 'کے', 'شیشے', 'کو', 'وہ', 'اپنی', 'ہمت', 'ہے', 'پھر', 'چوٹ', 'کھاتے', 'ہیں', 'ہم', 'بے', 'رخی', 'سے', 'جو', 'دل', 'توڑ', 'دیتے', 'ہیں', 'جوشؔ', 'ان', 'کے', 'ہی', 'پیار', 'کے', 'گیت', 'گاتے', 'ہیں', 'ہم']",
 "['گزرے', 'جو', 'اپنے', 'یاروں', 'کی', 'صحبت', 'میں', 'چار', 'دن', 'ایسا', 'لگا', 'بسر', 'ہوئے', 'جنت', 'میں', 'چار', 'دن', 'عمر', 'خضر', 'کی', 'اس', 'کو', 'تمنا', 'کبھی', 'نہ', 'ہو', 'انسان', 'جی', 'سکے', 'جو', 'محبت', 'میں', 'چار', 'دن', 'جب', 'تک', 'جیے', 'نبھائیں', 'گے', 'ہم', 'ان', 'سے', 'دوستی', 'اپنے', 'رہے', 'جو', 'دوست', 'مصیبت', 'میں', 'چار', 'دن', 'اے', 'جان', 'آرزو', 'وہ', 'قیامت', 'سے', 'کم', 'نہ', 'تھے', 'کاٹے', 'ترے', 'بغیر', '

In [92]:

print(tokenizer.word_index.get("شہر"))


105
