In [10]:
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load dataset: only the first 1000 rows are used, so let pandas stop parsing
# there instead of reading the whole file and slicing afterwards
df = pd.read_csv("dataset.csv", nrows=1000)

# Rows with no poetry text would reach the tokenizer as NaN (a float, not a
# string) and break tokenization, so drop them up front
df = df.dropna(subset=["Poetry"]).reset_index(drop=True)

# Preprocess text: lowercase & remove special characters
# (\w is Unicode-aware in Python, so accented letters such as "ī" or "ñ" are kept)
df["Poetry"] = df["Poetry"].astype(str).str.lower().str.replace(r'[^\w\s]', '', regex=True)

In [11]:
# Build the word-level vocabulary from the cleaned poetry lines
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["Poetry"])

# Encode each line as a list of word indices; Keras word indices start at 1,
# so the vocabulary size needs one extra slot for index 0 (used as padding)
sequences = tokenizer.texts_to_sequences(df["Poetry"])
vocab_size = 1 + len(tokenizer.word_index)

# Expand every encoded line into all of its prefixes of length >= 2:
# each prefix is one training sample (context words + the word to predict)
input_sequences = [
    seq[:end]
    for seq in sequences
    for end in range(2, len(seq) + 1)
]

# Optional speed-up: keep only the first half of the samples
# input_sequences = input_sequences[:len(input_sequences)//2]

# Left-pad all prefixes with zeros to the length of the longest one
max_seq_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_length, padding='pre')

# The last token of each row is the target; everything before it is the context
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the targets to match the categorical cross-entropy loss
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

In [12]:
# LSTM next-word model: embedding -> two stacked LSTM layers with dropout ->
# dense hidden layer -> softmax over the vocabulary
model = Sequential([
    # Explicit input spec. It replaces Embedding's `input_length` argument, which
    # is deprecated in Keras 3 (the notebook runs on TF 2.17), and it builds the
    # model right away so that `model.summary()` can report shapes and parameters.
    tf.keras.Input(shape=(max_seq_length - 1,)),
    Embedding(vocab_size, 50),  # 50-dimensional word embeddings
    LSTM(128, return_sequences=True),  # Returns the full sequence for the next LSTM
    Dropout(0.2),  # Dropout layer to reduce overfitting
    LSTM(64),  # Second LSTM layer, keeps only the final state
    Dropout(0.2),  # Dropout layer
    Dense(64, activation='relu'),  # Dense hidden layer
    Dense(vocab_size, activation='softmax')  # One probability per vocabulary index
])

# Compile Model (targets are one-hot encoded, hence categorical cross-entropy)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [13]:
# Fit the model for 100 epochs in mini-batches of 256 samples, with 10% of the
# data held out for validation; per-epoch progress is printed
model.fit(
    X,
    y,
    epochs=100,
    batch_size=256,
    validation_split=0.1,
    verbose=1,
)


Epoch 1/100
26/26 ━━━━━━━━━━━━━━━━━━━━ 45s 328ms/step - accuracy: 0.0159 - loss: 7.5570 - val_accuracy: 0.0308 - val_loss: 6.7131
Epoch 2/100
26/26 ━━━━━━━━━━━━━━━━━━━━ 7s 198ms/step - accuracy: 0.0263 - loss: 6.3756 - val_accuracy: 0.0559 - val_loss: 6.7429
Epoch 3/100
26/26 ━━━━━━━━━━━━━━━━━━━━ 11s 203ms/step - accuracy: 0.0391 - loss: 6.2346 - val_accuracy: 0.0559 - val_loss: 6.8355
Epoch 4/100
26/26 ━━━━━━━━━━━━━━━━━━━━ 5s 181ms/step - accuracy: 0.0401 - loss: 6.2107 - val_accuracy: 0.0559 - val_loss: 6.8889
Epoch 5/100
26/26 ━━━━━━━━━━━━━━━━━━━━ 5s 188ms/step - accuracy: 0.0415 - loss: 6.2292 - val_accuracy: 0.0559 - val_loss: 6.9344
Epoch 6/100
26/26 ━━━━━━━━━━━━━━━━━━━━ 5s 197ms/step - accuracy: 0.0451 - loss: 6.2148 - val_accuracy: 0.0559 - val_loss: 6.9966
Epoch 7/100
26/2… [output truncated]

<keras.src.callbacks.history.History at 0x2ab52915d30>

In [21]:
# Save Model & Tokenizer so text can be generated later without retraining.
# `pickle` is imported with the other modules at the top of the notebook.
# NOTE: only unpickle "tokenizer.pkl" from a trusted source; loading a pickle
# can execute arbitrary code.
model.save("poetry_generator.h5")
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("✅ Training Complete & Model Saved")



✅ Training Complete & Model Saved


In [22]:
# Temperature sampling helper used by the text generator
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    Args:
        preds: 1-D sequence of non-negative probabilities (e.g. a softmax output).
        temperature: Values below 1 sharpen the distribution (more conservative
            picks), values above 1 flatten it (more varied picks). A value
            <= 0 selects the most probable index deterministically.

    Returns:
        The sampled index into ``preds``.

    Raises:
        ValueError: From ``np.random.choice`` if ``preds`` carries no
            probability mass at all (e.g. all zeros).
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by a zero/negative temperature is undefined; treat it as the
    # limiting case of temperature -> 0, which is a greedy arg-max.
    if temperature <= 0:
        return int(np.argmax(preds))

    # log(0) is a legitimate -inf here (zero-probability entries stay at zero),
    # so silence the divide-by-zero warning it would otherwise emit.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift so the largest logit is 0 before exponentiating. The normalised
    # result is mathematically unchanged, but without the shift every exp()
    # can underflow to 0 at low temperatures, producing NaN probabilities.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(probs), p=probs)

In [23]:
def generate_text(seed_text, model, tokenizer, max_seq_length, temperature=1.0, num_lines=1, words_per_line=5):
    """Generate lines of text, each one continuing ``seed_text``.

    Every line starts again from ``seed_text`` and is extended one sampled
    word at a time; the growing line is re-encoded and fed back to the model
    for each new word.

    Args:
        seed_text: Text each generated line starts with.
        model: Model whose ``predict`` returns one probability per vocabulary
            index for a padded sequence of ``max_seq_length - 1`` tokens.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        max_seq_length: Padded sequence length used when preparing the
            training data (context plus target word).
        temperature: Sampling temperature passed on to ``sample``.
        num_lines: Number of lines to generate.
        words_per_line: Number of words appended to the seed on each line.

    Returns:
        The generated lines joined into one string, each ending with a newline.
    """
    context_length = max_seq_length - 1
    lines = []
    for _ in range(num_lines):  # Generate specified number of lines
        line = seed_text
        for _ in range(words_per_line):  # Generate words for each line
            token_list = tokenizer.texts_to_sequences([line])[0]
            token_list = pad_sequences([token_list], maxlen=context_length, padding='pre')
            preds = model.predict(token_list, verbose=0)[0]
            # Index 0 is the padding slot and has no entry in `index_word`, so
            # sampling it would raise KeyError. Sample from indices 1.. only
            # (`sample` renormalises) and shift the result back by one.
            next_word_idx = sample(preds[1:], temperature) + 1
            next_word = tokenizer.index_word[next_word_idx]
            line += " " + next_word
        lines.append(line + "\n")
    return "".join(lines)

# Example of getting user input and generating text
num_lines = int(input("Enter the number of lines to generate: "))
words_per_line = int(input("Enter the number of words in each line: "))
seed_text = input("Enter the seed text: ")

# Generate the text
generated_text = generate_text(
    seed_text,
    model,
    tokenizer,
    max_seq_length,
    temperature=0.8,
    num_lines=num_lines,
    words_per_line=words_per_line,
)

# Use a newline as the separator: with print's default " " separator the first
# generated line came out indented by one stray space.
print("Generated Text:", generated_text, sep="\n")

Generated Text:
 dil ki baat hī so sāmne yūñhī
dil ki baat le kā rishte hai
dil ki baat bhī rāsta khultā nahīñ
dil ki baat rahā se ulajh aae



In [24]:
# Record the TensorFlow release this notebook was run with
import tensorflow as tf
print(tf.version.VERSION)


2.17.0
