<a href="https://colab.research.google.com/github/pratikshakaklij/PROJECT---AI/blob/main/AI_PROJECT_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>





This project aims to build an LSTM-based text generation model that learns poetic language patterns from a poem dataset and generates new, meaningful poems automatically.



STEP 1: Import Libraries

In [None]:
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

STEP 2: Load Poem Dataset

In [None]:
# Download (or reuse the local cache of) the Hugging Face "poem_sentiment"
# dataset. `load_dataset` is imported in the notebook's import cell so that
# every dependency is declared in one place.
dataset = load_dataset("poem_sentiment")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/6.16k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/892 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/105 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/104 [00:00<?, ? examples/s]

STEP 3: Use Subset (Training Control)

In [None]:

# Show which columns the training split exposes before picking the text one.
train_split = dataset['train']
print(train_split.column_names)

# Keep at most the first 30000 verses; the slice simply returns everything
# when the split is smaller than that.
texts = train_split['verse_text'][:30000]

['id', 'verse_text', 'label']


STEP 4: Text Cleaning

In [None]:
# Lower-case and trim every verse, discarding any that end up empty.
normalized = (verse.lower().strip() for verse in texts)
cleaned_text = [verse for verse in normalized if verse]

STEP 5: Tokenization

In [None]:
# Learn the word -> integer index mapping from the cleaned verses.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_text)

# Vocabulary size used for the embedding and output layers: one slot per
# known word plus one extra slot (index 0 is the value used for padding).
total_words = 1 + len(tokenizer.word_index)

STEP 6: Create Input Sequence

In [None]:
# Expand every verse into all of its leading n-grams of length >= 2, e.g.
# [a, b, c] -> [a, b], [a, b, c]. Verses with fewer than two known tokens
# contribute nothing.
input_sequences = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(cleaned_text)
    for end in range(2, len(encoded) + 1)
]

"the moon shines bright"
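-> ["the moon"]
-> ["the moon shines"]
-> ["the moon shines bright"]

Each sequence contains at least two words: the last word is the prediction target and the words before it are the model input.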


STEP 7: Padding Sequences

In [None]:
# The longest n-gram fixes the common width of the training matrix.
max_len = max(map(len, input_sequences))

# Left-pad shorter n-grams with zeros so the target word always sits in the
# last column of the resulting 2-D array.
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')


STEP 8: Split Input & Output

In [None]:
# Each row is one padded n-gram: every column but the last is the model
# input, and the last column holds the word index to predict.
X, y = input_sequences[:, :-1], input_sequences[:, -1]


The next two steps one-hot encode the target words (Step 9) and then build the LSTM model (Step 10).



STEP 9: One-Hot Encoding Output

In [None]:
# One-hot encode the next-word targets for use with categorical cross-entropy.
# The labels are read from the last column of the padded matrix rather than
# from `y` itself, so re-running this cell never one-hot encodes an array
# that has already been encoded.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)


STEP 10: Build LSTM Model

In [None]:
# Next-word classifier: embedding -> LSTM -> softmax over the vocabulary.
# The input width is declared with an explicit Input layer instead of the
# Embedding `input_length` argument, which Keras 3 deprecates and ignores;
# this also builds the model immediately, so `model.summary()` works before
# training.
model = Sequential([
    tf.keras.Input(shape=(max_len - 1,)),              # one padded n-gram prefix
    Embedding(total_words, 100),                       # 100-dimensional word vectors
    LSTM(150),                                         # final hidden state only
    Dense(total_words, activation='softmax'),          # distribution over next word
])




STEP 11: Compile Model

In [None]:
# Targets are one-hot vectors, hence categorical cross-entropy; accuracy is
# reported as the fraction of next words predicted exactly.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


STEP 12: Train Model

In [None]:
# Train on every (prefix, next word) pair for 15 passes over the data.
model.fit(
    X,
    y,
    batch_size=128,
    epochs=15,
)


Epoch 1/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 60ms/step - accuracy: 0.0458 - loss: 7.5174
Epoch 2/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 60ms/step - accuracy: 0.0618 - loss: 6.7875
Epoch 3/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 57ms/step - accuracy: 0.0585 - loss: 6.6106
Epoch 4/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 60ms/step - accuracy: 0.0585 - loss: 6.5969
Epoch 5/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 61ms/step - accuracy: 0.0563 - loss: 6.4656
Epoch 6/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 70ms/step - accuracy: 0.0650 - loss: 6.4412
Epoch 7/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 59ms/step - accuracy: 0.0688 - loss: 6.3048
Epoch 8/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step - accuracy: 0.0717 - loss: 6.2814
Epoch 9/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x79051bcd1610>

STEP 13: Poem Text Generation Function

In [None]:
def generate_poem(seed_text, next_words):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``max_len``.

    Args:
        seed_text: Starting text; words unknown to the tokenizer are dropped
            when the text is encoded.
        next_words: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Fewer than ``next_words`` words are appended if the model predicts an
        index that has no word (for example the padding index 0).
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Same left-padding and width as the training inputs.
        token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')

        # verbose=0 keeps Keras from printing a progress bar for every word.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # Direct reverse lookup instead of scanning the whole vocabulary.
        word = tokenizer.index_word.get(predicted)
        if word is None:
            # Decoding is greedy, so an unchanged seed would yield the same
            # unmapped index on every remaining step; stop here.
            break
        seed_text += " " + word
    return seed_text


STEP 14: Generate Sample Poems

In [None]:
# Generate a three-word continuation for each example seed phrase.
for seed in ("the moon", "love is", "in the night"):
    print(generate_poem(seed, 3))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
the moon of the heart
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
love is the heart of
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
in the night of the heart
