<a href="https://colab.research.google.com/github/saurabhmungale/DataScience_Assignements/blob/main/Gen_AI_Intro_%26_Text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Que.1.Write a code to generate a random sentence using probabilistic modeling
(Markov Chain). Use the sentence "The cat is on the mat" as an example

In [None]:
import random
from collections import defaultdict

# Input sentence
sentence = "The cat is on the mat"

# Step 1: Tokenize the sentence into words
words = sentence.split()
# Add a special token for the end of the sentence
words.append("<END>")

# Step 2: Build the transition matrix (Markov Chain)
transition_matrix = defaultdict(list)
for i in range(len(words) - 1):
    current_word = words[i]
    next_word = words[i + 1]
    transition_matrix[current_word].append(next_word)

# Step 3: Generate a random sentence
def generate_sentence(transition_matrix, start_word="The"):
    current_word = start_word
    generated_sentence = [current_word]

    while current_word != "<END>":
        # Randomly choose the next word based on the current word
        next_words = transition_matrix[current_word]
        current_word = random.choice(next_words)
        if current_word != "<END>":
            generated_sentence.append(current_word)

    return " ".join(generated_sentence)

# Example usage
random_sentence = generate_sentence(transition_matrix)
print("Generated Sentence:", random_sentence)


Generated Sentence: The cat is on the mat


Que.2.Build a simple Autoencoder model using Keras to learn a compressed
representation of a given sentence. Use a dataset of your choice

In [None]:
!pip install tensorflow
!pip install keras

import numpy as np
from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM, RepeatVector, TimeDistributed
from keras.utils import pad_sequences
from keras.preprocessing.sequence import pad_sequences
from keras.datasets import imdb


# Parameters
max_vocab_size = 5000
max_sequence_length = 100
embedding_dim = 50
latent_dim = 64

# Load and preprocess the dataset
(x_train, _), (x_test, _) = imdb.load_data(num_words=max_vocab_size)

# Pad sequences
x_train = pad_sequences(x_train, maxlen=max_sequence_length, padding='post')
x_test = pad_sequences(x_test, maxlen=max_sequence_length, padding='post')

# Step 1: Build the Autoencoder
# Encoder
input_sentence = Input(shape=(max_sequence_length,))
embedded_sentence = Embedding(max_vocab_size, embedding_dim, input_length=max_sequence_length)(input_sentence)
encoded = LSTM(latent_dim)(embedded_sentence)

# Decoder
decoded = RepeatVector(max_sequence_length)(encoded)
decoded = LSTM(embedding_dim, return_sequences=True)(decoded)
decoded = TimeDistributed(Dense(max_vocab_size, activation='softmax'))(decoded)

# Autoencoder model
autoencoder = Model(input_sentence, decoded)

# Compile the model
autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Step 2: Prepare data for autoencoder
# Add a dimension to match the output shape
x_train_expanded = np.expand_dims(x_train, -1)
x_test_expanded = np.expand_dims(x_test, -1)

# Train the model
autoencoder.fit(
    x_train,
    x_train_expanded,
    epochs=10,
    batch_size=64,
    validation_data=(x_test, x_test_expanded)
)

# Encoder model (for compressed representations)
encoder = Model(input_sentence, encoded)

# Step 3: Test encoding and decoding
sample_sentence = x_test[0:1]  # Pick a sample test sentence
compressed = encoder.predict(sample_sentence)
print("Compressed Representation:", compressed)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step




Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1075s[0m 3s/step - loss: 6.6693 - val_loss: 5.9831
Epoch 2/10
[1m  5/391[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12:14[0m 2s/step - loss: 6.0071

Que.3.Use the Hugging Face transformers library to fine-tune a pre-trained GPT-2
model on a custom text data and generate textI

In [None]:
pip install transformers datasets
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Step 1: Load GPT-2 tokenizer and model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Step 2: Prepare dataset
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

def create_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

# File path to custom data
custom_data_path = "custom_data.txt"

# Load dataset and data collator
dataset = load_dataset(custom_data_path, tokenizer)
data_collator = create_data_collator(tokenizer)

# Step 3: Set up training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-fine-tuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir="./logs",
)

# Step 4: Fine-tune GPT-2
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

trainer.train()

# Step 5: Save the fine-tuned model
model.save_pretrained("./gpt2-fine-tuned")
tokenizer.save_pretrained("./gpt2-fine-tuned")

# Step 6: Generate text with the fine-tuned model
from transformers import pipeline

generator = pipeline("text-generation", model="./gpt2-fine-tuned", tokenizer=tokenizer)
output = generator("Once upon a time", max_length=50, num_return_sequences=1)
print("Generated Text:", output[0]["text"])


Que,4,Implement a text generation model using a simple Recurrent Neural
Network (RNN) in Keras. Train the model on a custom data and generate a
word

In [3]:
import numpy as np
#from keras.preprocessing.text import Tokenizer
#from keras.utils import to_categorical
#from keras.preprocessing.sequence import pad_sequences
#from keras.models import Sequential
#from keras.layers import Embedding, SimpleRNN, Dense

# Instead of:
# from keras.preprocessing.text import Tokenizer

# Use:
from tensorflow.keras.preprocessing.text import Tokenizer

# Instead of:
# from keras.utils import to_categorical

# Use:
from tensorflow.keras.utils import to_categorical

# Instead of:
# from keras.preprocessing.sequence import pad_sequences

# Use:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Instead of:
# from keras.models import Sequential

# Use:
from tensorflow.keras.models import Sequential

# Instead of:
# from keras.layers import Embedding, SimpleRNN, Dense

# Use:
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Step 1: Load and preprocess the data
# Replace with the path to your custom dataset
custom_text = """
Once upon a time, there was a curious cat.
The cat loved to explore the world.
One day, the cat found a treasure in the woods.
"""

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([custom_text])
total_words = len(tokenizer.word_index) + 1  # Total vocabulary size
print("Vocabulary Size:", total_words)

# Create sequences of words
input_sequences = []
for line in custom_text.split("\n"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# Pad sequences and prepare predictors/labels
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

X = input_sequences[:, :-1]  # Predictors
y = input_sequences[:, -1]   # Labels
y = to_categorical(y, num_classes=total_words)

# Step 2: Build the RNN Model
model = Sequential([
    Embedding(total_words, 10, input_length=max_sequence_len - 1),
    SimpleRNN(100, return_sequences=False),
    Dense(total_words, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Step 3: Train the Model
history = model.fit(X, y, epochs=500, verbose=1)

# Step 4: Generate Text
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == np.argmax(predicted):
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Generate text
seed_text = "Once upon"
generated_text = generate_text(seed_text, next_words=10, model=model, max_sequence_len=max_sequence_len)
print("Generated Text:", generated_text)


Vocabulary Size: 20




Epoch 1/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.0000e+00 - loss: 3.0020
Epoch 2/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 665ms/step - accuracy: 0.0870 - loss: 2.9738
Epoch 3/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.2174 - loss: 2.9450
Epoch 4/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.2174 - loss: 2.9141
Epoch 5/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.2174 - loss: 2.8796
Epoch 6/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.2174 - loss: 2.8408
Epoch 7/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.2609 - loss: 2.7983
Epoch 8/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.2174 - loss: 2.7556
Epoch 9/500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

Que.5.!I Write a program to generate a sequence of text using an LSTM-based
model in TensorFlow, trained on a custom data of sentences

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Step 1: Load your custom dataset
custom_text = """
Once upon a time, there was a curious cat.
The cat loved to explore the world around it.
One day, the cat discovered a treasure hidden in the forest.
It was a magical adventure that changed the cat's life forever.
"""

# Step 2: Preprocess the data
# Tokenize the text to convert it into a sequence of integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts([custom_text])
total_words = len(tokenizer.word_index) + 1  # Total vocabulary size
print(f"Vocabulary size: {total_words}")

# Create input sequences and the corresponding labels
input_sequences = []
for line in custom_text.split("\n"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# Pad sequences to ensure uniform length
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Prepare predictors (X) and labels (y)
X = input_sequences[:, :-1]  # All but last word for predictors
y = input_sequences[:, -1]   # Last word for labels
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Step 3: Build the LSTM-based model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))
model.add(LSTM(150, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(total_words, activation='softmax'))

model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Step 4: Train the model
model.fit(X, y, epochs=500, verbose=1)

# Step 5: Generate text using the trained model
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    for _ in range(next_words):
        # Tokenize the seed text
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # Predict the next word
        predicted_probabilities = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted_probabilities)

        # Convert the word index back to the word
        predicted_word = tokenizer.index_word[predicted_word_index]
        seed_text += " " + predicted_word

    return seed_text

# Step 6: Generate a sentence from a seed text
seed_text = "Once upon a time"
generated_text = generate_text(seed_text, next_words=10, model=model, tokenizer=tokenizer, max_sequence_len=max_sequence_len)
print("Generated Text:", generated_text)


Vocabulary size: 30




Epoch 1/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 81ms/step - accuracy: 0.0000e+00 - loss: 3.4005
Epoch 2/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.0868 - loss: 3.3879
Epoch 3/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.2025 - loss: 3.3775
Epoch 4/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.2315 - loss: 3.3637
Epoch 5/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.2211 - loss: 3.3513
Epoch 6/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.1447 - loss: 3.3327
Epoch 7/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.2604 - loss: 3.3087
Epoch 8/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.1921 - loss: 3.2814
Epoch 9/500
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[

Que.6. Build a program that uses GPT-2 from Hugging Face to generate a story
based on a custom prompt

In [6]:
!pip install transformers torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Step 1: Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # You can use different sizes like "gpt2-medium", "gpt2-large", etc.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Step 2: Set the model to evaluation mode (inference)
model.eval()

# Step 3: Define a function to generate a story based on a prompt
def generate_story(prompt, max_length=300, num_return_sequences=1):
    # Tokenize the input prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt")

    # Generate the text using the model
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,  # Total length of the generated text
            num_return_sequences=num_return_sequences,  # Number of texts to generate
            no_repeat_ngram_size=2,  # Prevent repetition of n-grams
            temperature=0.7,  # Controls randomness: 1.0 for randomness, <1 for more deterministic
            top_k=50,  # Controls diversity by sampling from top-k tokens
            top_p=0.95,  # Controls diversity by sampling from top-p tokens
            pad_token_id=tokenizer.eos_token_id  # EOS token for padding
        )

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Step 4: Generate a story based on a custom prompt
prompt = "Once upon a time, in a land far away, there was a mysterious forest where"
story = generate_story(prompt, max_length=300)

# Step 5: Print the generated story
print(story)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Once upon a time, in a land far away, there was a mysterious forest where the sun was shining.

The sun had been shining for a long time. It was the first time that the moon had shone. The moon was about to be born. And the sky was filled with the light of the stars. But the night was still dark. There was no light. No light at all. Only the darkness. A dark place. That was where I was born, and where my father was. I had no idea what to do. My father had died. He had lost his life. His father's death was not a matter of fate. In fact, it was an event that had happened in the past. When I first came to this world, I knew that I would never be able to live in this place again. So I decided to go to the forest. To the place where there were no trees. Because there wasn't any. This place was called the Forest of Light. As I walked, my eyes were filled to overflowing with light, but I couldn't see anything. Instead, the only light I could see was my own eyes. They were all filled up with d

Que.7. Write a code to implement a simple text generation model using a GRUbased architecture in Keras

In [8]:
import numpy as np
# Instead of:
# from keras.preprocessing.text import Tokenizer

# Use:
from tensorflow.keras.preprocessing.text import Tokenizer

# Instead of:
# from keras.utils import to_categorical

# Use:
from tensorflow.keras.utils import to_categorical

# Instead of:
# from keras.preprocessing.sequence import pad_sequences

# Use:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Instead of:
# from keras.models import Sequential

# Use:
from tensorflow.keras.models import Sequential

# Instead of:
# from keras.layers import Embedding, GRU, Dense, Dropout

# Use:
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
# Step 1: Load and prepare custom dataset
custom_text = """
Once upon a time, there was a curious cat.
The cat loved to explore the world around it.
One day, the cat discovered a treasure hidden in the forest.
It was a magical adventure that changed the cat's life forever.
"""

# Step 2: Preprocess the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts([custom_text])  # Fit tokenizer on text data
total_words = len(tokenizer.word_index) + 1  # Total number of words in the vocabulary

# Create sequences of words for training
input_sequences = []
for line in custom_text.split("\n"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# Pad sequences to ensure uniform input length
max_sequence_len = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Prepare predictors (X) and labels (y)
X = input_sequences[:, :-1]  # All but last word for predictors
y = input_sequences[:, -1]   # Last word for labels
y = to_categorical(y, num_classes=total_words)  # Convert labels to one-hot encoding

# Step 3: Build the GRU-based model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))  # Embedding layer
model.add(GRU(128, return_sequences=False))  # GRU layer
model.add(Dropout(0.2))  # Dropout for regularization
model.add(Dense(total_words, activation='softmax'))  # Output layer

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Step 4: Train the model
model.fit(X, y, epochs=100, verbose=1)

# Step 5: Generate text based on a seed sentence
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # Predict the next word
        predicted_probabilities = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted_probabilities)

        # Convert the word index to the word
        predicted_word = tokenizer.index_word[predicted_word_index]
        seed_text += " " + predicted_word

    return seed_text

# Step 6: Generate a sentence
seed_text = "Once upon a time"
generated_text = generate_text(seed_text, next_words=10, model=model, tokenizer=tokenizer, max_sequence_len=max_sequence_len)
print("Generated Text:", generated_text)




Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.0868 - loss: 3.3976
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.1632 - loss: 3.3831
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.2211 - loss: 3.3650
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2604 - loss: 3.3533
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.2500 - loss: 3.3394
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.2500 - loss: 3.3265
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.2789 - loss: 3.3131
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.2500 - loss: 3.2974
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

Que.8.Create a script to implement GPT-2-based text generation with beam
search decoding to generate text

In [11]:
!pip install transformers torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Step 1: Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # You can use other versions like "gpt2-medium", "gpt2-large", etc.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set the model to evaluation mode (inference)
model.eval()

# Step 2: Define the function for text generation using beam search
def generate_text_with_beam_search(prompt, beam_size=5, max_length=50):
    # Tokenize the input prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt")

    # Generate text using beam search
    # 'num_beams' is the beam width (how many beams to explore)
    # 'no_repeat_ngram_size' helps to avoid repetition of the same n-grams
    # 'early_stopping' ensures the generation stops early when all beams have finished
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            num_beams=beam_size,  # Number of beams to consider
            no_repeat_ngram_size=2,  # Prevent repetition of n-grams
            early_stopping=True,  # Stop when all beams are finished
            temperature=1.0,  # Control the randomness
            top_k=50,  # Sampling from top-k tokens
            top_p=0.95,  # Nucleus sampling (top-p)
            pad_token_id=tokenizer.eos_token_id  # Padding token (end of sequence)
        )

    # Decode the generated token IDs back into text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Step 3: Generate text based on a custom prompt
prompt = "Once upon a time, in a land far away,"
generated_text = generate_text_with_beam_search(prompt, beam_size=5, max_length=100)

# Step 4: Print the generated text
print("Generated Text:\n", generated_text)






Generated Text:
 Once upon a time, in a land far away, there was a man who had come to the land of the dead. He was the son of an old man, and his father had been killed by the enemy. When he came to him, he said, "My father is dead, but my mother is alive." And when he saw that his mother was alive, the man said to his son: "Why do you say that I am dead?" And he answered: Because my father was


Que.9.Implement a text generation script using GPT-2 with a custom temperature
setting for diversity in output textI

In [14]:
!pip install transformers torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Step 1: Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # You can also use "gpt2-medium", "gpt2-large", or "gpt2-xl"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set the model to evaluation mode (inference)
model.eval()

# Step 2: Define a function to generate text with a custom temperature setting
def generate_text_with_temperature(prompt, temperature=1.0, max_length=50):
    # Tokenize the input prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt")

    # Generate text using custom temperature setting
    # Higher temperature -> more randomness; Lower temperature -> more deterministic
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,  # Maximum length of the generated text
            temperature=temperature,  # Custom temperature for controlling randomness
            top_k=50,  # Sampling from top-k tokens
            top_p=0.95,  # Nucleus sampling (top-p)
            no_repeat_ngram_size=2,  # Prevent repetition of n-grams
            pad_token_id=tokenizer.eos_token_id,  # EOS token as padding
            early_stopping=True  # Stop when all sequences are finished
        )

    # Decode the generated token IDs back into text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Step 3: Generate text based on a custom prompt with a specific temperature
prompt = "In a distant land, there was a mysterious forest that"
temperature = 0.7  # Set temperature for controlling randomness
generated_text = generate_text_with_temperature(prompt, temperature=temperature, max_length=100)

# Step 4: Print the generated text
print("Generated Text:\n", generated_text)






Generated Text:
 In a distant land, there was a mysterious forest that was filled with the scent of the wind.

"I'm not sure if it's a forest or not, but it seems like it has a lot of trees. I wonder if there are any other people here?"
. . .
,
-
The forest was full of people. The people were all wearing the same clothes as the others. They were wearing a white robe, a black robe and a red robe. There


Que,10 Create a script to implement temperature sampling with GPT-2,
experimenting with different values to generate creative textIpip install transformers torch


In [16]:
!pip install transformers torch

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Step 1: Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # You can also use "gpt2-medium", "gpt2-large", or "g2p2-xl"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set the model to evaluation mode (inference)
model.eval()

# Step 2: Define a function to generate text with a custom temperature setting
def generate_text_with_temperature(prompt, temperature=1.0, max_length=50):
    # Tokenize the input prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt")

    # Generate text using custom temperature setting
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,  # Maximum length of the generated text
            temperature=temperature,  # Custom temperature for controlling randomness
            top_k=50,  # Sampling from top-k tokens
            top_p=0.95,  # Nucleus sampling (top-p)
            no_repeat_ngram_size=2,  # Prevent repetition of n-grams
            pad_token_id=tokenizer.eos_token_id,  # EOS token as padding
            early_stopping=True  # Stop when all sequences are finished
        )

    # Decode the generated token IDs back into text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Step 3: Experiment with different temperatures to generate creative text
prompt = "In a faraway kingdom, there was a magical creature who"
temperatures = [0.5, 1.0, 1.5]  # Different temperature settings

# Step 4: Generate and compare text for each temperature
for temp in temperatures:
    print(f"--- Generated Text at Temperature {temp} ---")
    generated_text = generate_text_with_temperature(prompt, temperature=temp, max_length=100)
    print(generated_text)
    print("\n" + "-"*50 + "\n")


--- Generated Text at Temperature 0.5 ---




In a faraway kingdom, there was a magical creature who could transform into a human.

"I'm not sure what it is, but it's a demon."
. . .
,
-
The demon was the same as the one who had been summoned by the demon. It was an old man who was in the middle of a battle with the demons. He was wearing a black robe and a white robe. The demon had a large, black-colored face. His

--------------------------------------------------

--- Generated Text at Temperature 1.0 ---
In a faraway kingdom, there was a magical creature who could transform into a human.

"I'm not sure what it is, but it's a demon."
. . .
,
-
The demon was the same as the one who had been summoned by the demon. It was an old man who was in the middle of a battle with the demons. He was wearing a black robe and a white robe. The demon had a large, black-colored face. His

--------------------------------------------------

--- Generated Text at Temperature 1.5 ---




In a faraway kingdom, there was a magical creature who could transform into a human.

"I'm not sure what it is, but it's a demon."
. . .
,
-
The demon was the same as the one who had been summoned by the demon. It was an old man who was in the middle of a battle with the demons. He was wearing a black robe and a white robe. The demon had a large, black-colored face. His

--------------------------------------------------



Que.11.Implement a simple LSTM-based text generation model from scratch using
Keras and train it on a custom dataI

In [21]:
!pip install tensorflow numpy
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load and preprocess the text data
def load_data(file_path):
    # Try to open the file. If it doesn't exist, create it with sample data.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().lower()  # Read the text file and convert to lowercase
    except FileNotFoundError:
        print(f"File '{file_path}' not found. Creating a sample file.")
        sample_text = "This is a sample text for training the LSTM model. You can replace this with your own data."
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(sample_text)
        text = sample_text.lower()
    return text

# Prepare the sequences for training
def prepare_sequences(text, seq_length=40):
    # Create a set of characters in the text
    chars = sorted(list(set(text)))  # Get all unique characters
    char_to_int = {char: index for index, char in enumerate(chars)}  # Char to index mapping
    int_to_char = {index: char for index, char in enumerate(chars)}  # Index to char mapping

    # Create input-output pairs (X: input sequence, y: target character)
    sequences = []
    next_chars = []
    for i in range(0, len(text) - seq_length, 1):
        seq_in = text[i:i + seq_length]
        seq_out = text[i + seq_length]
        sequences.append([char_to_int[char] for char in seq_in])
        next_chars.append(char_to_int[seq_out])

    # Reshape X into LSTM input format and normalize
    X = np.reshape(sequences, (len(sequences), seq_length, 1)) / float(len(chars))
    y = to_categorical(next_chars, num_classes=len(chars))

    return X, y, char_to_int, int_to_char, chars

# Build the LSTM model
def build_model(seq_length, num_classes):
    model = Sequential()
    model.add(LSTM(128, input_shape=(seq_length, 1), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(128))
    model.add(Dropout(0.2))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))
    return model

# Generate text after training
def generate_text(model, int_to_char, char_to_int, seq_length, seed_text, num_generate=200):
    generated_text = seed_text
    sequence = [char_to_int[char] for char in seed_text]

    for _ in range(num_generate):
        # Reshape the sequence for the LSTM
        input_seq = np.reshape(sequence, (1, seq_length, 1)) / float(len(int_to_char))

        # Predict the next character
        predicted_prob = model.predict(input_seq, verbose=0)
        predicted_index = np.argmax(predicted_prob)
        predicted_char = int_to_char[predicted_index]

        # Append the predicted character to the generated text
        generated_text += predicted_char

        # Update the sequence for the next prediction
        sequence.append(predicted_index)
        sequence = sequence[1:]





Que.12.How can you implement text generation using it in a simple custom
attention-based architecture?

In [24]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Attention, Concatenate, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# Custom Attention Layer
class AttentionLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(AttentionLayer, self).__init__()

    def call(self, query, value):
        attention = tf.matmul(query, value, transpose_b=True)
        attention = tf.nn.softmax(attention, axis=-1)
        output = tf.matmul(attention, value)
        return output

# Load and preprocess the text data
def load_data(file_path='text_data.txt'):  # Set a default file path

  # Try to open the file. If it doesn't exist, create it with sample data.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().lower()  # Read the text file and convert to lowercase
    except FileNotFoundError:
        print(f"File '{file_path}' not found. Creating a sample file.")
        sample_text = "This is a sample text for training the LSTM model. You can replace this with your own data."
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(sample_text)
        text = sample_text.lower()
    return text
# Prepare the sequences for training
def prepare_sequences(text, seq_length=40):
    chars = sorted(list(set(text)))  # Get all unique characters
    char_to_int = {char: index for index, char in enumerate(chars)}  # Char to index mapping
    int_to_char = {index: char for index, char in enumerate(chars)}  # Index to char mapping

    # Create input-output pairs (X: input sequence, y: target character)
    sequences = []
    next_chars = []
    for i in range(0, len(text) - seq_length, 1):
        seq_in = text[i:i + seq_length]
        seq_out = text[i + seq_length]
        sequences.append([char_to_int[char] for char in seq_in])
        next_chars.append(char_to_int[seq_out])

    # Reshape X into LSTM input format and normalize
    X = np.reshape(sequences, (len(sequences), seq_length, 1)) / float(len(chars))
    y = to_categorical(next_chars, num_classes=len(chars))

    return X, y, char_to_int, int_to_char, chars

# Build the model with custom attention
def build_model(seq_length, num_classes):
    # Define input layer
    inputs = Input(shape=(seq_length, 1))

    # Add LSTM layers
    lstm_out = LSTM(128, return_sequences=True)(inputs)
    dropout_out = Dropout(0.2)(lstm_out)

    # Apply custom attention mechanism
    attention_out = AttentionLayer()(dropout_out, dropout_out)

    # Add a final LSTM layer to process the attention output
    lstm_final = LSTM(128)(attention_out)
    dropout_final = Dropout(0.2)(lstm_final)

    # Dense layer to predict the next character
    dense_out = Dense(num_classes)(dropout_final)
    output = Activation('softmax')(dense_out)

    # Build and compile the model
    model = Model(inputs, output)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))

    return model

# Generate text after training
def generate_text(model, int_to_char, char_to_int, seq_length, seed_text, num_generate=200):
    generated_text = seed_text
    # Ensure seed_text is of seq_length
    seed_text = seed_text.ljust(seq_length, ' ') # Pad with spaces if shorter
    sequence = [char_to_int[char] for char in seed_text]

    for _ in range(num_generate):
        # Reshape the sequence for the LSTM
        input_seq = np.reshape(sequence, (1, seq_length, 1)) / float(len(int_to_char))

        # Predict the next character
        predicted_prob = model.predict(input_seq, verbose=0)
        predicted_index = np.argmax(predicted_prob)
        predicted_char = int_to_char[predicted_index]

        # Append the predicted character to the generated text
        generated_text += predicted_char

        # Update the sequence for the next prediction
        sequence.append(predicted_index)
        sequence = sequence[1:]

    return generated_text

# Main function to run the model
def main():
    # Load and preprocess the data
    text = load_data('text_data.txt')  # Path to your text file
    seq_length = 40  # Sequence length
    X, y, char_to_int, int_to_char, chars = prepare_sequences(text, seq_length)

    # Build the model with attention
    model = build_model(seq_length, len(chars))

    # Train the model
    model.fit(X, y, epochs=50, batch_size=128)

    # Generate text using the trained model
    seed_text = "once upon a time"
    generated_text = generate_text(model, int_to_char, char_to_int, seq_length, seed_text, num_generate=300)

    print("Generated Text:\n", generated_text)

if __name__ == "__main__":
    main()


Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 3.0910
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 3.0769
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - loss: 3.0649
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 3.0462
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - loss: 3.0293
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - loss: 2.9920
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 2.9475
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 2.9152
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - loss: 2.9557
Epoch 10/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - loss: 2.9257
Epoch 11/50
[1m1/1