In [31]:
import os
import numpy as np
import tensorflow as tf
import random
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Attention
from tensorflow.keras.preprocessing.text import Tokenizer
from convokit import Corpus, download
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [32]:
# Prepare a directory to store the data
data_directory = "data"
os.makedirs(data_directory, exist_ok=True)

# Assign a path for storing the movie corpus
movie_corpus_path = os.path.join(data_directory, "movie-corpus")

# Download the Cornell Movie-Dialogs Corpus if it's not already present
if not os.path.exists(movie_corpus_path):
    print("Commencing download of the Cornell Movie-Dialogs Corpus...")
    dialog_corpus = Corpus(filename=download("movie-corpus", data_dir=data_directory))
else:
    print("Cornell Movie-Dialogs Corpus is now being loaded from existing data...")
    dialog_corpus = Corpus(filename=movie_corpus_path)

Cornell Movie-Dialogs Corpus is now being loaded from existing data...


In [33]:
# Extract dialog pairs
query_list, response_list = [], []
max_samples = 1000

for conv_id in dialog_corpus.conversations:
    if len(query_list) >= max_samples:
        break
    conversation = dialog_corpus.get_conversation(conv_id)
    utterance_ids = conversation.get_utterance_ids()
    for i in range(len(utterance_ids) - 1):
        if len(query_list) >= max_samples:
            break
        query_list.append(dialog_corpus.get_utterance(utterance_ids[i]).text)
        response_list.append(dialog_corpus.get_utterance(utterance_ids[i + 1]).text)

print(f"Total dialog pairs extracted: {len(query_list)}")

Total dialog pairs extracted: 1000


In [34]:
# Tokenization
dialog_tokenizer = Tokenizer(oov_token='<OOV>')
dialog_tokenizer.fit_on_texts(query_list + response_list)
word_count = len(dialog_tokenizer.word_index) + 1

In [35]:
# Convert dialogs into sequences
seq_queries = dialog_tokenizer.texts_to_sequences(query_list)
seq_responses = dialog_tokenizer.texts_to_sequences(response_list)

In [36]:
# Determine the maximum length for padding
sequence_max_length = max(max(len(item) for item in seq_queries), max(len(item) for item in seq_responses))
seq_queries_padded = pad_sequences(seq_queries, maxlen=sequence_max_length, padding='post')
seq_responses_padded = pad_sequences(seq_responses, maxlen=sequence_max_length, padding='post')


In [37]:
# Building the Seq2Seq model with Attention
embedding_dimension = 64
lstm_unit_size = 128

In [38]:
# Encoder
encoder_inputs = Input(shape=(sequence_max_length,))
encoder_embedding = Embedding(word_count, embedding_dimension, mask_zero=True)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(lstm_unit_size, return_state=True, return_sequences=True)(encoder_embedding)
encoder_states = [state_h, state_c]

In [39]:
# Decoder
decoder_inputs = Input(shape=(sequence_max_length,))
decoder_embedding = Embedding(word_count, embedding_dimension, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(lstm_unit_size, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)


In [40]:
# Attention Layer
attention_layer = Attention()
attention_result = attention_layer([decoder_outputs, encoder_outputs])

In [41]:
# Concatenate Attention output and Decoder LSTM output
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attention_result])

In [42]:
# Dense layer
decoder_dense = Dense(word_count, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

In [43]:
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [44]:
# Compile and train the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [45]:
# Prepare the data for training
decoder_target_data = np.zeros(seq_responses_padded.shape)
decoder_target_data[:,:-1] = seq_responses_padded[:,1:]
decoder_target_data = np.expand_dims(decoder_target_data, -1)

model.fit([seq_queries_padded, seq_responses_padded], decoder_target_data, batch_size=16, epochs=10)


Epoch 1/10


2023-11-29 23:42:38.597955: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" frequency: 2400 num_cores: 8 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x3ad685870>

In [46]:
# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # Other variants like "gpt2-medium" or "gpt2-large" can be used for better performance
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
gpt2_model = GPT2LMHeadModel.from_pretrained(model_name)

In [47]:
# List of question templates
question_templates = [
    "How is Mission Impossible?",
    "How can I [action] [object]?",
    "Tell me more about [subject].",
    "What is the [adjective] way to [action] [object]?",
    "Can you explain [concept] to me?",
    "What are the benefits of [action]?",
    "Give me some information on [topic].",
    "What are your thoughts on [topic]?",
    "How does [process] work?",
    "Recommend a [adjective] [object] for me.",
]

# List of placeholders to fill in the question templates
placeholders = {
    "[action]": ["learn", "understand", "use", "improve", "explore"],
    "[object]": ["Python programming", "machine learning", "data analysis", "web development", "chatbots"],
    "[subject]": ["your favorite book", "the latest technology", "the weather in Paris", "your hobbies"],
    "[adjective]": ["best", "easiest", "fastest", "most popular", "recommended"],
    "[concept]": ["artificial intelligence", "blockchain", "quantum computing", "neural networks"],
    "[topic]": ["space exploration", "environmental sustainability", "the stock market", "travel destinations"],
    "[process]": ["photosynthesis", "protein synthesis", "cloud computing", "data encryption"],
}

In [48]:
# Function to generate a random user input question
def generate_random_user_input():
    question_template = random.choice(question_templates)
    for placeholder, options in placeholders.items():
        replacement = random.choice(options)
        question_template = question_template.replace(placeholder, replacement)
    return question_template

In [49]:
from transformers import set_seed

# Function to generate chatbot responses using GPT-2 with varied responses
def generate_chatbot_responses(user_input, num_responses=1, max_length=50):
    responses = []
    set_seed(random.randint(1, 10000))  # Setting a random seed for variability in responses
    for _ in range(num_responses):
        inputs = tokenizer.encode(user_input, return_tensors='pt')
        outputs = gpt2_model.generate(
            inputs, 
            max_length=max_length, 
            num_return_sequences=1,
            temperature=0.7,  # Adjusts randomness in response generation
            top_k=50,        # Limits the number of highest probability vocabulary tokens to consider for each step
            top_p=0.95,      # Nucleus sampling: chooses from the smallest possible set of tokens whose cumulative probability exceeds the threshold p
            no_repeat_ngram_size=2  # Prevents repeating n-grams in generated text
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        responses.append(response)
    return responses



In [50]:
user_input_question = generate_random_user_input()
chatbot_generated_responses = generate_chatbot_responses(user_input_question)

print("User Input Question:", user_input_question)
print("Chatbot Responses:")
for i, response in enumerate(chatbot_generated_responses):
    print(f"Response {i + 1}: {response}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


User Input Question: How is Mission Impossible?
Chatbot Responses:
Response 1: How is Mission Impossible?

Mission Impossible is a film that is about a group of people who are trying to save the world from a mysterious alien invasion. The film is set in the year 2055, and the film's plot revolves around a
