# In [None]:
# Conversational Chatbot from Improv Script

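# Step 1: Install required libraries (uncomment below if needed)
# torch is imported below; accelerate is needed by Trainer on recent transformers releases.
# !pip install nltk scikit-learn transformers datasets torch accelerate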

# Step 2: Import libraries
import re
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import torch

# Step 3: Load and clean text data
def load_and_clean_text(file_path):
    """Read a UTF-8 text file and flatten it into a single string.

    Each line has its surrounding whitespace removed, lines that end up
    empty are dropped, and the remaining lines are joined with one space.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The cleaned text as one space-separated string ("" if no line
        has any non-whitespace content).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Lazily strip each line; the join below consumes the generator
        # while the file is still open.
        stripped = (raw.strip() for raw in handle)
        return " ".join(piece for piece in stripped if piece)

# Source transcript to learn from -- point this at your own improv script.
text_data = load_and_clean_text('your_improv_script.txt')

# Persist the flattened text; the dataset loader below reads it back from disk.
with open("chatbot_dataset.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.write(text_data)

# Step 4: Load the pretrained GPT-2 tokenizer and language-model weights
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Step 5: Prepare dataset for training
def load_dataset(file_path, tokenizer, block_size=128, overwrite_cache=True):
    """Build a block-wise language-modeling dataset from a plain-text file.

    Args:
        file_path: Path of the text file to tokenize.
        tokenizer: Tokenizer used to convert the text into token ids.
        block_size: Requested length, in tokens, of each training example.
        overwrite_cache: When True (default), re-tokenize the file even if a
            cached feature file from an earlier run exists. Pass False to
            reuse the cache when the text is known not to have changed.

    Returns:
        A ``TextDataset`` built from ``file_path``.
    """
    # TextDataset names its on-disk cache after the tokenizer class, block
    # size and file name only -- not the file's content. Since the corpus
    # file is regenerated on every run, reusing that cache would silently
    # train on stale text, hence the overwrite default.
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=overwrite_cache,
    )

dataset = load_dataset("chatbot_dataset.txt", tokenizer)

# A corpus shorter than one block produces no training examples; fail with a
# clear message instead of an obscure error from inside the training loop.
if len(dataset) == 0:
    raise ValueError(
        "chatbot_dataset.txt is too short to produce a single training "
        "example; provide a longer script or use a smaller block_size."
    )

# Causal language modeling (GPT-2 style), so masked-LM collation is disabled.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Step 6: Define training arguments
training_args = TrainingArguments(
    output_dir="./chatbot_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
)

# Step 7: Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()

# Checkpoints are only written every `save_steps` optimizer steps, so a short
# run may finish without saving anything. Persist the final weights and the
# tokenizer explicitly so the fine-tuned chatbot can be reloaded later.
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)

# Step 8: Generate responses from the chatbot
def generate_reply(prompt, max_length=100):
    """Sample a continuation of ``prompt`` from the module-level model.

    Args:
        prompt: Text to continue. An empty string is replaced by the
            tokenizer's end-of-text token so that generation always has at
            least one token to condition on.
        max_length: Upper bound passed to ``model.generate`` as
            ``max_length`` (counts the prompt tokens as well as the
            generated ones).

    Returns:
        The first generated sequence decoded to text with special tokens
        removed. The sequence starts with the prompt tokens, so the prompt
        text is part of the returned string.
    """
    # Training leaves the model in train mode; switch dropout off so that
    # sampling is not additionally perturbed.
    model.eval()

    encoded = tokenizer(prompt or tokenizer.eos_token, return_tensors='pt')
    # The Trainer may have moved the model to an accelerator; inputs must live
    # on the same device or generate() raises a device-mismatch error.
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        do_sample=True,
        top_k=50,
        # GPT-2 defines no pad token; reuse end-of-text explicitly instead of
        # relying on generate()'s warning-emitting fallback.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
user_input = "Hey, how are you doing today?"
print("Bot:", generate_reply(user_input))

# Step 9: Chat loop
# Reads one line per turn until the user types 'quit' (case-insensitive,
# surrounding whitespace ignored) or closes/interrupts the input stream.
print("Chatbot ready! Type 'quit' to exit.")
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C or a closed stdin: leave the loop without a traceback.
        print()
        break
    if not user_input:
        # Nothing to respond to; prompt again instead of generating from an
        # empty prompt.
        continue
    if user_input.lower() == 'quit':
        break
    print("Bot:", generate_reply(user_input))
