In [None]:
# GPT-2 Conversational Chatbot with Text-to-Speech
# This chatbot is fine-tuned on the Cornell Movie Dialogues dataset using GPT-2.

# Step 1: Install Required Libraries
!pip install torch==2.0.1 transformers==4.25.1 gtts requests

# Step 2: Download the Cornell Movie Dialogues Dataset using Python (for Windows)
import requests
import zipfile
import os

# Download the dataset
dataset_url = "http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
dataset_path = "cornell_movie_dialogs_corpus.zip"

if not os.path.exists(dataset_path):
    # Stream into a temporary ".part" file and rename it only after the whole
    # body has been written. A failed or interrupted download therefore never
    # leaves a truncated/invalid zip at `dataset_path`, which the existence
    # check above would otherwise treat as a finished download on re-run.
    partial_path = dataset_path + ".part"
    with requests.get(dataset_url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as the archive.
        response.raise_for_status()
        with open(partial_path, 'wb') as f:
            # Write in 1 MiB chunks so the archive is never held fully in memory.
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial_path, dataset_path)
    print("Dataset downloaded!")

# Unpack the archive into the working directory, unless the corpus folder
# is already present from an earlier run.
extracted_folder = "cornell movie-dialogs corpus"
already_extracted = os.path.exists(extracted_folder)
if not already_extracted:
    with zipfile.ZipFile(dataset_path, 'r') as archive:
        archive.extractall(".")
    print("Dataset extracted!")




Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting transformers==4.25.1
  Downloading transformers-4.25.1-py3-none-any.whl.metadata (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.9/93.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gtts
  Downloading gTTS-2.5.3-py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylin

In [None]:
# Step 3: Load and Clean the Conversations
import re

def load_conversations(path=None):
    """Load the utterance text of every record in the corpus ``movie_lines.txt``.

    Records in that file are ``+++$+++``-separated and carry four metadata
    fields (line id, character id, movie id, character name) before the spoken
    text. Only the spoken text is returned, so the metadata does not end up in
    the training data.

    Args:
        path: Location of the lines file. When omitted, ``movie_lines.txt``
            inside the module-level ``extracted_folder`` is used.

    Returns:
        A list of utterance strings in file order, stripped of surrounding
        whitespace. Records whose text is empty are skipped. A line that does
        not have at least five fields is returned whole (stripped).
    """
    if path is None:
        path = f'{extracted_folder}/movie_lines.txt'

    separator = '+++$+++'
    utterances = []
    with open(path, 'r', encoding='iso-8859-1') as f:
        for line in f:
            fields = line.split(separator)
            if len(fields) >= 5:
                # Re-join everything after the fourth separator in case the
                # utterance itself happens to contain the marker.
                text = separator.join(fields[4:])
            else:
                text = line
            text = text.strip()
            if text:
                utterances.append(text)
    return utterances

def clean_text(text):
    """Normalise one utterance for training.

    The text is lower-cased; every run of characters other than letters,
    the punctuation ``? . ! , ¿`` and in-word apostrophes is replaced by a
    single space; surrounding whitespace is removed.

    Apostrophes are kept only when they sit between two letters, so
    contractions such as "don't" survive intact while quoting apostrophes
    are dropped.

    Args:
        text: Raw utterance string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # Keep letters, basic punctuation and (for now) every apostrophe.
    text = re.sub(r"[^a-z?.!,¿']+", " ", text)
    # Drop apostrophes that are not inside a word (i.e. used as quotes).
    text = re.sub(r"(?<![a-z])'|'(?![a-z])", " ", text)
    # The previous step can create adjacent spaces; collapse them.
    text = re.sub(r" +", " ", text)
    return text.strip()

# Read the corpus lines, then normalise each one with clean_text
conversations = load_conversations()
cleaned_conversations = list(map(clean_text, conversations))



In [None]:
# Step 4: Load GPT-2 Model and Tokenizer
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint to start from; use 'gpt2-large' if your system can handle it
model_name = 'gpt2-medium'

# Tokenizer: reuse the end-of-sequence token as the padding token so that
# padded tokenization (the "pad_token issue") works.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Pretrained language-model weights for the same checkpoint
model = GPT2LMHeadModel.from_pretrained(model_name)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

In [None]:
# Step 5: Prepare Dataset for Fine-Tuning
from torch.utils.data import Dataset, DataLoader

class ConversationDataset(Dataset):
    """Dataset that tokenizes one conversation string per item.

    Every item is truncated/padded to ``max_length`` tokens by the supplied
    tokenizer.
    """

    def __init__(self, conversations, tokenizer, max_length=512):
        """Store the texts, the tokenizer and the fixed sequence length."""
        self.conversations = conversations
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return how many conversation strings the dataset holds."""
        return len(self.conversations)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th conversation.

        Returns:
            A ``(input_ids, attention_mask)`` pair: the first row of each of
            the tokenizer's outputs (requested with ``return_tensors="pt"``).
        """
        encoding_options = dict(
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
            padding='max_length',
        )
        encoded = self.tokenizer(self.conversations[idx], **encoding_options)
        ids = encoded.input_ids[0]
        mask = encoded.attention_mask[0]
        return ids, mask

# Wrap the cleaned lines in a dataset and serve them in shuffled batches of two
dataset = ConversationDataset(
    conversations=cleaned_conversations,
    tokenizer=tokenizer,
)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=2)



In [None]:
# Step 6: Fine-Tune the GPT-2 Model
import torch
from transformers import AdamW

# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

optimizer = AdamW(model.parameters(), lr=1e-5)

for epoch in range(3):  # You can increase the number of epochs
    total_loss = 0
    num_batches = 0
    for batch in dataloader:
        input_ids, attention_mask = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Padding must not contribute to the loss. Every sample is padded to
        # max_length with the EOS token, so using input_ids directly as labels
        # would make the objective mostly "predict more padding". Label
        # positions set to -100 are ignored by the model's loss.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        # The model scores position t against label t+1, so label 0 is never a
        # target. If nothing after it is supervised the loss would be NaN and
        # the optimizer step would corrupt the weights; skip such batches.
        if not (labels[:, 1:] != -100).any():
            continue

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Average over the batches that were actually trained on.
    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")




In [None]:
# Step 7: Save the Fine-Tuned Model
# Write the model weights and the tokenizer files into the same directory
save_directory = 'fine_tuned_gpt2_medium'  # Change to 'gpt2_large' if you used that model
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)


In [None]:
# Step 8: Generate Responses with the Fine-Tuned Model
# Reload the saved tokenizer and model, move the model to the training
# device and switch it to evaluation mode.
tokenizer = GPT2Tokenizer.from_pretrained('fine_tuned_gpt2_medium')
model = GPT2LMHeadModel.from_pretrained('fine_tuned_gpt2_medium').to(device).eval()

def generate_response(input_text, max_new_tokens=100):
    """Generate the bot's reply to ``input_text``.

    The prompt is ``input_text`` followed by the end-of-sequence token. Only
    the tokens produced after the prompt are decoded, so the reply does not
    repeat what the user typed.

    Args:
        input_text: The user's message.
        max_new_tokens: Upper bound on the number of generated tokens. This
            is independent of the prompt length, so a long message still
            leaves room for a reply.

    Returns:
        The decoded reply with special tokens removed and surrounding
        whitespace stripped (may be empty).
    """
    prompt = tokenizer(input_text + tokenizer.eos_token, return_tensors='pt').to(device)
    prompt_length = prompt.input_ids.shape[-1]

    # early_stopping is intentionally not passed: it only applies to beam
    # search, which is not enabled here.
    output = model.generate(
        prompt.input_ids,
        attention_mask=prompt.attention_mask,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The generated sequence starts with the prompt; keep only what follows it.
    reply_ids = output[0][prompt_length:]
    response = tokenizer.decode(reply_ids, skip_special_tokens=True)
    return response.strip()

In [None]:
# Step 9: Integrate Text-to-Speech (gTTS)
from gtts import gTTS
import os
import platform

def speak(text):
    """Synthesize ``text`` to ``response.mp3`` with gTTS and play it.

    Playback uses the platform's command-line player: ``start`` on Windows,
    ``afplay`` on macOS and ``mpg123`` everywhere else.

    Args:
        text: The sentence to speak. Empty or whitespace-only text (or
            ``None``) is ignored.

    Returns:
        None.
    """
    # gTTS rejects empty text, which would abort the caller's chat loop;
    # there is nothing to say in that case, so just return.
    if not text or not text.strip():
        return

    tts = gTTS(text=text, lang='en')
    tts.save("response.mp3")

    system_name = platform.system()
    if system_name == "Windows":
        os.system("start response.mp3")
    elif system_name == "Darwin":  # macOS
        os.system("afplay response.mp3")
    else:  # Linux
        os.system("mpg123 response.mp3")

# Step 10: Interactive Chat with Text-to-Speech
# Read a message, print the bot's reply and speak it, until the user types
# 'exit' or 'quit' (case-insensitive, surrounding whitespace ignored).
print("Start chatting with the bot (type 'exit' to stop)!")
while True:
    try:
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted kernel ends the chat cleanly
        # instead of surfacing a traceback.
        print()
        break
    if user_input.lower() in ('exit', 'quit'):
        break
    if not user_input:
        # Nothing to answer; ask again rather than generating from an empty prompt.
        continue
    reply = generate_response(user_input)
    print(f"Bot: {reply}")
    speak(reply)