# **04-TV-Show-trained-chatbot-creation**

## Step-by-Step Cleaning Pipeline

# 1. Load and Parse the File


In [38]:
def load_srt(file_path):
    """Read an SRT-style subtitle file and return only its dialogue lines.

    Blank lines, cue counters (lines made only of digits) and timestamp lines
    (lines containing "-->") are skipped. Every remaining line is returned with
    surrounding whitespace stripped, in file order.

    Args:
        file_path: Path of the subtitle file to read.

    Returns:
        list[str]: The dialogue lines found in the file.
    """
    dialogue_lines = []
    # "utf-8-sig" reads plain UTF-8 as well, but also consumes a leading byte-order
    # mark. With plain "utf-8" the BOM stays glued to the first cue counter
    # ("\ufeff1"), which then fails isdigit() and leaks into the dialogue.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for line in f:  # stream the file instead of loading every line first
            line = line.strip()
            if not line or line.isdigit() or "-->" in line:
                continue  # Skip blank lines, timestamps, numbering
            dialogue_lines.append(line)

    return dialogue_lines

In [39]:
# Forward slashes are accepted as path separators on Windows, macOS and Linux,
# whereas a backslash is a separator on Windows only.
file_path = "data/suits-1x01-pilot.en.txt"
dialogues = load_srt(file_path)

# 2. Basic Cleaning

In [40]:
import re

def clean_text(text):
    """Strip markup and unwanted symbols from one subtitle line.

    Anything between "<" and ">" on the same line is removed first; after that,
    every character that is not an ASCII letter, a digit, whitespace or one of
    . , ! ? ' is dropped. The result is returned without leading or trailing
    whitespace.
    """
    tag_pattern = re.compile(r"<.*?>")
    disallowed_pattern = re.compile(r"[^a-zA-Z0-9.,!?'\s]")

    without_tags = tag_pattern.sub("", text)
    allowed_only = disallowed_pattern.sub("", without_tags)
    return allowed_only.strip()

# Clean every subtitle line, then drop the ones that end up empty (for example
# lines made only of symbols). An empty string would be written as a blank CSV
# field, which pandas reads back as a missing value by default.
cleaned_dialogues = [cleaned for cleaned in map(clean_text, dialogues) if cleaned]


# 3. Build Conversational Pairs

In [41]:
# Pair every line with the line that follows it: (input, response).
pairs = list(zip(cleaned_dialogues, cleaned_dialogues[1:]))

# 4. Save as CSV for Chatbot Training

In [42]:
import csv

# Use a forward-slash path so this is the same file that is later read back with
# pd.read_csv("data/04-chatbot_data_from-suits-s01-e01.csv") on every operating
# system; with a backslash the name only resolves into data/ on Windows.
# newline="" lets the csv module control line endings itself.
with open("data/04-chatbot_data_from-suits-s01-e01.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Input", "Response"])  # header row
    writer.writerows(pairs)  # one (input, response) pair per row

# 5.0 Now we can create a chatbot from the cleaned CSV

## 5.1 `Attempt 01`: 
### - The Rule-Based Chatbot (Quick and Easy)

### 5.1.1 Step 1: Load CSV

In [43]:
# List the data directory to confirm that the CSV written above is present.
!ls data

04-chatbot_data_from-suits-s01-e01.csv
Suits - season 1.en.zip
suits-1x01-pilot.en.srt
suits-1x01-pilot.en.txt
suits.s01.e01.pilot.(2011).eng.1cd.(12843664).zip


In [44]:
import pandas as pd

# Both columns are free text, so read them as strings and switch off pandas'
# default NA detection. Otherwise blank fields and dialogue lines such as "NA",
# "None" or "null" are parsed as NaN instead of being kept as text.
df = pd.read_csv(
    "data/04-chatbot_data_from-suits-s01-e01.csv",
    dtype=str,
    keep_default_na=False,
)

### 5.1.2 Step 2: Build a Lookup Chatbot

In [45]:
import random

# Lookup table from each input line to the response that followed it. When the
# same input occurs more than once, the last occurrence wins.
chat_pairs = {prompt: reply for prompt, reply in zip(df['Input'], df['Response'])}

def chatbot_reply(user_input, cutoff=0.8):
    """Return the stored reply for ``user_input`` from ``chat_pairs``.

    Lookup order:
      1. Exact match on the text as typed.
      2. The closest known input, compared case-insensitively and ignoring
         surrounding whitespace, provided its ``difflib`` similarity ratio is
         at least ``cutoff``.

    Args:
        user_input: Text typed by the user.
        cutoff: Minimum similarity in [0, 1] required for an approximate match.

    Returns:
        The reply stored for the matched input, or a fixed fallback sentence
        when nothing matches.
    """
    import difflib  # local import so this cell does not depend on another one

    # Exact match
    if user_input in chat_pairs:
        return chat_pairs[user_input]

    # Approximate match: typed text rarely reproduces the subtitle's exact case,
    # spacing and punctuation, so fall back to the most similar known input.
    query = user_input.strip().lower()
    if query:
        by_normalized = {}
        for prompt in chat_pairs:
            if isinstance(prompt, str):  # ignore non-text keys such as NaN
                # setdefault: the first input seen for a normalized form is kept
                by_normalized.setdefault(prompt.strip().lower(), prompt)
        closest = difflib.get_close_matches(query, list(by_normalized), n=1, cutoff=cutoff)
        if closest:
            return chat_pairs[by_normalized[closest[0]]]

    return "Sorry, I don't know how to respond to that yet."

### 5.1.3 Step 3: Interactive Loop

In [46]:
# Chat until the user types "exit" or "quit", or the input stream ends.
while True:
    try:
        # Strip so that " exit " still ends the chat and stray spaces do not
        # prevent a lookup match.
        user_input = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or an interrupt: leave the loop instead of showing a traceback.
        break
    if user_input.lower() in ["exit", "quit"]:
        break
    reply = chatbot_reply(user_input)
    print("Bot:", reply)


## 5.2 `Attempt 02`: 
### - Train a Seq2Seq Model (LSTM) with TensorFlow

If you want a chatbot that can generalize to unseen inputs, you’ll need a sequence-to-sequence (encoder-decoder) model. The steps below build and train one with TensorFlow/Keras.

### 5.2.1 Libraries

In [47]:
! pip install tensorflow



### 5.2.2 Preprocess Text

In [48]:
!pip install protobuf==3.20.*



In [49]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare data
# Keep only rows where both sides hold real text. A missing value would otherwise
# be stringified by astype(str) into the literal token "nan", and blank lines
# carry nothing to learn from.
text_pairs = df[['Input', 'Response']].dropna().astype(str)
has_text = (text_pairs['Input'].str.strip() != "") & (text_pairs['Response'].str.strip() != "")
text_pairs = text_pairs[has_text]
if text_pairs.empty:
    # Fail with a clear message instead of max() raising on an empty sequence below.
    raise ValueError("No non-empty Input/Response pairs to train on.")

input_texts = text_pairs['Input'].tolist()
# Wrap each response in explicit start/end markers.
target_texts = ["<start> " + t + " <end>" for t in text_pairs['Response']]

# Tokenize
# filters='' disables the tokenizer's character filtering, which keeps the
# "<start>" / "<end>" markers intact. One tokenizer is fitted on inputs and
# targets together, so both share a single vocabulary.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(input_texts + target_texts)

input_seqs = tokenizer.texts_to_sequences(input_texts)
target_seqs = tokenizer.texts_to_sequences(target_texts)

# Longest sequence on each side determines the padded width.
max_len_input = max(len(seq) for seq in input_seqs)
max_len_target = max(len(seq) for seq in target_seqs)

# padding='post' appends the padding after the real tokens.
input_seqs = pad_sequences(input_seqs, maxlen=max_len_input, padding='post')
target_seqs = pad_sequences(target_seqs, maxlen=max_len_target, padding='post')

# +1 because word_index starts at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1

TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

### 5.2.3 Build Seq2Seq Model

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Hyperparameters: width of the token embeddings and number of units in each LSTM.
embedding_dim = 256
latent_dim = 512

# Encoder: embeds the input token ids and runs them through an LSTM. Only the
# final hidden and cell states are kept; the LSTM output itself is discarded.
# NOTE(review): the Embedding layers are created without mask_zero, so the zeros
# appended by pad_sequences(padding='post') in the preprocessing cell are
# processed like ordinary tokens — consider masking; confirm against the Keras
# version in use.
encoder_inputs = Input(shape=(None,))  # variable-length sequence of token ids
enc_emb = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: a second LSTM that starts from the encoder's final states and emits
# one output per time step, followed by a softmax over the whole vocabulary.
decoder_inputs = Input(shape=(None,))  # variable-length sequence of token ids
dec_emb = Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define model: takes [encoder tokens, decoder tokens] and returns the per-step
# vocabulary distribution. The sparse loss expects integer token ids as targets.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

### 5.2.4 Train the Model

In [None]:
import numpy as np

# Teacher forcing: at each step the decoder is fed the previous target token and
# must predict the next one, so the decoder input and the expected output are the
# same sequences offset by one position. Feeding the unshifted sequence as both
# input and target lets the model minimise the loss by copying its input token.
decoder_input_seqs = target_seqs[:, :-1]  # drops the last position
target_seqs_output = np.expand_dims(target_seqs[:, 1:], -1)  # drops "<start>"; add output dim

model.fit([input_seqs, decoder_input_seqs], target_seqs_output,
          batch_size=64, epochs=50, validation_split=0.2)


NameError: name 'target_seqs' is not defined