In [1]:
import os
import re
import json
import numpy as np


In [2]:
# Build a lookup from line ID to utterance text out of movie_lines.txt.
lines = {}
with open('dataset/movie_lines.txt', 'r', encoding='iso-8859-1') as f:
    for raw_record in f:
        # Records are delimited by " +++$+++ "; anything that does not split
        # into exactly five fields is ignored.
        fields = raw_record.strip().split(' +++$+++ ')
        if len(fields) != 5:
            continue
        # First field is the key, fifth field is the stored text.
        lines[fields[0]] = fields[4]

# Turn every conversation in movie_conversations.txt into (input, target) pairs.
pairs = []
with open('dataset/movie_conversations.txt', 'r', encoding='iso-8859-1') as f:
    for raw_record in f:
        fields = raw_record.strip().split(' +++$+++ ')
        if len(fields) != 4:
            continue
        # The last field holds the list of line IDs: drop its first and last
        # character, then remove quotes and spaces before splitting on commas.
        id_list_text = fields[3].strip()[1:-1]
        conversation_ids = id_list_text.replace("'", "").replace(" ", "").split(',')
        # Walk over consecutive IDs; each line is paired with the one after it.
        for current_id, next_id in zip(conversation_ids, conversation_ids[1:]):
            input_line = lines.get(current_id)
            target_line = lines.get(next_id)
            # Skip pairs where either ID is unknown or maps to empty text.
            if input_line and target_line:
                pairs.append([input_line, target_line])

In [3]:
# Report how many conversation pairs were built
pair_count = len(pairs)
print(f"Total conversation pairs: {pair_count}")
# Show the first pair as a quick sanity check
first_pair = pairs[0]
print(f"Example pair: {first_pair}")


Total conversation pairs: 221282
Example pair: ['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you."]


In [4]:
# Contraction rewrites, applied in this order to lower-cased text.
_CONTRACTION_REWRITES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"they're", "they are"),
    (r"you're", "you are"),
    (r"what's", "what is"),
    (r"where's", "where is"),
)


def normalize_string(s):
    """Lower-case ``s`` and reduce it to space-separated alphanumeric tokens.

    Steps, in order:
      1. lower-case and strip surrounding whitespace;
      2. expand the contractions listed in ``_CONTRACTION_REWRITES``;
      3. delete apostrophes, so other contractions stay one token
         ("we'd" -> "wed");
      4. turn every other character outside ``[a-z0-9]`` and whitespace into
         a space, so words joined only by punctuation stay separate
         ("wait--what" -> "wait what" rather than "waitwhat");
      5. collapse runs of whitespace to a single space.

    Args:
        s: The raw text to normalize.

    Returns:
        The normalized string; empty when ``s`` contains no ASCII letter or
        digit.
    """
    s = s.lower().strip()

    for pattern, expansion in _CONTRACTION_REWRITES:
        s = re.sub(pattern, expansion, s)

    # Apostrophes are removed, not spaced, to avoid stray one-letter tokens
    # such as "don t". Typographic variants (U+2019, and U+0092 which is how a
    # cp1252 curly apostrophe appears after latin-1 decoding) are treated the
    # same way as the ASCII apostrophe.
    s = re.sub(r"['\u2019\x92]", "", s)
    # Any remaining punctuation or symbol acts as a word separator.
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    # Remove extra whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Apply the normalization to all pairs, normalizing each side exactly once.
# A pair is kept only when both normalized sides are non-empty; the target is
# not normalized at all when the input side already came out empty.
cleaned_pairs = []
for input_text, target_text in pairs:
    clean_input = normalize_string(input_text)
    if not clean_input:
        continue
    clean_target = normalize_string(target_text)
    if clean_target:
        cleaned_pairs.append([clean_input, clean_target])

In [5]:
# Report how many pairs survived normalization
cleaned_pair_count = len(cleaned_pairs)
print(f"Total cleaned conversation pairs: {cleaned_pair_count}")
# Show the first cleaned pair as a quick sanity check
first_cleaned_pair = cleaned_pairs[0]
print(f"Example cleaned pair: {first_cleaned_pair}")

Total cleaned conversation pairs: 221274
Example cleaned pair: ['can we make this quick roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad again', 'well i thought wed start with pronunciation if thats okay with you']


In [None]:
# Create a set of all unique words to build the vocabulary
all_words = set()
for pair in cleaned_pairs:
    for sentence in pair:
        all_words.update(sentence.split(' '))

# Create the word-to-ID dictionary. The words are sorted first because the
# iteration order of a set of strings is not stable across interpreter runs
# (string hashing is randomized per process), which would give every word a
# different ID after each kernel restart.
word_to_id = {word: i for i, word in enumerate(sorted(all_words))}

# Add special tokens for padding, start of sentence, and end of sentence.
# They are appended after the corpus words, in this order.
for special_token in ('<PAD>', '<SOS>', '<EOS>'):
    word_to_id[special_token] = len(word_to_id)

# Derive the reverse mapping once, after the special tokens exist, so the two
# dictionaries cannot drift apart.
id_to_word = {i: word for word, i in word_to_id.items()}

VOCAB_SIZE = len(word_to_id)