In [1]:
import numpy as np
import tensorflow as tf
import re
import time

In [2]:
### Importing the dataset
# Both corpus files are only read, so open them read-only ('r').  The previous
# 'r+' mode requested write access as well, which fails on read-only data
# directories and risks accidental modification of the raw corpus.
# errors='ignore' silently drops bytes that are not valid UTF-8.
with open('dataset/cornell movie-dialogs corpus/movie_lines.txt', 'r', encoding='utf-8', errors='ignore') as f:
    lines = f.readlines()

with open('dataset/cornell movie-dialogs corpus/movie_conversations.txt', 'r', encoding='utf-8', errors='ignore') as f:
    conversations = f.readlines()

##### Building a dictionary that maps each line ID to its text

In [3]:
# Map every line id (first field) to the spoken text (last field).
id2line = {}
for line in lines:
    _line = line.split(" +++$+++ ")
    # Only records with exactly 5 fields are kept; anything else is skipped.
    if len(_line) == 5:
        # Remove just the trailing newline.  Slicing with [:-1] would also eat
        # the last real character of a final line that has no newline.
        id2line[_line[0]] = _line[-1].rstrip("\n")




##### Creating a list of all the conversations

In [4]:
# One list of line ids per conversation record.
conversations_id = []

# Iterate over every record: readlines() does not produce an empty trailing
# element, so slicing off the last entry would discard a real conversation.
for conversation in conversations:
    # The last field is expected to be a bracketed, quoted, comma-separated
    # list of line ids; strip() removes the newline regardless of whether the
    # record is the last one in the file.
    id_field = conversation.split(' +++$+++ ')[-1].strip()
    if not id_field:
        continue  # blank line, nothing to parse
    # Drop the surrounding brackets, then the quotes and spaces between ids.
    _conversation = id_field[1:-1].replace("'", "").replace(" ", "")
    conversations_id.append(_conversation.split(","))
    
    
    

##### Building the lists of questions and their answers

In [7]:
questions = []
answers = []

# Within a conversation every line is paired with the line that follows it:
# the earlier line goes to `questions`, the later one to `answers`.
for line_ids in conversations_id:
    for asked_id, replied_id in zip(line_ids, line_ids[1:]):
        questions.append(id2line[asked_id])
        answers.append(id2line[replied_id])

##### Text Cleaning

In [8]:
def clean_text(phrase):
    """Normalise a sentence for vocabulary building.

    The text is lower-cased, common English contractions are expanded and a
    fixed set of punctuation characters is removed.  Apostrophes that are not
    part of a handled contraction (e.g. "that's") are left untouched.

    Args:
        phrase: the raw sentence (str).

    Returns:
        The cleaned sentence (str).
    """
    phrase = phrase.lower()

    ## specific
    # These must run BEFORE the general "n't" rule; otherwise "won't"/"can't"
    # have already become "wo not"/"ca not" and these patterns never match.
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    ## general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)

    ## punctuation is deleted outright (not replaced by a space)
    phrase = re.sub(r"[-()+=*/$%@!~<>?.,;:#{}]", "", phrase)

    return phrase




##### Getting the cleaned questions and answers

In [9]:
# Run every raw question and answer through clean_text, keeping the order.
clean_questions = list(map(clean_text, questions))
clean_answers = list(map(clean_text, answers))

In [10]:
# Show the first five raw questions, a blank separator line, then the same
# five questions after cleaning.
print(questions[:5])
print()
print(clean_questions[:5])

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "You're asking me out.  That's so cute. What's your name again?", "No, no, it's my fault -- we didn't have a proper introduction ---"]

['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again', "well i thought we would start with pronunciation if that's okay with you", 'not the hacking and gagging and spitting part  please', "you are asking me out  that's so cute what's your name again", "no no it's my fault  we did not have a proper introduction "]


##### Creating a dictionary that maps each word to its number of occurrences

In [11]:
# word -> number of occurrences; starts out empty.
word2count = dict()

In [12]:
# Tally every whitespace-separated word, questions first and answers second,
# so words are inserted into word2count in the same order as before.
for sentences in (clean_questions, clean_answers):
    for sent in sentences:
        for word in sent.split():
            word2count[word] = word2count.get(word, 0) + 1
            
            
            

##### Creating two dictionaries (one for questions, one for answers) that map each sufficiently frequent word to a unique integer

In [13]:
threshold = 20   ## hyperparameter: minimum number of occurrences for a word to be kept

# Words that pass the frequency filter, in word2count's iteration order.
frequent_words = [word for word, count in word2count.items() if count >= threshold]

# Both vocabularies are built from the same counts: word -> position among
# the kept words, starting at 0.
questionsword2int = {word: index for index, word in enumerate(frequent_words)}
answersword2int = {word: index for index, word in enumerate(frequent_words)}

# Number of ids handed out so far (i.e. the next unused id).
word_index = len(frequent_words)
        
        
        


##### Adding the special tokens to both dictionaries

In [14]:
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']

# Word ids start at 0, so the next free id is exactly len(dict).  Using
# len(dict) + 1 left one id unused and made the largest id equal to the final
# len(dict), i.e. out of range for any table sized by the vocabulary length.
# The membership check keeps the ids stable if this cell is run again.
for vocabulary in (questionsword2int, answersword2int):
    for token in tokens:
        if token not in vocabulary:
            vocabulary[token] = len(vocabulary)
    
    
    

##### Creating the inverse dictionary for answersword2int dictionary

In [15]:
# Reverse lookup for the answers vocabulary: integer id -> word.
answersints2word = {index: word for word, index in answersword2int.items()}



##### Appending the EOS token to every answer in the clean_answers list

In [20]:
# Mark the end of every answer.  The list is updated in place, so the check
# prevents stacking several ' <EOS>' suffixes when the cell is run more than
# once.  Text produced by clean_text is lower-cased with '<' and '>' removed,
# so a cleaned answer cannot end in ' <EOS>' by itself.
for i, answer in enumerate(clean_answers):
    if not answer.endswith(' <EOS>'):
        clean_answers[i] = answer + ' <EOS>'
    
    

##### Translating all the questions and answers into integers
##### and replacing the filtered-out words with the OUT token

In [23]:
# Encode each sentence as a list of word ids.  A word missing from the
# vocabulary is replaced by that vocabulary's '<OUT>' id.
questions_into_int = [
    [
        questionsword2int[word] if word in questionsword2int else questionsword2int['<OUT>']
        for word in question.split()
    ]
    for question in clean_questions
]

answers_into_int = [
    [
        answersword2int[word] if word in answersword2int else answersword2int['<OUT>']
        for word in answer.split()
    ]
    for answer in clean_answers
]
    
    
    

##### Sorting the questions and answers by length of questions

In [40]:
sorted_clean_questions = []
sorted_clean_answers = []

maxlength = 25

# Keep only the pairs whose encoded question has between 1 and maxlength ids.
kept_positions = [
    position
    for position, question in enumerate(questions_into_int)
    if 1 <= len(question) <= maxlength
]
# list.sort is stable, so pairs with equally long questions keep their
# original relative order.
kept_positions.sort(key=lambda position: len(questions_into_int[position]))

for position in kept_positions:
    sorted_clean_questions.append(questions_into_int[position])
    sorted_clean_answers.append(answers_into_int[position])