# Vocabulary Embedding

In [1]:
import pandas as pd
import numpy as np
import csv
import pickle
import re
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


### Loading Data

##### Dataset: Cornell Movie Dialogues
- movie_conversations.txt (the structure of the conversations)
	- fields
		- characterID of the first character involved in the conversation
		- characterID of the second character involved in the conversation
		- movieID of the movie in which the conversation occurred
		- list of the utterances that make the conversation, in chronological 
			order: ['lineID1','lineID2',...,'lineIDN']
			has to be matched with movie_lines.txt to reconstruct the actual content
            
            
- movie_lines.txt (contains the actual text of each utterance)
	- fields:
		- lineID
		- characterID (who uttered this phrase)
		- movieID
		- character name
		- text of the utterance


In [2]:
# Read the two corpus files (conversation structure, then utterance text).
# Every line is one record whose fields are separated by " +++$+++ ".
paths = ["cornell_movie-dialogs_corpus/movie_conversations.txt", "cornell_movie-dialogs_corpus/movie_lines.txt"]
df = []
for path in paths:
    # Use a context manager so each handle is closed as soon as its file is read.
    # NOTE(review): no encoding is passed, so the platform default is used —
    # confirm it matches the encoding of the corpus files.
    with open(path) as file:
        lines = [line.replace('\n', "").split(" +++$+++ ") for line in file]
    # One DataFrame per file: df[0] = conversations, df[1] = lines.
    df.append(pd.DataFrame(lines))

#### df[0] is movie conversations (each row holds the list of dialogue lineIDs that make up one conversation)

In [3]:
# Label the four fields of movie_conversations.txt
conversation_fields = ['characterID_1', 'characterID_2', 'movieID', 'dialogues_list']
df[0].columns = conversation_fields

# Only the list of lineIDs is needed from here on; df[0] becomes that single column
df[0] = df[0]['dialogues_list']
df[0].head()

0    ['L194', 'L195', 'L196', 'L197']
1                    ['L198', 'L199']
2    ['L200', 'L201', 'L202', 'L203']
3            ['L204', 'L205', 'L206']
4                    ['L207', 'L208']
Name: dialogues_list, dtype: object

#### df[1] is movie lines (Dataframe of line ID and Dialogue text)

In [4]:
# Label the five fields of movie_lines.txt
line_fields = ['lineID', 'characterID', 'movieID', 'character_name', 'dialogue']
df[1].columns = line_fields

# Keep just the ID -> text mapping columns
df[1] = df[1][['lineID', 'dialogue']]
df[1].head()

Unnamed: 0,lineID,dialogue
0,L1045,They do not!
1,L1044,They do to!
2,L985,I hope so.
3,L984,She okay?
4,L925,Let's go.


#### I. The following 2 cells set the basis for passing data into the model

##### Each value of df[0] looks like a list but is actually stored as a string, so remove the characters ', [, ] and spaces, then split on commas to get a list of lineIDs

In [5]:
# Delete quotes, brackets and spaces from the stringified list in one pass,
# then split what is left on commas to get the individual lineIDs.
temp = df[0].iloc[1000].translate(str.maketrans("", "", "\"'[] ")).split(",")
temp

['L8200', 'L8201', 'L8202']

##### df[0] has conversations but the lines are in lineID format, thus use df[1] to get the actual dialogue

In [6]:
# Dialogue text of the third lineID in the sample conversation (first matching row)
df[1].loc[df[1]['lineID'] == temp[2], 'dialogue'].iloc[0]

'Yeah, or turn you into toast.'

### Cleaning the text and creating corpus of all sentences and tokenizing it

In [7]:
# Ordered (pattern, replacement) rules applied to every dialogue line.
# Order matters: specific contractions ("won't", "can't") must run before the
# generic "n't" rule, and punctuation is stripped last so the apostrophes the
# earlier rules match on are still present.
CLEANING_RULES = [
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"'ll", " will"),
    (r"'ve", " have"),
    (r"'re", " are"),
    (r"'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    # Dropped-g endings ("doin'" -> "doing"). The lookahead keeps the rule from
    # firing when a letter follows the apostrophe, e.g. "John's" must not
    # become "Johngs".
    (r"n'(?!\w)", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
    # Punctuation to delete, spelled out explicitly as a raw string. The set is
    # " # $ % & ( ) * + , - . / : ; < = > @ [ ] ^ _ ` ' { | } ~
    # ('!' and '?' are intentionally not part of it).
    (r"[\"#$%&()*+,\-./:;<=>@\[\]^_`'{|}~]", ""),
]

# Matching is case-insensitive so capitalised forms ("I'm", "Won't", "Can't")
# are expanded too; the replacements are lower-case.
# NOTE(review): this relies on the Keras Tokenizer lower-casing text by default
# further down — confirm if the tokenizer settings change.
COMPILED_CLEANING_RULES = [
    (re.compile(pattern, flags=re.IGNORECASE), replacement)
    for pattern, replacement in CLEANING_RULES
]


def clean_dialogue(sentence):
    """Expand common contractions in `sentence` and strip punctuation.

    Applies every rule in COMPILED_CLEANING_RULES, in order, and returns the
    rewritten string.
    """
    for pattern, replacement in COMPILED_CLEANING_RULES:
        sentence = pattern.sub(replacement, sentence)
    return sentence


# Single pass over the column; assign() builds a new frame instead of writing
# into the column-sliced frame in place.
df[1] = df[1].assign(dialogue=df[1].dialogue.apply(clean_dialogue))

#### Creating a vocabulary of all the words used in the corpus and mapping words to indexes

In [8]:
# Preprocessing constants used by the cells below
max_len = 20          # length every tokenized sequence is padded / truncated to
vocab_size = 15000    # passed to the Tokenizer as num_words; also bounds idx2word

In [9]:
# Parallel lists: line_id[k] is the ID of the sentence stored in corpus[k]
line_id = df[1]['lineID'].tolist()

# Wrap every cleaned dialogue in the start / end-of-sentence markers
corpus = []
for sentence in df[1]['dialogue'].tolist():
    corpus.append("<start> " + sentence + " <eos>")

In [10]:
# lineID -> marker-wrapped sentence; if an ID repeats, the last sentence wins
dataframe_dict = {lid: sentence for lid, sentence in zip(line_id, corpus)}

In [11]:
# Spot-check one entry: the cleaned sentence stored for lineID "L180", including its markers
dataframe_dict["L180"]

'<start> No! You are not dating until your sister starts dating  End of discussion <eos>'

In [12]:
# The filter string leaves out '<' and '>' so that the "<start>" and "<eos>"
# markers are not broken apart when these characters are stripped.
tokenizer = Tokenizer(
    num_words=vocab_size,
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
)

In [13]:
# Fit the tokenizer on every marker-wrapped sentence of the corpus
tokenizer.fit_on_texts(corpus)

#### Split X and Y
##### Making "Message to Response" lists i.e. X and Y (using I)

##### Say a row in df[0] is ['L194', 'L195', 'L196', 'L197'], then we want (X[0] = L194, Y[0] = L195), (X[1] = L195, Y[1] = L196), and so on
##### We convert lineIDs to their dialogue text in this same step

In [14]:
# Lengths of the markers wrapped around every sentence in dataframe_dict,
# used to slice them off again instead of hard-coding the offsets.
start_tag_len = len("<start> ")
eos_tag_len = len(" <eos>")

x_encoder = []   # message text without markers
x_decoder = []   # response text keeping the leading "<start> "
y_ = []          # response text keeping the trailing " <eos>"

for conversation in tqdm(df[0]):
    # Each conversation is the string form of a list of lineIDs: drop quotes,
    # brackets and whitespace, then split on commas. The pattern is a raw
    # string with the brackets escaped so it carries no invalid escape
    # sequence or ambiguous nested '['.
    line_ids = re.sub(r"[\"'\[\]\s]", "", conversation).split(",")

    # Every pair of consecutive utterances is one (message, response) sample.
    for message_id, response_id in zip(line_ids, line_ids[1:]):
        message = dataframe_dict[message_id]
        response = dataframe_dict[response_id]

        x_encoder.append(message[start_tag_len:-eos_tag_len])
        x_decoder.append(response[:-eos_tag_len])
        y_.append(response[start_tag_len:])

100%|█████████████████████████████████████████████████████████████████████████| 83097/83097 [00:01<00:00, 61303.85it/s]


In [15]:
# Inspect one sample: encoder input, decoder input (with <start>) and target (with <eos>)
x_encoder[1500], x_decoder[1500], y_[1500]

('Jesus Christ look at all the dust on my carwhy in the hell do not he take it to a car wash?',
 '<start> Did not know you darker people went in for foreign jobs',
 'Did not know you darker people went in for foreign jobs <eos>')

### Tokenizing sequences into list of integers and padding to maxlen

In [16]:
# Map each of the three text lists to lists of integer word indexes
X_encoder, X_decoder, y = [
    tokenizer.texts_to_sequences(texts)
    for texts in (x_encoder, x_decoder, y_)
]

In [17]:
# Same padding settings for all three arrays: fixed length max_len, with both
# padding and truncation applied at the end of each sequence.
# NOTE(review): with truncating='post', sequences longer than max_len lose
# their tail — for targets that presumably includes the <eos> token; confirm
# this is intended.
pad_options = dict(maxlen=max_len, dtype='int32', padding='post', truncating='post')

X_encoder = pad_sequences(X_encoder, **pad_options)
X_decoder = pad_sequences(X_decoder, **pad_options)
y = pad_sequences(y, **pad_options)

In [18]:
# All three arrays should have the same shape: (number of pairs, max_len)
X_encoder.shape, X_decoder.shape, y.shape

((221616, 20), (221616, 20), (221616, 20))

In [19]:
# Largest word index that occurs anywhere in the padded targets
y.max()

14999

### Dumping X, Y to pickle for reuse

In [20]:
# Persist the three padded arrays together as one list
padded_arrays = [X_encoder, X_decoder, y]
with open('x_and_y.pkl', 'wb') as pkl_out:
    pickle.dump(padded_arrays, pkl_out, protocol=4)

# Read them back (lets later sessions start from the pickle)
with open('x_and_y.pkl', 'rb') as pkl_in:
    X_encoder, X_decoder, y = pickle.load(pkl_in)

### Indexing vocabulary

In [21]:
# index -> word for every tokenizer entry whose index is at most vocab_size
# NOTE(review): the padded targets above top out at index 14999
# (vocab_size - 1), so the entry stored for index vocab_size is presumably
# never produced by the tokenizer — confirm before relying on it.
idx2word = {}
for word, idx in tokenizer.word_index.items():
    if idx > vocab_size:
        continue
    idx2word[idx] = word

In [None]:
# Preview the first few entries only; displaying the whole mapping floods the output
list(idx2word.items())[:10]

In [23]:
# Inverse mapping of idx2word: word -> index
word2idx = {word: idx for idx, word in idx2word.items()}

In [None]:
# Preview the first few entries only; displaying the whole mapping floods the output
list(word2idx.items())[:10]

In [25]:
# Sanity checks: both mappings have the same size, and that size equals the configured vocab_size
len(word2idx) == len(idx2word), len(word2idx) == vocab_size

(True, True)

### GloVe Embeddings pre-processing

In [26]:
def get_vector(word):
    """Return the word vector stored for `word`.

    Looks `word` up by index label in the global `words` DataFrame (which is
    loaded in a later cell, so that cell must run before this function is
    called) and returns the values of the matching row as a NumPy array.
    A label that is not in the index raises whatever `words.loc` raises
    (typically KeyError).
    """
    row = words.loc[word]
    return row.values

##### Load the GloVe pre-trained vectors txt file into a pandas DataFrame once, because looking up each word's vector by looping over the file is expensive

In [27]:
#words = pd.read_table("glove.840B.300d/glove.840B.300d.txt", sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

##### Since it is computationally expensive to make a dataframe from txt each time, dumping the dataframe to a pickle; load from "words.pkl"

In [28]:
# `words` is the table of GloVe vectors, indexed by the first column of the
# text file (presumably the token, with the remaining columns holding the
# vector components — confirm against the file).
# Parsing the raw text file is slow, so the parsed frame is cached in
# words.pkl and only rebuilt when that cache does not exist yet.
# NOTE(review): pickle.load can execute arbitrary code — only load a
# words.pkl that was produced by this notebook.
try:
    with open('words.pkl', 'rb') as f:
        words = pickle.load(f)
except FileNotFoundError:
    words = pd.read_table("glove.840B.300d/glove.840B.300d.txt", sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)
    with open('words.pkl', 'wb') as f:
        pickle.dump(words, f, protocol=4)

### Create vocab_embedding matrix

In [29]:
embedding_dim = 300     # width of one word vector
vocab_embedding = np.zeros((len(word2idx), embedding_dim))

# Words that end up without a vector keep an all-zero row; they are collected
# here so the coverage can be inspected afterwards.
missing_words = []

for i in tqdm(idx2word.keys()):
    # idx2word can hold an index equal to the number of rows, which does not
    # fit into the matrix; skip it explicitly instead of relying on a
    # swallowed IndexError.
    if i >= vocab_embedding.shape[0]:
        continue
    try:
        vocab_embedding[i] = get_vector(idx2word[i])
    except (KeyError, ValueError):
        # KeyError: the word has no entry in the vector table.
        # ValueError: the lookup returned something that cannot be stored as a
        # single row. Everything else (including KeyboardInterrupt during this
        # long loop) is allowed to propagate.
        missing_words.append(idx2word[i])

100%|████████████████████████████████████████████████████████████████████████████| 15000/15000 [07:56<00:00, 31.49it/s]


### Dump vocab_embedding to pickle

In [30]:
# Persist the embedding matrix
with open('embedding_weights.pkl', 'wb') as pkl_out:
    pickle.dump(vocab_embedding, pkl_out, protocol=4)

# Read it back (lets later sessions start from the pickle)
with open('embedding_weights.pkl', 'rb') as pkl_in:
    vocab_embedding = pickle.load(pkl_in)