# Vocabulary Embedding

In [1]:
import pandas as pd
import numpy as np
import re
from keras.preprocessing.sequence import pad_sequences
import pickle
import tqdm

Using TensorFlow backend.


### Loading Data

##### Dataset: Cornell Movie Dialogues
- movie_conversations.txt (the structure of the conversations)
	- fields
		- characterID of the first character involved in the conversation
		- characterID of the second character involved in the conversation
		- movieID of the movie in which the conversation occurred
		- list of the utterances that make the conversation, in chronological 
			order: ['lineID1','lineID2',...,'lineIDN']
			has to be matched with movie_lines.txt to reconstruct the actual content
            
            
- movie_lines.txt (contains the actual text of each utterance)
	- fields:
		- lineID
		- characterID (who uttered this phrase)
		- movieID
		- character name
		- text of the utterance


In [2]:
paths = ["cornell_movie-dialogs_corpus/movie_conversations.txt", "cornell_movie-dialogs_corpus/movie_lines.txt"]
df = []
for path in paths:
    file = open(path)
    lines = []
    for line in file.readlines():
        lines.append(line.replace('\n', "").split(" +++$+++ "))
    df.append(pd.DataFrame(lines))

#### df[0] is movie conversations ("Dataframe of" "list of" dialogues (lineID) in a conversations)

In [3]:
# Assigning the column titles to the df
df[0].columns = ['characterID_1', 'characterID_2', 'movieID', 'dialogues_list']

# reducing the dataframe to required fields
df[0] = df[0]['dialogues_list']
df[0].head()

0    ['L194', 'L195', 'L196', 'L197']
1                    ['L198', 'L199']
2    ['L200', 'L201', 'L202', 'L203']
3            ['L204', 'L205', 'L206']
4                    ['L207', 'L208']
Name: dialogues_list, dtype: object

#### df[1] is movie lines (Dataframe of line ID and Dialogue text)

In [4]:
# Assigning the column titles to the df
df[1].columns = ['lineID', 'characterID', 'movieID', 'character_name', 'dialogue']

# reducing the dataframe to required fields
df[1] = df[1][['lineID', 'dialogue']]
df[1].head()

Unnamed: 0,lineID,dialogue
0,L1045,They do not!
1,L1044,They do to!
2,L985,I hope so.
3,L984,She okay?
4,L925,Let's go.


##### Value of df[0] is in the form of list, but it returns a string, so replace characters ', [, ], space with empty string and make a list of 'lineIDs'

In [5]:
temp = df[0].iloc[1000].replace("\"", "").replace("\'", "").replace('[', "").replace("]", "").replace(" ", "").split(",")
temp

['L8200', 'L8201', 'L8202']

#### List of above notation

In [6]:
conversations = []
for row in df[0]:
    conversations.append(row.replace("\"", "").replace("\'", "").replace('[', "").replace("]", "").replace(" ", "").split(","))

conversations[:5]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208']]

### Cleaning the text

In [7]:
def clean_text(sentence):
    sentence = sentence.lower()

    sentence = re.sub(r"i'm", "i am", sentence)
    sentence = re.sub(r"i’m", "i am", sentence)

    sentence = re.sub(r"he's", "he is", sentence)
    sentence = re.sub(r"he’s", "he is", sentence)

    sentence = re.sub(r"she's", "she is", sentence)
    sentence = re.sub(r"she’s", "she is", sentence)

    sentence = re.sub(r"it's", "it is", sentence)
    sentence = re.sub(r"it’s", "it is", sentence)

    sentence = re.sub(r"that's", "that is", sentence)
    sentence = re.sub(r"that’s", "that is", sentence)

    sentence = re.sub(r"what's", "what is", sentence)
    sentence = re.sub(r"what’s", "what is", sentence)

    sentence = re.sub(r"where's", "where is", sentence)
    sentence = re.sub(r"where’s", "where is", sentence)

    sentence = re.sub(r"there's", "there is", sentence)
    sentence = re.sub(r"there’s", "there is", sentence)

    sentence = re.sub(r"who's", "who is", sentence)
    sentence = re.sub(r"who’s", "who is", sentence)

    sentence = re.sub(r"how's", "how is", sentence)
    sentence = re.sub(r"how’s", "how is", sentence)

    sentence = re.sub(r"\'ll", " will", sentence)
    sentence = re.sub(r"’ll", " will", sentence)

    sentence = re.sub(r"\'ve", " have", sentence)
    sentence = re.sub(r"’ve", " have", sentence)

    sentence = re.sub(r"\'re", " are", sentence)
    sentence = re.sub(r"’re", " are", sentence)

    sentence = re.sub(r"\'d", " would", sentence)
    sentence = re.sub(r"’d", " would", sentence)

    sentence = re.sub(r"won't", "will not", sentence)
    sentence = re.sub(r"won’t", "will not", sentence)

    sentence = re.sub(r"can't", "cannot", sentence)
    sentence = re.sub(r"can’t", "cannot", sentence)

    sentence = re.sub(r"n't", " not", sentence)
    sentence = re.sub(r"n’t", " not", sentence)

    sentence = re.sub(r"n'", "ng", sentence)
    sentence = re.sub(r"n’", "ng", sentence)

    sentence = re.sub(r"'bout", "about", sentence)
    sentence = re.sub(r"’bout", "about", sentence)

    sentence = re.sub(r"'til", "until", sentence)
    sentence = re.sub(r"’til", "until", sentence)

    sentence = re.sub(r"c'mon", "come on", sentence)
    sentence = re.sub(r"c’mon", "come on", sentence)

    sentence = re.sub("[-*/()\"’'#/@;:<>{}`+=~|.!?,]", "", sentence)
    
    return sentence

In [8]:
df[1].dialogue[79]

'Would you mind getting me a drink, Cameron?'

In [9]:
df[1].dialogue = df[1].dialogue.apply(lambda sentences: clean_text(sentences))

In [10]:
df[1].dialogue[79]

'would you mind getting me a drink cameron'

### Taking a dict to map lineIDs to corresponding Dialogues

In [11]:
# take lineID to dialogue in lists
dialogues = df[1].dialogue.tolist()
line_id = df[1].lineID.tolist()

In [12]:
# now take id and dialogue in dictionary
id2dialogue = dict(zip(line_id, dialogues))

#### Split X and Y
##### Making "Message to Response" lists i.e. X and Y

##### Say a conversation is ['L194', 'L195', 'L196', 'L197'], then we want (X[0] = L194, Y[0] = L195), (X[1] = L195, Y[1] = L196), and so on

In [13]:
x_encoder = []
x_decoder = []
y_ = []

for conversation in conversations:
    for i in range(len(conversation) - 1):
        x_encoder.append(id2dialogue[conversation[i]])
        x_decoder.append(id2dialogue[conversation[i + 1]])
        y_.append(id2dialogue[conversation[i + 1]])

In [14]:
x_encoder[1500], x_decoder[1500], y_[1500]

('jesus christ look at all the dust on my carwhy in the hell do not he take it to a car wash',
 'did not know you darker people went in for foreign jobs',
 'did not know you darker people went in for foreign jobs')

#### We take max_len as 20 since 85% of the sentences have a length approximately close to that

In [15]:
max_len = 20

In [16]:
# Filter out the x that are too long
x_encoder_temp = []
x_decoder_temp = []
y_temp = []

i = 0
for x in x_encoder:
    if len(x.split()) <= max_len:
        x_encoder_temp.append(x)
        x_decoder_temp.append(x_decoder[i])
        y_temp.append(y_[i])
    i += 1

In [17]:
# Filter out the y that are too long
x_encoder = []
x_decoder = []
y_ = []

i = 0
for y in y_temp:
    if len(y.split()) <= max_len:
        x_decoder.append(y)
        y_.append(y)
        x_encoder.append(x_encoder_temp[i])
    i += 1

In [18]:
len(x_encoder), len(x_decoder), len(y_)

(167126, 167126, 167126)

#### Creating a vocabulary of all the words used in the corpus and mapping words to indexes
#### We take the word frequency for indexing of the vocab

In [19]:
vocab = {}

for x in x_encoder:
    for word in x.split():
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1

for y in y_:
    for word in y.split():
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1

#### Remove rare words from the vocabulary.
#### We will aim to replace fewer than 5% of words with unk

In [20]:
threshold = 5
count = 0
for word, freq in vocab.items():
    if freq >= threshold:
        count += 1

In [21]:
len(vocab), count

(46200, 14356)

In [22]:
# thus we take
vocab_size = 15000

### Tokenizing sequences into list of integers and padding to maxlen

In [23]:
# first we sort the vocab according to the word frequency
vocab = dict(sorted(vocab.items(), key=lambda x: x[1], reverse=True))

In [24]:
# index vocab based on frequency
idx2word = {0: '<pad>', 1: '<eos>', 2: '<unk>', 3: '<go>'}
idx = 4
for word in vocab.keys():
    idx2word[idx] = word
    idx += 1
    if idx == vocab_size:
        break

In [25]:
word2idx = {}
for idx, word in idx2word.items():
    word2idx[word] = idx

In [26]:
len(idx2word), len(word2idx)

(15000, 15000)

In [None]:
idx2word

In [28]:
for i in range(len(x_decoder)):
    x_decoder[i] = "<go> " + x_decoder[i]

In [29]:
for i in range(len(y_)):
    y_[i] += " <eos>"

In [30]:
x_decoder[3000], y_[3000]

('<go> lay off asshole', 'lay off asshole <eos>')

In [31]:
def tokenize(sentence, word_count, unk_count):
    tokens = []
    for word in sentence.split():
        if word not in word2idx:
            tokens.append(word2idx['<unk>'])
            unk_count += 1
        else:
            tokens.append(word2idx[word])
            word_count += 1
            
    return tokens, word_count, unk_count

In [32]:
# tokenizing to sequences
# the words occuring in sentences that are not present in vocab are replace by unk
X_encoder = []
X_decoder = []
Y = []

word_count = 0
unk_count = 0

for x in x_encoder:
    tokens, word_count, unk_count = tokenize(x, word_count, unk_count)
    X_encoder.append(tokens)
    
for x in x_decoder:
    tokens, word_count, unk_count = tokenize(x, word_count, unk_count)    
    X_decoder.append(tokens)
    
for y in y_:
    tokens, _, _ = tokenize(y, word_count, unk_count)
    Y.append(tokens)

In [33]:
# % of unk words in word_count
(unk_count / word_count) * 100

2.178284142784869

In [34]:
# padding
X_encoder = pad_sequences(X_encoder, maxlen=max_len, dtype='int32', padding='post', truncating='post')
X_decoder = pad_sequences(X_decoder, maxlen=max_len, dtype='int32', padding='post', truncating='post')
Y = pad_sequences(Y, maxlen=max_len, dtype='int32', padding='post', truncating='post')

In [35]:
X_encoder.shape, X_decoder.shape, Y.shape

((167126, 20), (167126, 20), (167126, 20))

In [36]:
X_encoder[30], X_decoder[30], Y[30]

(array([24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0]),
 array([  3,  85,   4,  15, 119, 107,   9, 650,  43,   9, 580,   0,   0,
          0,   0,   0,   0,   0,   0,   0]),
 array([ 85,   4,  15, 119, 107,   9, 650,  43,   9, 580,   1,   0,   0,
          0,   0,   0,   0,   0,   0,   0]))

### Dumping word2idx and idx2word

In [37]:
with open('word_idx.pkl', 'wb') as f:
    pickle.dump([word2idx, idx2word], f, protocol=4)

# loading X and Y
with open('word_idx.pkl', 'rb') as f:
    word2idx, idx2word = pickle.load(f)

### Dumping X, Y to pickle for reuse

In [38]:
with open('x_and_y.pkl', 'wb') as f:
    pickle.dump([X_encoder, X_decoder, Y], f, protocol=4)

# loading X and Y
with open('x_and_y.pkl', 'rb') as f:
    X_encoder, X_decoder, Y = pickle.load(f)

### GloVe Embeddings pre-processing

In [39]:
# a function that returns the word vector for a given word (in string) from the dataframe of word vectors obtained in the next cell
def get_vector(word):
    return words.loc[word].values

##### Create a pandas dataframe of entries from the glove pre-trained vectors txt file as running a loop for getting word to vector mapping is expensive

In [40]:
#words = pd.read_table("glove.840B.300d/glove.840B.300d.txt", sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

##### Since it is computationally expensive to make a dataframe from txt each time, dumping the dataframe to a pickle; load from "words.pkl"

In [41]:
# dumping the dataframe
#with open('words.pkl', 'wb') as f:
#    pickle.dump(words, f, protocol=4)

# loading the dataframe
with open('words.pkl', 'rb') as f:
    words = pickle.load(f)

### Create vocab_embedding matrix

In [None]:
vocab_embedding = np.zeros((len(word2idx), 300))     # embedding dim is 300

for i in idx2word.keys():
    try:
        temp = get_vector(idx2word[i])
        vocab_embedding[i] = temp
    except:
        print(idx2word[i] + " not in glove")

### Dump vocab_embedding to pickle

In [43]:
with open('embedding_weights.pkl', 'wb') as f:
    pickle.dump(vocab_embedding, f, protocol=4)

# loading vocab_embeddings
with open('embedding_weights.pkl', 'rb') as f:
    vocab_embedding = pickle.load(f)