Run below when reading data from gdrive location or from kaggle

In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
root_dir = "/content/gdrive/My Drive/workspaces/"
base_dir = root_dir + 'data/'

kaggle_json = root_dir + 'kaggle.json'

! mkdir -p ~/.kaggle/
! cp "$kaggle_json" ~/.kaggle/

Mounted at /content/gdrive


In [2]:
# data url: http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
! ls -ltrh "{base_dir}cornell_movie_dialogs_corpus"

total 41M
-rw------- 1 root root 284K May  7 14:51 chameleons.pdf
-rw------- 1 root root 690K May  7 14:51 movie_characters_metadata.txt
-rw------- 1 root root 4.1K May  7 14:51 README.txt
-rw------- 1 root root  55K May  7 14:51 raw_script_urls.txt
-rw------- 1 root root  66K May  7 14:51 movie_titles_metadata.txt
-rw------- 1 root root 6.5M May  7 14:51 movie_conversations.txt
-rw------- 1 root root  34M May  7 14:52 movie_lines.txt


In [3]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [0]:
# Read files
with open(base_dir + "cornell_movie_dialogs_corpus/movie_conversations.txt", 'r', encoding="latin1") as conv_file:
    conv_raw = pd.read_csv(conv_file, sep=" \+\+\+\$\+\+\+ ", header = None, engine = 'python')
    conv_raw.columns = ['person1', 'person2', 'movie', 'conv_seq']

with open(base_dir + "cornell_movie_dialogs_corpus/movie_lines.txt", 'r', encoding="latin1") as conv_file:
    conv_lines = pd.read_csv(conv_file, sep=" \+\+\+\$\+\+\+ ", header = None, engine = 'python')
    conv_lines.columns = ['line_num', 'person', 'movie', 'person_name', 'dialog']

In [0]:
# Read files
with open("//QATLPCFS001/Users/skiran/Downloads/cornell_movie_dialogs_corpus/movie_conversations.txt", 'r') as conv_file:
    conv_raw = pd.read_csv(conv_file, sep=" \+\+\+\$\+\+\+ ", header = None, engine = 'python')
    conv_raw.columns = ['person1', 'person2', 'movie', 'conv_seq']

with open("//QATLPCFS001/Users/skiran/Downloads/cornell_movie_dialogs_corpus/movie_lines.txt", 'r') as conv_file:
    conv_lines = pd.read_csv(conv_file, sep=" \+\+\+\$\+\+\+ ", header = None, engine = 'python')
    conv_lines.columns = ['line_num', 'person', 'movie', 'person_name', 'dialog']

In [6]:
conv_lines.head(5)

Unnamed: 0,line_num,person,movie,person_name,dialog
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [7]:
# Looks like some error reading few lines of conv_lines dataset
# fixing it here
print(sum(conv_lines.dialog.map(lambda x: x is None)))
dialog_none = conv_lines['dialog'].isnull()
conv_lines.loc[dialog_none, 'person_name'] = conv_lines.loc[dialog_none, 'person_name'].map(lambda x: x.replace(' +++$+++', ''))
conv_lines.dialog.fillna(value="", inplace=True)
print(sum(conv_lines.dialog.map(lambda x: x is None)))

267
0


In [8]:
conv_raw.head(5)

Unnamed: 0,person1,person2,movie,conv_seq
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']"
1,u0,u2,m0,"['L198', 'L199']"
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']"
3,u0,u2,m0,"['L204', 'L205', 'L206']"
4,u0,u2,m0,"['L207', 'L208']"


In [0]:
# Let's work on conv_raw
# the 'conv_seq' column is in the form of a array, let's use it
# and also add couple of columns for first and last lines in the dialog
from ast import literal_eval
conv_raw['conv_seq'] = conv_raw['conv_seq'].apply(lambda x: [line.replace('L', '') for line in literal_eval(x)])
conv_raw['first_line'] = conv_raw['conv_seq'].apply(lambda x: x[0])
conv_raw['last_line'] = conv_raw['conv_seq'].apply(lambda x: x[-1])

In [10]:
# Let's add a few more columns similar to index
conv_raw.sort_values(by = ['movie', 'person1', 'person2', 'first_line']).reset_index(drop = True)
conv_raw['unique_ppm'] = conv_raw['person1'] + conv_raw['person2'] + conv_raw['movie']
conv_raw['unique_row_id'] = conv_raw['unique_ppm'] + "_" + conv_raw['last_line']
conv_raw.head(10)

Unnamed: 0,person1,person2,movie,conv_seq,first_line,last_line,unique_ppm,unique_row_id
0,u0,u2,m0,"[194, 195, 196, 197]",194,197,u0u2m0,u0u2m0_197
1,u0,u2,m0,"[198, 199]",198,199,u0u2m0,u0u2m0_199
2,u0,u2,m0,"[200, 201, 202, 203]",200,203,u0u2m0,u0u2m0_203
3,u0,u2,m0,"[204, 205, 206]",204,206,u0u2m0,u0u2m0_206
4,u0,u2,m0,"[207, 208]",207,208,u0u2m0,u0u2m0_208
5,u0,u2,m0,"[271, 272, 273, 274, 275]",271,275,u0u2m0,u0u2m0_275
6,u0,u2,m0,"[276, 277]",276,277,u0u2m0,u0u2m0_277
7,u0,u2,m0,"[280, 281]",280,281,u0u2m0,u0u2m0_281
8,u0,u2,m0,"[363, 364]",363,364,u0u2m0,u0u2m0_364
9,u0,u2,m0,"[365, 366]",365,366,u0u2m0,u0u2m0_366


In [0]:
# I'm trying to concatenate dialogues between the same two people
# and if the dialogues are in succession but broken into different rows in the data
g = (conv_raw['unique_ppm'] + "_" + (conv_raw['first_line'].map(int) - 1).map(str) != conv_raw.shift().fillna(method='bfill')['unique_row_id']).cumsum().rename('group')
conv_agg = conv_raw.groupby(['person1', 'person2', 'movie', g])['conv_seq'].apply(list).reset_index().drop('group',axis=1)
conv_agg['conv_seq'] = conv_agg['conv_seq'].apply(lambda l: [item for sublist in l for item in sublist])

In [12]:
print(conv_agg.head(10))
print(conv_raw.shape, conv_agg.shape)

  person1 person2 movie                                           conv_seq
0      u0     u11    m0                          [179, 180, 181, 182, 183]
1      u0     u11    m0                                         [189, 190]
2      u0     u11    m0                          [517, 518, 519, 520, 521]
3      u0     u11    m0                                         [523, 524]
4      u0     u11    m0                          [536, 537, 538, 539, 540]
5      u0     u11    m0                                    [544, 545, 546]
6      u0     u11    m0                [878, 879, 880, 881, 882, 883, 884]
7      u0     u11    m0                                         [922, 923]
8      u0      u2    m0  [194, 195, 196, 197, 198, 199, 200, 201, 202, ...
9      u0      u2    m0                [271, 272, 273, 274, 275, 276, 277]
(83097, 8) (60699, 4)


In [0]:
# Creating a function to clean out the input sentences
import re

def clean_sentence(raw_sent):
    raw_sent = raw_sent.lower()
    # remove html tags like <i>, </u>
    raw_sent = re.sub(r'<\/*[a-z]?>', '', raw_sent)
    raw_sent = re.sub(r'[<>]', '', raw_sent)
    # special character for 'pause' (..., ---) replaced with "<pause>"
    raw_sent = re.sub(r'\s*(\.\.+|--+|…+)\s*', ' <pause> ', raw_sent)
    # - replaced with space
    raw_sent = re.sub(r'-|—', ' ', raw_sent)
    # remove double quotes
    raw_sent = re.sub(r'\"', '', raw_sent)
    # remove single quotes around words
    raw_sent = re.sub(r'[‘’]', '\'', raw_sent)
    raw_sent = re.sub(r'\'([a-z]+)\'', r'\1', raw_sent)
    raw_sent = re.sub(r' \'([a-z]+)\b', r' \1', raw_sent)
    raw_sent = re.sub(r'( [a-z]+s\') ', r'\1s ', raw_sent)
    # words like doin', goin' to be replaced with doing, going
    raw_sent = re.sub(r'([a-z]+)in\'', r'\1ing', raw_sent)
    # you've, there've replaced with you have, there have
    raw_sent = re.sub(r'([a-z]+)\'ve', r'\1 have', raw_sent)
    # we're replaced with we are
    raw_sent = re.sub(r'([a-z]+)\'re', r'\1 are', raw_sent)
    # treat commas and other EOS tags as words (put space around them)
    raw_sent = re.sub(r'\s*([\.,!\?]+)\s*', r' \1 ', raw_sent)
    # 's to be seperated from the word with some exceptions
    raw_sent = re.sub(r'([a-z]+)(\'s)\b', r'\1 \2 ', raw_sent)
    exceptions = r"\b(it|that|there|he|she|let) (\'s)"
    raw_sent = re.sub(exceptions, r'\1\2', raw_sent)
    # remove leading and trailing spaces
    raw_sent = re.sub(r'^\s*|\s*$', '', raw_sent)
    raw_sent = re.sub(r'\s+', ' ', raw_sent)
    # add <eos> tag at the end of sentence
    raw_sent = raw_sent + " <eos>"
    return(raw_sent)

In [14]:
# Just testing regex for clean_sentence function, delete this when done function building
import re
raw_sent = "   -Sai --Kiran---V's-123.. <a> Sai   \"you've been,   've doin' . somethin' but\" it 's ?everythin flat's ans' 'knows'  "
print(raw_sent)
exceptions = r"\b(it|that|there|he|she|let) (\'s)"
print(re.sub(exceptions, r'\1\2', raw_sent))
clean_sentence(raw_sent)

   -Sai --Kiran---V's-123.. <a> Sai   "you've been,   've doin' . somethin' but" it 's ?everythin flat's ans' 'knows'  
   -Sai --Kiran---V's-123.. <a> Sai   "you've been,   've doin' . somethin' but" it's ?everythin flat's ans' 'knows'  


"sai <pause> kiran <pause> v 's 123 <pause> sai you have been , ve doing . something but it s ? everythin flat 's ans 's knows <eos>"

In [15]:
all_dialogs = conv_lines.dialog.map(clean_sentence).tolist()
all_dialogs[74]

'sometimes i wonder if the guys we are supposed to want to go out with are the ones we actually want to go out with , you know ? <eos>'

In [16]:
import tensorflow as tf
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential

Using TensorFlow backend.


In [17]:
# integer encode text
tokenizer = Tokenizer(filters='#$%&()*+-/:;=@[\\]^_`{|}~\t\n“')
tokenizer.fit_on_texts(all_dialogs)

print(all_dialogs[144])
print(tokenizer.texts_to_sequences([all_dialogs[144]]))

# Removing words with frequency less than a threshold
count_thres = 2
low_count_words = [w for w,c in tokenizer.word_counts.items() if c < count_thres]
for w in low_count_words:
    del tokenizer.word_index[w]
    del tokenizer.word_docs[w]
    del tokenizer.word_counts[w]

print(tokenizer.texts_to_sequences([all_dialogs[144]]))

yeah , he's your freak friend mandella 's boyfriend . i guess since i'm not allowed to go out , i should obsess over a dead guy , too . <eos>
[[79, 3, 84, 30, 1883, 283, 23998, 29, 1161, 1, 5, 222, 325, 27, 35, 1687, 8, 63, 48, 3, 5, 137, 31163, 121, 10, 215, 170, 3, 110, 1, 2]]
[[79, 3, 84, 30, 1883, 283, 23998, 29, 1161, 1, 5, 222, 325, 27, 35, 1687, 8, 63, 48, 3, 5, 137, 121, 10, 215, 170, 3, 110, 1, 2]]


In [18]:
print("Number of words with low counts:", len(low_count_words))
print("Number of remaining words after removing low count words:", len(tokenizer.word_index))

Number of words with low counts: 19869
Number of remaining words after removing low count words: 31150


In [0]:
# Function to convert input sequence into ([train_seq], label) list
def create_lm_pairs(text_seq, train_seq_len = 3):
  text_seq_len = len(text_seq)
  train_pairs = []
  if (text_seq_len > train_seq_len):
    for i in range(text_seq_len - train_seq_len):
      train_pairs.append((text_seq[i:(i+train_seq_len)], text_seq[i+train_seq_len]))
  return(train_pairs)

In [0]:
# Function to encode input texts and return ([train_seq], label) list
def texts_to_lm_pairs(tokenizer, sentence_list, train_seq_len = 3):
  encoded = tokenizer.texts_to_sequences(sentence_list)
  all_train_pairs = [t for text_seq in tqdm(encoded) for t in create_lm_pairs(text_seq, train_seq_len)]
  return(all_train_pairs)

In [21]:
# convert all dialogs to text sentence pairs 
lm_ready_data = texts_to_lm_pairs(tokenizer, all_dialogs, 3)

100%|██████████| 304713/304713 [00:07<00:00, 42286.85it/s]


In [22]:
print("Number of total sequences for language model =", len(lm_ready_data))
lm_ready_data[:10]

Number of total sequences for language model = 3363372


[([41, 26, 35], 18),
 ([26, 35, 18], 2),
 ([41, 26, 8], 18),
 ([26, 8, 18], 2),
 ([5, 364, 47], 1),
 ([364, 47, 1], 2),
 ([68, 112, 6], 2),
 ([187, 63, 1], 2),
 ([112, 9, 4], 14),
 ([9, 4, 14], 118)]

### Download Glove word vectors for initializing embeddings

In [0]:
# Defining a function to show download progress
import progressbar
import urllib.request

pbar = None

def show_progress(block_num, block_size, total_size):
    global pbar
    if pbar is None:
        pbar = progressbar.ProgressBar(maxval=total_size)

    downloaded = block_num * block_size
    if downloaded < total_size:
        pbar.update(downloaded)
    else:
        pbar.finish()
        pbar = None

In [0]:
! mkdir -p /data/glove_6B_extracted

In [18]:
urllib.request.urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", "/data/glove.6B.zip", reporthook = show_progress)

100% (862182613 of 862182613) |##########| Elapsed Time: 0:05:09 Time:  0:05:09


('/data/glove.6B.zip', <http.client.HTTPMessage at 0x7f2eb8e24be0>)

In [0]:
import zipfile
zip_ref = zipfile.ZipFile("/data/glove.6B.zip", 'r')
zip_ref.extractall("/data/glove_6B_extracted/")
zip_ref.close()

In [18]:
! ls "/data/glove_6B_extracted/"

glove.6B.100d.txt  glove.6B.200d.txt  glove.6B.300d.txt  glove.6B.50d.txt


In [23]:
print('Indexing word vectors.')

embeddings_index = {}
f = open('/data/glove_6B_extracted/glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [0]:
vocab_size = len(tokenizer.word_index)

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i - 1] = embedding_vector

In [25]:
embedding_matrix.shape

(31150, 100)

In [26]:
# define model
model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=3)
model.add(embedding_layer)
model.add(LSTM(50, return_sequences = True))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 3, 100)            3115000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 3, 50)             30200     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 31150)             1588650   
Total params: 4,754,050
Trainable params: 4,754,050
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
X = [tr[0] for tr in lm_ready_data]
labels = [tr[1] for tr in lm_ready_data]
print(X[:5])
print(labels[:5])

[[41, 26, 35], [26, 35, 18], [41, 26, 8], [26, 8, 18], [5, 364, 47]]
[18, 2, 18, 2, 1]


In [0]:
# fit the model
model.fit([X, labels], labels, epochs=50, verbose=0)
# evaluate the model
# loss, accuracy = model.evaluate(X, labels, labels, verbose=0)
# print('Accuracy: %f' % (accuracy*100))