In [0]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
root_dir = "/content/gdrive/My Drive/workspaces/"
base_dir = root_dir + 'data/'

kaggle_json = root_dir + 'kaggle.json'

! mkdir -p ~/.kaggle/
! cp "$kaggle_json" ~/.kaggle/

Mounted at /content/gdrive


In [0]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import tqdm
%matplotlib inline

In [0]:
# data url: http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
! ls -ltrh "{base_dir}cornell_movie_dialogs_corpus"

total 41M
-rw------- 1 root root 284K May  7 14:51 chameleons.pdf
-rw------- 1 root root 690K May  7 14:51 movie_characters_metadata.txt
-rw------- 1 root root 4.1K May  7 14:51 README.txt
-rw------- 1 root root  55K May  7 14:51 raw_script_urls.txt
-rw------- 1 root root  66K May  7 14:51 movie_titles_metadata.txt
-rw------- 1 root root 6.5M May  7 14:51 movie_conversations.txt
-rw------- 1 root root  34M May  7 14:52 movie_lines.txt


In [0]:
# Read files
with open(base_dir + "cornell_movie_dialogs_corpus/movie_conversations.txt", 'r', encoding="latin1") as conv_file:
    conv_raw = pd.read_csv(conv_file, sep=" \+\+\+\$\+\+\+ ", header = None, engine = 'python')
    conv_raw.columns = ['person1', 'person2', 'movie', 'conv_seq']

with open(base_dir + "cornell_movie_dialogs_corpus/movie_lines.txt", 'r', encoding="latin1") as conv_file:
    conv_lines = pd.read_csv(conv_file, sep=" \+\+\+\$\+\+\+ ", header = None, engine = 'python')
    conv_lines.columns = ['line_num', 'person', 'movie', 'person_name', 'dialog']

In [0]:
conv_raw.head(5)

Unnamed: 0,person1,person2,movie,conv_seq
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']"
1,u0,u2,m0,"['L198', 'L199']"
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']"
3,u0,u2,m0,"['L204', 'L205', 'L206']"
4,u0,u2,m0,"['L207', 'L208']"


In [0]:
conv_lines.head(5)

Unnamed: 0,line_num,person,movie,person_name,dialog
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [0]:
# Let's work on conv_raw
# the 'conv_seq' column is in the form of a array, let's use it
# and also add couple of columns for first and last lines in the dialog
from ast import literal_eval
conv_raw['conv_seq'] = conv_raw['conv_seq'].apply(lambda x: [line.replace('L', '') for line in literal_eval(x)])
conv_raw['first_line'] = conv_raw['conv_seq'].apply(lambda x: x[0])
conv_raw['last_line'] = conv_raw['conv_seq'].apply(lambda x: x[-1])

In [0]:
# Let's add a few more columns similar to index
conv_raw.sort_values(by = ['movie', 'person1', 'person2', 'first_line']).reset_index(drop = True)
conv_raw['unique_ppm'] = conv_raw['person1'] + conv_raw['person2'] + conv_raw['movie']
conv_raw['unique_row_id'] = conv_raw['unique_ppm'] + "_" + conv_raw['last_line']
conv_raw.head(10)

Unnamed: 0,person1,person2,movie,conv_seq,first_line,last_line,unique_ppm,unique_row_id
0,u0,u2,m0,"[194, 195, 196, 197]",194,197,u0u2m0,u0u2m0_197
1,u0,u2,m0,"[198, 199]",198,199,u0u2m0,u0u2m0_199
2,u0,u2,m0,"[200, 201, 202, 203]",200,203,u0u2m0,u0u2m0_203
3,u0,u2,m0,"[204, 205, 206]",204,206,u0u2m0,u0u2m0_206
4,u0,u2,m0,"[207, 208]",207,208,u0u2m0,u0u2m0_208
5,u0,u2,m0,"[271, 272, 273, 274, 275]",271,275,u0u2m0,u0u2m0_275
6,u0,u2,m0,"[276, 277]",276,277,u0u2m0,u0u2m0_277
7,u0,u2,m0,"[280, 281]",280,281,u0u2m0,u0u2m0_281
8,u0,u2,m0,"[363, 364]",363,364,u0u2m0,u0u2m0_364
9,u0,u2,m0,"[365, 366]",365,366,u0u2m0,u0u2m0_366


In [0]:
# I'm trying to concatenate dialogues between the same two people
# and if the dialogues are in succession but broken into different rows in the data
g = (conv_raw['unique_ppm'] + "_" + (conv_raw['first_line'].map(int) - 1).map(str) != conv_raw.shift().fillna(method='bfill')['unique_row_id']).cumsum().rename('group')
conv_agg = conv_raw.groupby(['person1', 'person2', 'movie', g])['conv_seq'].apply(list).reset_index().drop('group',axis=1)
conv_agg['conv_seq'] = conv_agg['conv_seq'].apply(lambda l: [item for sublist in l for item in sublist])

In [0]:
print(conv_agg.head(10))
print(conv_raw.shape, conv_agg.shape)

  person1 person2 movie                                           conv_seq
0      u0     u11    m0                          [179, 180, 181, 182, 183]
1      u0     u11    m0                                         [189, 190]
2      u0     u11    m0                          [517, 518, 519, 520, 521]
3      u0     u11    m0                                         [523, 524]
4      u0     u11    m0                          [536, 537, 538, 539, 540]
5      u0     u11    m0                                    [544, 545, 546]
6      u0     u11    m0                [878, 879, 880, 881, 882, 883, 884]
7      u0     u11    m0                                         [922, 923]
8      u0      u2    m0  [194, 195, 196, 197, 198, 199, 200, 201, 202, ...
9      u0      u2    m0                [271, 272, 273, 274, 275, 276, 277]
(83097, 8) (60699, 4)


In [0]:
# Creating a function to clean out the input sentences
import re

def clean_sentence(raw_sent):
    raw_sent = raw_sent.lower()
    # remove html tags like <i>, </u>
    raw_sent = re.sub(r'<\/*[a-z]?>', '', raw_sent)
    # special character for 'pause' (..., ---) replaced with "<pause>"
    raw_sent = re.sub(r'\s*(\.\.+|--+)\s*', ' <pause> ', raw_sent)
    # words like doin', goin' to be replaced with doing, going
    raw_sent = re.sub(r'([a-z]+)in\'', r'\1ing', raw_sent)
    # you've, there've replaced with you have, there have
    raw_sent = re.sub(r'([a-z]+)\'ve', r'\1 have', raw_sent)
    # remove double quotes
    raw_sent = re.sub(r'\"', '', raw_sent)
    # treat commas and other EOS tags as words (put space around them)
    raw_sent = re.sub(r'\s*([\.,!\?]+)\s*', r' \1 ', raw_sent)
    # 's to be seperated from the word with some exceptions
    raw_sent = re.sub(r'([a-z]+)(\'s)', r'\1 \2', raw_sent)
    exceptions = r"\b(it|that|there|he|she) (\'s)"
    raw_sent = re.sub(exceptions, r'\1\2', raw_sent)
    # remove leading and trailing spaces
    raw_sent = re.sub(r'^\s*|\s*$', '', raw_sent)
    raw_sent = re.sub(r'\s+', ' ', raw_sent)
    # add <eos> tag at the end of sentence
    # raw_sent = raw_sent + " <eos>"
    return(raw_sent)

In [0]:
# Just testing regex for next function, delete this
import re
raw_sent = "   -Sai --Kiran---V's-123.. <a> Sai   \"you've been,   've doin' . somethin' but\" it 's ?everythin   "
print(raw_sent)
exceptions = r"\b(it|that|there|he|she|let) (\'s)"
print(re.sub(exceptions, r'\1\2', raw_sent))
clean_sentence(raw_sent)

   -Sai --Kiran---V's-123.. <a> Sai   "you've been,   've doin' . somethin' but" it 's ?everythin   
   -Sai --Kiran---V's-123.. <a> Sai   "you've been,   've doin' . somethin' but" it's ?everythin   


"-sai <pause> kiran <pause> v 's-123 <pause> sai you have been , 've doing . something but it's ? everythin"

In [0]:
print(sum(conv_lines.dialog.map(lambda x: x is None)))
# Looks like some error reading few lines of conv_lines dataset
for i in conv_lines[conv_lines['dialog'].isnull()].index.tolist():
  conv_lines.loc[i, 'person_name'] = conv_lines.loc[i, 'person_name'].replace(' +++$+++', '')
conv_lines.dialog.fillna(value="", inplace=True)
print(sum(conv_lines.dialog.map(lambda x: x is None)))

In [0]:
all_dialogs = np.array(conv_lines.dialog)
all_dialogs = clean_sentence(' '.join(all_dialogs))

In [0]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential

In [0]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_dialogs)