## Load dialogs from Cornell Movie Corpus, Analyze, Condition, Store
https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html

Retain:
- lines with >=5 and <=60 words
- words with occurance >= 210 times in all dialog remaining dialog lines
- lines with < 20% occurance of rare words (occurance < 210 times)

In [1]:
import numpy as np
import pandas as pd
import re
import pickle

In [7]:
# conversations txt file associate line numbers as sequential dialog -- remove unnecessary metadata
F_c = open('data\cornell movie-dialogs corpus\movie_conversations.txt')
F_cw = open('data\cornell movie-dialogs corpus\movie_conversations2.txt','w')
for row in F_c:
    cur = row.split(r' +++$+++ ')[3].rstrip()[1:-1]
    F_cw.write(cur+'\n')
F_cw.close()
F_c.close()

In [9]:
# Construct dictionary of "clean" movie dialog lines keyed to line number
D_lines = {}
F_l = open('data\cornell movie-dialogs corpus\movie_lines.txt')
for row in F_l:
    loc = row.find(r' +++$+++ ')
    loc2 = row.rfind(r' +++$+++ ')
    line_ind = row[:loc]
    mline = row[loc2+9:].replace("'", " '")
    mline = re.sub(r'((?:[^A-Za-z\s]|\s)+)', lambda x: ' ' if ' ' in x.group(0) else '', mline.rstrip().lower())
    mline = re.sub(' +', ' ', mline)
    D_lines[line_ind] = mline
F_l.close()

In [4]:
file_name = 'data/D_lines.pkl'
file_obj = open(file_name,'wb') 
pickle.dump(D_lines,file_obj)   
file_obj.close()

In [5]:
# 304,713 total lines available (not the same as paired lines in dialog)
len(D_lines.keys())

304713

In [3]:
# maximum number of words in a line = 563 ... too many
max_words = 0
for line in D_lines.values():
    cur = len(line.split(' '))
    if cur > max_words:
        max_words = cur
max_words

563

In [4]:
# restrict number of words in dialog lines to what *might* be reasonable >=5 words, <= 60 words ... leaves 211,628 lines
D_midlines = {}
for k in D_lines:
    if len(D_lines[k].split(' ')) <= 60 and len(D_lines[k].split(' ')) >= 5:
        D_midlines[k] = D_lines[k]
len(D_midlines.keys())

211628

In [5]:
conv = pd.read_table('data\cornell movie-dialogs corpus\movie_conversations2.txt', header=None)
conv.head()

Unnamed: 0,0
0,"'L194', 'L195', 'L196', 'L197'"
1,"'L198', 'L199'"
2,"'L200', 'L201', 'L202', 'L203'"
3,"'L204', 'L205', 'L206'"
4,"'L207', 'L208'"


In [6]:
conv2 = []
for row in conv[0]:
    l_s = row.split(',')
    row2 = []
    for L in l_s:
        row2.append(L[L.find("'")+1:L.rfind("'")])
    conv2.append(row2)
conv2[:10]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366']]

In [7]:
# map dialog sequences to back-and-forth individual initiating sequence/response sequence pairs
forth = []
back = []
for row in conv2:
    for line in range(len(row)-1):
        if (row[line] in D_midlines) and (row[line+1] in D_midlines):
            forth.append(row[line])
            back.append(row[line+1])
print(len(forth))
for i in range(10):
    print('forth: ', forth[i], ', back: ', back[i])

107662
forth:  L194 , back:  L195
forth:  L195 , back:  L196
forth:  L196 , back:  L197
forth:  L202 , back:  L203
forth:  L207 , back:  L208
forth:  L271 , back:  L272
forth:  L272 , back:  L273
forth:  L273 , back:  L274
forth:  L276 , back:  L277
forth:  L363 , back:  L364


In [8]:
# construct set of all unique words -- 56,174 words in lines of 5 to 60 words
word_list = set()
for line in D_midlines.values():
    for word in line.split(' '):
        word_list.add(word)
len(word_list)

56174

In [55]:
# dictionary counting instances of [word] in all dialog lines
word_count = dict()
for word in word_list:
    word_count[word] = 0
for line in D_midlines.values():
    for word in line.split(' '):
        word_count[word] += 1
word_ordered = []
for word in word_count:
    word_ordered.append([word, word_count[word]])
word_ordered.sort(key = lambda x : x[1])
count_thrt_m = 0
i = 0
while True:
    if word_ordered[i][1] < 210:  # define "rare" words as those occuring <14 in all lines
        count_thrt_m += 1
        i += 1
    else:
        break
print('Used 13 times or less = ', count_thrt_m, ', ', count_thrt_m/len(word_list)*100, '% of all words')
print('Total words = ', len(word_list), ', words used 14+ times = ', 
      len(word_list)-count_thrt_m, ', ',
      (len(word_list)-count_thrt_m)/len(word_list)*100,'%')
print('10 most common')
for i in range(10):
    print(word_ordered[-1-i])

Used 13 times or less =  55193 ,  98.25364047424075 % of all words
Total words =  56174 , words used 14+ times =  981 ,  1.7463595257592481 %
10 most common
['you', 128756]
['i', 121549]
['the', 86658]
['to', 72910]
['a', 62274]
['s', 57012]
['it', 56315]
['t', 49626]
['that', 39180]
['and', 38207]


In [56]:
# include lines that have high (>20% occurance) of rare words in discard set
high_rare = set()
for (line_n, line) in D_midlines.items():
    line_s = line.split()
    num_w = len(line_s)
    count_r = 0
    for word in line_s:
        if word_count[word] < 210:
            count_r += 1
    if count_r/num_w > 0.20:
            high_rare.add(line_n)
print('Lines with 15%+ rare word (<14 uses) content ', len(high_rare),', ', len(high_rare)/len(D_midlines)*100, '% of lines')

Lines with 15%+ rare word (<14 uses) content  53769 ,  25.407318502277583 % of lines


In [61]:
# construct conditioned dialog sets -- x = initiating sequence, y = response sequence
x_train_test = []
y_train_test = []
x_vec = []
y_vec = []
num_wrds_in = 0
num_wrds_out = 0
wrd_set = set()
wrd_set_in = set()
wrd_set_out = set()
forth_nr = []
back_nr = []
t = 0
for i in range(len(forth)):
    if (forth[i] in high_rare) or (back[i] in high_rare):
        continue
    else:
        forth_nr.append(forth[i])
        back_nr.append(back[i])
        cur1 = D_midlines[forth[i]].split()
        for (k,wrd) in enumerate(cur1):
            if word_count[wrd] < 210:
                cur1[k] = 'UNK'
        x_train_test.append(cur1)
        num_wrds_in += len(cur1)
        for wrd in cur1:
            wrd_set.add(wrd)
            wrd_set_in.add(wrd)
        cur2 = D_midlines[back[i]].split()
        for (k,wrd) in enumerate(cur2):
            if word_count[wrd] < 210:
                cur2[k] = 'UNK'
        y_train_test.append(cur2)
        num_wrds_out += len(cur2)
        for wrd in cur2:
            wrd_set.add(wrd)
            wrd_set_out.add(wrd)
        t += 1
print('Num conversations (non-rare): ', len(x_train_test))
print('Num words in intiating exchange: ', num_wrds_in)
print('Num unique words in intiating exchange: ', len(wrd_set_in))
print('Example initiating exchange:')
print(x_train_test[:10])
print()
print('Num words in response: ', num_wrds_out)
print('Num unique words in response: ', len(wrd_set_out))
print('Example response:')
print(y_train_test[:10])
print()
print('Total unique words -- initiations and responses in exchanges', len(wrd_set))

Num conversations (non-rare):  63689
Num words in intiating exchange:  844080
Num unique words in intiating exchange:  981
Example initiating exchange:
[['right', 'see', 'you', 're', 'ready', 'for', 'the', 'UNK'], ['i', 'don', 't', 'want', 'to', 'know', 'how', 'to', 'say', 'that', 'though', 'i', 'want', 'to', 'know', 'UNK', 'things', 'like', 'where', 'the', 'good', 'UNK', 'are', 'how', 'much', 'does', 'UNK', 'cost', 'stuff', 'like', 'UNK', 'i', 'have', 'never', 'in', 'my', 'life', 'had', 'to', 'point', 'out', 'my', 'head', 'to', 'someone'], ['how', 'is', 'our', 'little', 'find', 'the', 'UNK', 'a', 'date', 'plan', 'UNK'], ['you', 'got', 'something', 'on', 'your', 'mind'], ['i', 'really', 'really', 'really', 'wanna', 'go', 'but', 'i', 'can', 't', 'not', 'unless', 'my', 'sister', 'goes'], ['so', 'that', 's', 'the', 'kind', 'of', 'guy', 'she', 'likes', 'pretty', 'ones'], ['sometimes', 'i', 'wonder', 'if', 'the', 'guys', 'we', 're', 'supposed', 'to', 'want', 'to', 'go', 'out', 'with', 'are'

In [29]:
file_name = 'data/x_train_test5.pkl'
file_obj = open(file_name,'wb') 
pickle.dump(x_train_test,file_obj)   
file_obj.close()
file_name = 'data/y_train_test5.pkl'
file_obj = open(file_name,'wb') 
pickle.dump(y_train_test,file_obj)   
file_obj.close()
file_name = 'data/forth_nr5.pkl'
file_obj = open(file_name,'wb') 
pickle.dump(forth_nr,file_obj)   
file_obj.close()
file_name = 'data/back_nr5.pkl'
file_obj = open(file_name,'wb') 
pickle.dump(back_nr,file_obj)   
file_obj.close()