## Encode Words as 50-Dimensional Vectors Based on Stanford NLP GloVe Pre-Trained on Wikipedia 2014 + Gigaword 5
https://nlp.stanford.edu/projects/glove/

In [1]:
import numpy as np
import pickle

In [2]:
# load conditioned movie dialogs
# x_words / y_words are whatever objects were pickled at the two paths below;
# the following cells index them as parallel lists of word lists.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by a trusted earlier step of this pipeline.
file_name = 'data/x_train_test4.pkl'
with open(file_name, 'rb') as file_obj:  # closed even if unpickling fails
    x_words = pickle.load(file_obj)
file_name = 'data/y_train_test4.pkl'
with open(file_name, 'rb') as file_obj:
    y_words = pickle.load(file_obj)

In [3]:
# collapse every word to lower-case
# Walks x_words by position and rewrites the dialog at the same position in
# both x_words and y_words with a lower-cased copy of its words.
for idx, dialog in enumerate(x_words):
    x_words[idx] = [token.lower() for token in dialog]
    y_words[idx] = [token.lower() for token in y_words[idx]]

In [4]:
# Number of dialogs in the loaded corpus (length of x_words); the recorded
# cell output below shows 91,458 total dialogs available.
len(x_words)

91458

In [4]:
# Complete GloVe dictionary
# Builds wrd_vec_all: first whitespace-separated field of each row (the token)
# -> list of the remaining fields converted to float (the embedding).
wrd_vec_all = {}
with open('data/glove_6B_50d.txt', encoding='utf-8') as f:  # closed even on a parse error
    for row in f:
        row_vec = row.split()
        if not row_vec:
            # blank line (e.g. trailing newline at end of file): nothing to parse
            continue
        wrd_vec_all[row_vec[0]] = [float(value) for value in row_vec[1:]]

In [5]:
# subset -- embeddings for the words that actually occur in the local corpus
# Every word found in wrd_vec_all is copied (by reference) into wrd_vec; every
# word not found is overwritten in place in its dialog with the token 'unk'.
wrd_vec = {}
for corpus in (x_words, y_words):
    for dialog in corpus:
        for pos, token in enumerate(dialog):
            if token in wrd_vec_all:
                wrd_vec[token] = wrd_vec_all[token]
            else:
                dialog[pos] = 'unk'

# Special tokens borrow the vectors of existing GloVe entries:
#   'unk'     <- '_____'  (vector for low-frequency and not-found words)
#   '*start*' <- '*'      (start word for decoder input)
#   '*stop*'  <- '^'      (stop word for decoder target/output)
# The original author describes '*' and '^' as icons unused in the corpus.
for alias, glove_key in (('unk', '_____'), ('*start*', '*'), ('*stop*', '^')):
    wrd_vec[alias] = wrd_vec_all[glove_key]

In [6]:
# randomly split into training and test sets -- 60,000 dialogs for training; 20,000 for test
import random

# Re-seeding with the same value before each in-place shuffle is what keeps
# x_words[i] paired with y_words[i]; this relies on both lists having the
# same length.
for corpus in (x_words, y_words):
    random.seed(32)
    random.shuffle(corpus)

n_train = 60000
split_end = 80000  # test set is the 20,000 dialogs following the training slice
x_train, y_train = x_words[:n_train], y_words[:n_train]
x_test, y_test = x_words[n_train:split_end], y_words[n_train:split_end]

In [21]:
# encode training vectors
#
# Encoder side (x_train): each dialog becomes exactly max_len vectors -- the
# embeddings of its first max_len words followed by zero-vector padding.
# x_tr_r_vec holds the same rows in reversed order.
#
# Decoder side (y_train): for a reply w1..wn
#   input  = *start*, w1, ..., wn, padding
#   target = w1, ..., wn, *stop*, padding
# Replies are NOT truncated; padding is added only while n < max_len, so a
# reply of up to max_len words yields max_len + 1 steps.
#
# Word vectors are appended by reference (shared with wrd_vec); each padding
# entry is a fresh copy of zer_vec.
# An empty dialog now encodes to padding only (and an empty reply to
# *start* / *stop* plus padding) instead of raising IndexError.
max_len = 60
zer_vec = [0]*50  # padding vector; width matches the 50-d GloVe embeddings
x_tr_vec = []
y_tr_in_vec = []
y_tr_tar_vec = []
x_tr_r_vec = []
for dialog in x_train:
    row = [wrd_vec[word] for word in dialog[:max_len]]
    row.extend(zer_vec[:] for _ in range(max_len - len(row)))
    x_tr_vec.append(row)
    x_tr_r_vec.append(row[::-1])
for reply in y_train:
    reply_vecs = [wrd_vec[word] for word in reply]
    n_pad = max(0, max_len - len(reply))
    y_tr_in_vec.append(
        [wrd_vec['*start*']] + reply_vecs + [zer_vec[:] for _ in range(n_pad)]
    )
    y_tr_tar_vec.append(
        reply_vecs + [wrd_vec['*stop*']] + [zer_vec[:] for _ in range(n_pad)]
    )

In [23]:
# save training vectors (and the corpus embedding dictionary wrd_vec)
# Each object is pickled to its own file; `with` guarantees the handle is
# closed even if pickling fails part-way through.
for file_name, payload in (
    ('data/x_tr_vec4.pkl', x_tr_vec),
    ('data/x_tr_r_vec4.pkl', x_tr_r_vec),
    ('data/y_tr_in_vec4.pkl', y_tr_in_vec),
    ('data/y_tr_tar_vec4.pkl', y_tr_tar_vec),
    ('data/wrd_vec4.pkl', wrd_vec),
):
    with open(file_name, 'wb') as file_obj:
        pickle.dump(payload, file_obj)

# Drop the names of the large encoded training lists now that they are on
# disk. `payload` still refers to wrd_vec (the last item), which is kept.
del x_tr_vec, x_tr_r_vec, y_tr_in_vec, y_tr_tar_vec

In [24]:
# encode test vectors
#
# Encoder side (x_test): each dialog becomes exactly max_len vectors -- the
# embeddings of its first max_len words followed by zero-vector padding.
# x_ts_r_vec holds the same rows in reversed order.
#
# Decoder side (y_test): for a reply w1..wn
#   input  = *start*, w1, ..., wn, padding
#   target = w1, ..., wn, *stop*, padding
# Replies are NOT truncated; padding is added only while n < max_len, so a
# reply of up to max_len words yields max_len + 1 steps.
#
# Word vectors are appended by reference (shared with wrd_vec); each padding
# entry is a fresh copy of zer_vec.
# An empty dialog now encodes to padding only (and an empty reply to
# *start* / *stop* plus padding) instead of raising IndexError.
max_len = 60
zer_vec = [0]*50  # padding vector; width matches the 50-d GloVe embeddings
x_ts_vec = []
y_ts_in_vec = []
y_ts_tar_vec = []
x_ts_r_vec = []
for dialog in x_test:
    row = [wrd_vec[word] for word in dialog[:max_len]]
    row.extend(zer_vec[:] for _ in range(max_len - len(row)))
    x_ts_vec.append(row)
    x_ts_r_vec.append(row[::-1])
for reply in y_test:
    reply_vecs = [wrd_vec[word] for word in reply]
    n_pad = max(0, max_len - len(reply))
    y_ts_in_vec.append(
        [wrd_vec['*start*']] + reply_vecs + [zer_vec[:] for _ in range(n_pad)]
    )
    y_ts_tar_vec.append(
        reply_vecs + [wrd_vec['*stop*']] + [zer_vec[:] for _ in range(n_pad)]
    )

In [27]:
# save test vectors
# Each object is pickled to its own file; `with` guarantees the handle is
# closed even if pickling fails part-way through.
for file_name, payload in (
    ('data/x_ts_vec4.pkl', x_ts_vec),
    ('data/x_ts_r_vec4.pkl', x_ts_r_vec),
    ('data/y_ts_in_vec4.pkl', y_ts_in_vec),
    ('data/y_ts_tar_vec4.pkl', y_ts_tar_vec),
):
    with open(file_name, 'wb') as file_obj:
        pickle.dump(payload, file_obj)

In [7]:
# construct the dialog sample for the seq2seq bot detector
# NOTE(review): the original note here read "(40,000 training, 40,000 test)",
# but this cell only takes a single 40,000-dialog slice -- confirm where the
# train/test division is made.
import random

# Same seed before each in-place shuffle so x_words and y_words stay aligned
# (relies on both lists having the same length).
for corpus in (x_words, y_words):
    random.seed(48)
    random.shuffle(corpus)

n_detector = 40000
x = x_words[:n_detector]
y = y_words[:n_detector]

In [9]:
# encode bot-detector vectors
#
# Encoder side (x): each dialog becomes exactly max_len vectors -- the
# embeddings of its first max_len words followed by zero-vector padding.
# x_tur_r_vec holds the same rows in reversed order.
#
# Decoder side (y): for a reply w1..wn
#   input  (y_tur_in_vec)  = *start*, w1, ..., wn, padding
#   target (y_hum_tar_vec) = w1, ..., wn, *stop*, padding
# Replies are NOT truncated; padding is added only while n < max_len, so a
# reply of up to max_len words yields max_len + 1 steps.
#
# Word vectors are appended by reference (shared with wrd_vec); each padding
# entry is a fresh copy of zer_vec.
# An empty dialog now encodes to padding only (and an empty reply to
# *start* / *stop* plus padding) instead of raising IndexError.
max_len = 60
zer_vec = [0]*50  # padding vector; width matches the 50-d GloVe embeddings
x_tur_vec = []
y_tur_in_vec = []
y_hum_tar_vec = []
x_tur_r_vec = []
for dialog in x:
    row = [wrd_vec[word] for word in dialog[:max_len]]
    row.extend(zer_vec[:] for _ in range(max_len - len(row)))
    x_tur_vec.append(row)
    x_tur_r_vec.append(row[::-1])
for reply in y:
    reply_vecs = [wrd_vec[word] for word in reply]
    n_pad = max(0, max_len - len(reply))
    y_tur_in_vec.append(
        [wrd_vec['*start*']] + reply_vecs + [zer_vec[:] for _ in range(n_pad)]
    )
    y_hum_tar_vec.append(
        reply_vecs + [wrd_vec['*stop*']] + [zer_vec[:] for _ in range(n_pad)]
    )

In [10]:
# save bot-detector vectors
# Each object is pickled to its own file; `with` guarantees the handle is
# closed even if pickling fails part-way through.
for file_name, payload in (
    ('data/x_tur_vec4.pkl', x_tur_vec),
    ('data/x_tur_r_vec4.pkl', x_tur_r_vec),
    ('data/y_tur_in_vec4.pkl', y_tur_in_vec),
    ('data/y_hum_tar_vec4.pkl', y_hum_tar_vec),
):
    with open(file_name, 'wb') as file_obj:
        pickle.dump(payload, file_obj)