In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
le = LabelEncoder()
ohe = OneHotEncoder(sparse=False)

In [3]:
path_to_corpus = '../corpora/cornell-movie/dialogues.txt'

In [29]:
with open(path_to_corpus, 'r') as f:
    lines = list(f)
        
# each line is a discrete dialogue, with utterances
# tab-delimited
dialogues = []
t_dialogues = []
utterances = []
t_utterances = []
words = []
for i, line in enumerate(lines):
    dialogue = line.split('\t')[:-1]
    
    t_dialogue = []
    for utt in dialogue:
        t_utt = utt.split(' ')
        t_dialogue += [t_utt]
        words += t_utt
    
    dialogues += [dialogue] # add the dialogue (i.e., list of utterances)
    utterances += dialogue # add the utterances to the flattened list of all utterances
    t_dialogues += [t_dialogue]
    t_utterances += t_dialogue

In [30]:
dialogues[0]

['can we make this quick ? <unk> <unk> and andrew barrett are having an incredibly <unk> public <unk> up on the <unk> . again .',
 "well , i thought we 'd start with <unk> , if that 's okay with you .",
 'not the <unk> and <unk> and spitting part . please .',
 "okay ... then how 'bout we try out some french <unk> . saturday ? night ?"]

In [31]:
utterances[0]

'can we make this quick ? <unk> <unk> and andrew barrett are having an incredibly <unk> public <unk> up on the <unk> . again .'

In [32]:
len(words)

4180266

In [33]:
words[0]

'can'

In [34]:
t_dialogues[0]

[['can',
  'we',
  'make',
  'this',
  'quick',
  '?',
  '<unk>',
  '<unk>',
  'and',
  'andrew',
  'barrett',
  'are',
  'having',
  'an',
  'incredibly',
  '<unk>',
  'public',
  '<unk>',
  'up',
  'on',
  'the',
  '<unk>',
  '.',
  'again',
  '.'],
 ['well',
  ',',
  'i',
  'thought',
  'we',
  "'d",
  'start',
  'with',
  '<unk>',
  ',',
  'if',
  'that',
  "'s",
  'okay',
  'with',
  'you',
  '.'],
 ['not',
  'the',
  '<unk>',
  'and',
  '<unk>',
  'and',
  'spitting',
  'part',
  '.',
  'please',
  '.'],
 ['okay',
  '...',
  'then',
  'how',
  "'bout",
  'we',
  'try',
  'out',
  'some',
  'french',
  '<unk>',
  '.',
  'saturday',
  '?',
  'night',
  '?']]

In [35]:
t_utterances[0]

['can',
 'we',
 'make',
 'this',
 'quick',
 '?',
 '<unk>',
 '<unk>',
 'and',
 'andrew',
 'barrett',
 'are',
 'having',
 'an',
 'incredibly',
 '<unk>',
 'public',
 '<unk>',
 'up',
 'on',
 'the',
 '<unk>',
 '.',
 'again',
 '.']

In [45]:
vocab = set(words)

In [48]:
len(list(vocab))

10001

In [47]:
ie_words = le.fit_transform(list(vocab))

In [49]:
integer_encoded = le.transform(t_utterances[0])

In [50]:
integer_encoded

array([1382, 9651, 5410, 8971, 7051,  123,  121,  121,  422,  424,  802,
        536, 4092,  411, 4473,  121, 6973,  121, 9387, 6211, 8920,  121,
         50,  278,   50])

In [52]:
ohe.fit(ie_words.reshape(len(ie_words), 1))

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=False)

In [55]:
ohe_encoded = ohe.transform(integer_encoded.reshape(len(integer_encoded), 1))

In [56]:
ohe_encoded

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [57]:
from numpy import argmax

In [62]:
ie_encoded_test = [argmax(ohe_encoded[i, :]) for i in range(len(ohe_encoded))]

In [65]:
ie_encoded_test == integer_encoded

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True], dtype=bool)

In [66]:
max_utterance_length = max([len(u) for u in t_utterances])

In [67]:
max_utterance_length

684

In [68]:
max_utterance_index = argmax([len(u) for u in t_utterances])

In [69]:
utterances[max_utterance_index]

"then let 's begin with the story itself . it 's a story of the grail myth ... and although there are several <unk> , my favorite begins with the fisher king as a young boy ... who had to spend a night alone in the forest to prove his courage ... and during that night , he is visited by a sacred vision . out of the fire , appears the holy grail - god 's highest symbol of divine grace . and a voice says to the boy , `` you shall be the guardian of the grail , that it may heal the hearts of men '' ... but the boy was overcome ... innocent and foolish , he was <unk> by greater <unk> - a life ahead filled with beauty and glory , hope and power ... tears filled his eyes as he sensed his own ... <unk> . a boy 's tears of naive wonder and inspiration . and in this state of ... radical <unk> ... he felt for a brief moment , not like a boy , but like god ... ... and so he reached into the fire to take the grail . and the grail vanished . and the boy hands were left caught in the flames ... leav

In [74]:
seqlens_dia = []
seqlens_utt = []
for t_dialogue in t_dialogues:
    # get dialogue length
    seqlens_dia += [len(t_dialogue)]
    
    # get constituent utterances lengths
    lens = [len(u) for u in t_dialogue]
    seqlens_utt += [lens]
    
seqlens_utt[0]

[25, 17, 11, 16]

In [75]:
len(t_dialogues[0])

4

In [76]:
len(t_dialogues[0][0])

25

In [77]:
seqlens_dia[0]

4

In [78]:
ie = LabelEncoder()
ohe = OneHotEncoder(sparse=False)

vocab = list(set(words))
        
# fit the encoders to the corpus vocabulary
ie_vocab = ie.fit_transform(vocab)
ohe_vocab = ohe.fit_transform(ie_vocab.reshape(len(ie_vocab), 1))

In [79]:
ohe_vocab[0]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [80]:
argmax(ohe_vocab[0])

4075

In [81]:
le.inverse_transform(argmax(ohe_vocab[0]))

'hate'

In [82]:
words[0]

'can'

In [83]:
vocab[0]

'hate'

In [85]:
utterance = t_utterances[0]

In [86]:
utterance

['can',
 'we',
 'make',
 'this',
 'quick',
 '?',
 '<unk>',
 '<unk>',
 'and',
 'andrew',
 'barrett',
 'are',
 'having',
 'an',
 'incredibly',
 '<unk>',
 'public',
 '<unk>',
 'up',
 'on',
 'the',
 '<unk>',
 '.',
 'again',
 '.']

In [87]:
# vectorize an utterance
ie_utterance = ie.transform(utterance)
ohe_utterance = ohe.transform(ie_utterance.reshape(len(ie_utterance), 1))
ohe_utterance

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [88]:
# now transform it back
ie_utterance = [argmax(w) for w in ohe_utterance]
utterance = le.inverse_transform(ie_utterance)
utterance

array(['can', 'we', 'make', 'this', 'quick', '?', '<unk>', '<unk>', 'and',
       'andrew', 'barrett', 'are', 'having', 'an', 'incredibly', '<unk>',
       'public', '<unk>', 'up', 'on', 'the', '<unk>', '.', 'again', '.'],
      dtype='<U16')