In [2]:
# Path to the Spanish-side (.es) training corpus, relative to the notebook's working directory.
file_path = "./en_es_data/train.es"

In [5]:
import nltk

In [6]:
from collections import Counter
from docopt import docopt
from itertools import chain
import json
import torch
from typing import List
from utils import read_corpus, pad_sents


class _IndexToWord(dict):
    """ Index -> word mapping that supports both `m[wid]` and `m(wid)`.
    VocabEntry keeps this mapping in the instance attribute `id2word`,
    which hides the `VocabEntry.id2word` method on instances. Making the
    mapping callable keeps `entry.id2word(wid)` working as well as
    `entry.id2word[wid]`.
    """
    def __call__(self, wid):
        """ Look up the word stored for an index.
        @param wid (int): word index
        @returns word (str): word corresponding to index
        """
        return self[wid]


class VocabEntry(object):
    """ Vocabulary Entry, i.e. structure containing either
    src or tgt language terms.
    """
    def __init__(self, word2id=None):
        """ Init VocabEntry Instance.
        @param word2id (dict): dictionary mapping words 2 indices. When it is
            falsy (None or empty), a fresh mapping holding only the four
            special tokens is created. A supplied mapping must contain '<unk>'.
        """
        if word2id:
            self.word2id = word2id
        else:
            self.word2id = dict()
            self.word2id['<pad>'] = 0   # Pad Token
            self.word2id['<s>'] = 1 # Start Token
            self.word2id['</s>'] = 2    # End Token
            self.word2id['<unk>'] = 3   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        # Callable dict: a plain dict here would make `entry.id2word(wid)`
        # fail, because the instance attribute shadows the method of the same name.
        self.id2word = _IndexToWord((v, k) for k, v in self.word2id.items())

    def __getitem__(self, word):
        """ Retrieve word's index. Return the index for the unk
        token if the word is out of vocabulary.
        @param word (str): word to look up.
        @returns index (int): index of word
        """
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """ Check if word is captured by VocabEntry.
        @param word (str): word to look up
        @returns contains (bool): whether word is contained
        """
        return word in self.word2id

    def __setitem__(self, key, value):
        """ Raise error, if one tries to edit the VocabEntry.
        Item assignment is always rejected; use `add` to insert new words.
        """
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        """ Compute number of words in VocabEntry.
        @returns len (int): number of words in VocabEntry
        """
        return len(self.word2id)

    def __repr__(self):
        """ Representation of VocabEntry to be used
        when printing the object.
        """
        return 'Vocabulary[size=%d]' % len(self)

    def id2word(self, wid):
        """ Return mapping of index to word.
        On instances this name resolves to the callable mapping set in
        __init__, so this method is only reached through the class, e.g.
        `VocabEntry.id2word(entry, wid)`.
        @param wid (int): word index
        @returns word (str): word corresponding to index
        """
        return self.id2word[wid]

    def add(self, word):
        """ Add word to VocabEntry, if it is previously unseen.
        @param word (str): word to add to VocabEntry
        @return index (int): index that the word has been assigned
        """
        if word not in self:
            # The next free index is the current vocabulary size.
            wid = self.word2id[word] = len(self)
            self.id2word[wid] = word
            return wid
        else:
            return self[word]

    def words2indices(self, sents):
        """ Convert list of words or list of sentences of words
        into list or list of list of indices.
        @param sents (list[str] or list[list[str]]): sentence(s) in words
        @return word_ids (list[int] or list[list[int]]): sentence(s) in indices;
            an empty input gives an empty list
        """
        # Nothing to inspect in an empty input; indexing sents[0] would raise.
        if not sents:
            return []
        # The first element decides whether the input is a batch of sentences
        # or a single sentence.
        if isinstance(sents[0], list):
            return [[self[w] for w in s] for s in sents]
        else:
            return [self[w] for w in sents]

    def indices2words(self, word_ids):
        """ Convert list of indices into words.
        @param word_ids (list[int]): list of word ids
        @return sents (list[str]): list of words
        """
        return [self.id2word[w_id] for w_id in word_ids]

    def to_input_tensor(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into a tensor of word indices.
        @param sents (List[List[str]]): list of sentences (words)
        @param device (torch.device): device on which the tensor is created
        @returns sents_var (torch.Tensor): long tensor, transposed so that it is
            (max_sentence_length, batch_size)
        """
        word_ids = self.words2indices(sents)
        # pad_sents comes from utils; presumably it pads every sentence to the
        # same length with the given pad index -- confirm against utils.
        sents_t = pad_sents(word_ids, self['<pad>'])
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
        return torch.t(sents_var)

    @staticmethod
    def from_corpus(corpus, size, freq_cutoff=2):
        """ Given a corpus construct a Vocab Entry.
        @param corpus (list[list[str]]): tokenized sentences, e.g. as produced by read_corpus
        @param size (int): maximum # of corpus words to add (the special tokens are extra)
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word
        @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus
        """
        vocab_entry = VocabEntry()
        word_freq = Counter(chain(*corpus))
        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
        print('number of word types: {}, number of word types w/ frequency >= {}: {}'
              .format(len(word_freq), freq_cutoff, len(valid_words)))
        # Most frequent first; the sort is stable, so ties keep first-seen order.
        top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size]
        for word in top_k_words:
            vocab_entry.add(word)
        return vocab_entry

In [7]:
def read_corpus(file_path, source):
    """ Read a corpus file holding one sentence per line and tokenize it.
    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    @returns data (list[list[str]]): one token list per line of the file;
        target sentences are wrapped in '<s>' ... '</s>'
    """
    data = []
    # The context manager closes the file even if tokenization fails. The
    # encoding is explicit so accented characters are decoded the same way
    # on every platform instead of depending on the locale default.
    with open(file_path, encoding='utf-8') as corpus_file:
        for line in corpus_file:
            sent = nltk.word_tokenize(line)
            # only append <s> and </s> to the target sentence
            if source == 'tgt':
                sent = ['<s>'] + sent + ['</s>']
            data.append(sent)

    return data

In [8]:
# Same corpus path as the assignment in the first cell, repeated so later cells can use it.
file_path = "./en_es_data/train.es"

In [9]:
# Tokenize the corpus as source-side text: with 'src', read_corpus adds no <s>/</s> markers.
src_sents = read_corpus(file_path, 'src')

In [13]:
# Number of tokenized sentences (read_corpus yields one per line of the file).
len(src_sents)

216617

In [15]:
# Build the source vocabulary: at most 50000 corpus words, each occurring at least 100 times.
src = VocabEntry.from_corpus(src_sents, 50000, 100)

number of word types: 93195, number of word types w/ frequency >= 100: 3095


In [17]:
# NOTE(review): this displays the entire word -> index dict; consider showing only a slice.
src.word2id

{'<pad>': 0,
 '<s>': 1,
 '</s>': 2,
 '<unk>': 3,
 ',': 4,
 '.': 5,
 'de': 6,
 'que': 7,
 'la': 8,
 'en': 9,
 'y': 10,
 'el': 11,
 'a': 12,
 'es': 13,
 'un': 14,
 'los': 15,
 'una': 16,
 'no': 17,
 'Y': 18,
 'se': 19,
 'lo': 20,
 'las': 21,
 'para': 22,
 'con': 23,
 'por': 24,
 'del': 25,
 'ms': 26,
 '?': 27,
 'como': 28,
 ':': 29,
 '``': 30,
 "''": 31,
 'al': 32,
 'su': 33,
 'me': 34,
 'si': 35,
 'Pero': 36,
 'o': 37,
 'muy': 38,
 'este': 39,
 'mi': 40,
 'esto': 41,
 'son': 42,
 'pero': 43,
 'esta': 44,
 'est': 45,
 'eso': 46,
 'No': 47,
 'As': 48,
 'cuando': 49,
 'hacer': 50,
 'ser': 51,
 'aos': 52,
 'todo': 53,
 'La': 54,
 'hay': 55,
 'algo': 56,
 'porque': 57,
 'qu': 58,
 'nos': 59,
 'mundo': 60,
 'Es': 61,
 'El': 62,
 'sus': 63,
 'era': 64,
 'sobre': 65,
 'cosas': 66,
 'En': 67,
 'gente': 68,
 '--': 69,
 'puede': 70,
 'personas': 71,
 'todos': 72,
 'vez': 73,
 'uno': 74,
 'fue': 75,
 'estn': 76,
 'ver': 77,
 'as': 78,
 'aqu': 79,
 'tiene': 80,
 'vida': 81,
 'ha': 82,
 'tambin': 83,

In [53]:
# NOTE(review): `vocab` is not defined in any cell shown here -- presumably an object
# loaded in another session; confirm where it comes from before re-running.
len(vocab.src)

77

In [7]:
# Preview the start of the corpus: tokenize the first lines, collect the
# tokens in `data`, and echo each raw line.
MAX_PREVIEW_LINES = 10
source = 'src'
# Start from an empty list so the cell works on a fresh kernel and re-running
# it does not append the same sentences again.
data = []
# The context manager closes the file; the explicit encoding keeps accented
# characters independent of the platform default.
with open(file_path, encoding='utf-8') as corpus_file:
    for i, line in enumerate(corpus_file, start=1):
        sent = nltk.word_tokenize(line)
        # only append <s> and </s> to the target sentence
        if source == 'tgt':
            sent = ['<s>'] + sent + ['</s>']
        data.append(sent)
        print(line)
        if i == MAX_PREVIEW_LINES:
            break

Muchas gracias Chris. Y es en verdad un gran honor tener la oportunidad de venir a este escenario por segunda vez. Estoy extremadamente agradecido.

He quedado conmovido por esta conferencia, y deseo agradecer a todos ustedes sus amables comentarios acerca de lo que tena que decir la otra noche.

Y digo eso sinceramente, en parte porque -- (Sollozos fingidos) -- lo necesito!  Pnganse en mi posicin!

Vol en el avin vicepresidencial por ocho aos.

Ahora tengo que quitarme mis zapatos o botas para subirme a un avin!

Les dir una rpida historia para ilustrar lo que ha sido para m.

Es una historia verdadera -- cada parte de esto es verdad.

Poco despus de que Tipper y yo dejamos la -- (Sollozos fingidos) -- Casa Blanca --  -- estbamos viajando desde nuestra casa en Nashville a una pequea granja que tenemos 50 millas al este de Nashville --

conduciendo nosotros mismos.

S que suena como cualquier cosa para ustedes, pero --  -- mir en el retrovisor y de repente simplemente me golpe. No haba

In [8]:
# NOTE(review): displays every tokenized sentence currently held in `data`;
# consider a slice such as data[:3] to keep the output short.
data

[['Muchas',
  'gracias',
  'Chris',
  '.',
  'Y',
  'es',
  'en',
  'verdad',
  'un',
  'gran',
  'honor',
  'tener',
  'la',
  'oportunidad',
  'de',
  'venir',
  'a',
  'este',
  'escenario',
  'por',
  'segunda',
  'vez',
  '.',
  'Estoy',
  'extremadamente',
  'agradecido',
  '.'],
 ['He',
  'quedado',
  'conmovido',
  'por',
  'esta',
  'conferencia',
  ',',
  'y',
  'deseo',
  'agradecer',
  'a',
  'todos',
  'ustedes',
  'sus',
  'amables',
  'comentarios',
  'acerca',
  'de',
  'lo',
  'que',
  'tena',
  'que',
  'decir',
  'la',
  'otra',
  'noche',
  '.'],
 ['Y',
  'digo',
  'eso',
  'sinceramente',
  ',',
  'en',
  'parte',
  'porque',
  '--',
  '(',
  'Sollozos',
  'fingidos',
  ')',
  '--',
  'lo',
  'necesito',
  '!',
  'Pnganse',
  'en',
  'mi',
  'posicin',
  '!'],
 ['Vol', 'en', 'el', 'avin', 'vicepresidencial', 'por', 'ocho', 'aos', '.'],
 ['Ahora',
  'tengo',
  'que',
  'quitarme',
  'mis',
  'zapatos',
  'o',
  'botas',
  'para',
  'subirme',
  'a',
  'un',
  'avin'

In [1]:
# Tokenize the whole corpus with the read_corpus helper rather than repeating
# its loop inline: the inline copy ended in a top-level `return`, which is
# only valid inside a function.
# `file_path` and `source` must already have been assigned by earlier cells.
data = read_corpus(file_path, source)
# Show only the sentence count; the full list is too large to display.
len(data)

NameError: name 'file_path' is not defined