In [37]:
import numpy as np
import pandas as pd
import pickle
import os
from torch.utils.data import Dataset
import torch

In [2]:
SOS_token = 0
EOS_token = 1
UNK_IDX = 2
PAD_IDX = 3

In [3]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = ["SOS","EOS","UKN","PAD"]
        self.n_words = 4  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word.append(word)
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
def read_dataset(file):
    f = open(file)
    list_l = []
    for line in f:
        list_l.append(line.strip())
    df = pd.DataFrame()
    df['data'] = list_l
    return df

In [5]:
def token2index_dataset(df, source_lang_obj, target_lang_obj):
    for lan in ['source','target']:
        indices_data = []
        if lan=='source':
            lang_obj = source_lang_obj
        else:
            lang_obj = target_lang_obj
            
        for tokens in df[lan+'_tokenized']:
            
            index_list = [lang_obj.word2index[token] if token in lang_obj.word2index else UNK_IDX for token in tokens]
            index_list.append(EOS_token)
            indices_data.append(index_list)
            
        df[lan+'_indized'] = indices_data
        
    return df

In [6]:
def load_or_create_language_obj(source_name, source_lang_obj_path, source_data):
    
    if not os.path.exists(source_lang_obj_path):
        os.makedirs(source_lang_obj_path)
    
    full_file_path = os.path.join(source_lang_obj_path, source_name+'_lang_obj.p')
    
    if os.path.isfile(full_file_path):
        source_lang_obj = pickle.load( open( full_file_path, "rb" ) );
    else:
        source_lang_obj = Lang(source_name);
        for line in source_data:
            source_lang_obj.addSentence(line);
        pickle.dump( source_lang_obj, open(full_file_path , "wb" ) )
        
    return source_lang_obj

In [7]:
def load_language_pairs(source_path, target_path, source_name = 'en', target_name = 'vi',
                        lang_obj_path = '.', Max_Len = 10):
    source = read_dataset(source_path);
    target = read_dataset(target_path);
    
    main_df = pd.DataFrame();
    main_df['source_data'] = source['data'];
    main_df['target_data'] = target['data'];
    
    source_lang_obj = load_or_create_language_obj(source_name, lang_obj_path, main_df['source_data']);
    target_lang_obj = load_or_create_language_obj(target_name, lang_obj_path, main_df['target_data']);
    
    for x in ['source', 'target']:
        main_df[x+'_tokenized'] = main_df[x + "_data"].apply(lambda x:x.lower().split() );
        main_df[x+'_len'] = main_df[x+'_tokenized'].apply(lambda x: len(x)+1) #+1 for EOS
    
    main_df = token2index_dataset(main_df, source_lang_obj, target_lang_obj);
    
    main_df = main_df[ np.logical_and( np.logical_and(main_df['source_len'] >=2, main_df['target_len'] >=2) , 
                                  np.logical_and( main_df['source_len'] <= Max_Len, main_df['target_len'] <= Max_Len) ) ];
    
    return main_df, source_lang_obj, target_lang_obj
    

In [8]:
en_train_path = 'Data/iwslt-vi-en/train.tok.en'
vi_train_path = 'Data/iwslt-vi-en/train.tok.vi'

en_val_path = 'Data/iwslt-vi-en/dev.tok.en'
vi_val_path = 'Data/iwslt-vi-en/dev.tok.vi'

In [12]:
temp, _, _ = load_language_pairs(en_val_path, vi_val_path, 'en', 'vi')

In [14]:
temp.head()

Unnamed: 0,source_data,target_data,source_tokenized,source_len,target_tokenized,target_len,source_indized,target_indized
1,And I was very proud .,Tôi đã rất tự_hào về đất_nước tôi .,"[and, i, was, very, proud, .]",7,"[tôi, đã, rất, tự_hào, về, đất_nước, tôi, .]",9,"[16, 2, 6, 29, 30, 27, 1]","[5, 32, 33, 34, 35, 15, 5, 31, 1]"
9,I was so shocked .,Tôi đã bị sốc .,"[i, was, so, shocked, .]",6,"[tôi, đã, bị, sốc, .]",6,"[2, 6, 121, 126, 27, 1]","[5, 32, 152, 153, 31, 1]"
21,But many die .,Nhưng rất nhiều người đã chết .,"[but, many, die, .]",5,"[nhưng, rất, nhiều, người, đã, chết, .]",8,"[42, 181, 125, 27, 1]","[51, 33, 40, 115, 32, 149, 31, 1]"
35,She &apos;s not North Korean . &quot;,"Nó không phải là người Bắc Triều_Tiên . ""","[she, &apos;s, not, north, korean, ., &quot;]",8,"[nó, không, phải, là, người, bắc, triều_tiên, ...",10,"[954, 96, 80, 3465, 2, 27, 23, 1]","[185, 53, 29, 14, 115, 3504, 2, 31, 24, 1]"
38,These girls were so lucky .,những cô gái này đã rất may_mắn .,"[these, girls, were, so, lucky, .]",7,"[những, cô, gái, này, đã, rất, may_mắn, .]",9,"[483, 336, 130, 121, 337, 27, 1]","[128, 274, 349, 130, 32, 33, 350, 31, 1]"


In [15]:
class LanguagePair(Dataset):
    def __init__(self, source_name, target_name, source_path, target_path, 
                    lang_obj_path, max_len):
        
        self.source_name = source_name;
        self.target_name = target_name; 
        
        self.main_df, self.source_lang_obj, self.target_lang_obj = load_language_pairs(source_path, target_path, 
                                                                              source_name, target_name, lang_obj_path,
                                                                              max_len);
        
    def __len__(self):
        return len( self.main_df )
    
    def __getitem__(self, idx):
        
        return [self.main_df.iloc[idx]['source_indized'], self.main_df.iloc[idx]['target_indized'], 
                    self.main_df.iloc[idx]['source_len'], self.main_df.iloc[idx]['target_len'] ]

In [16]:
temp_dataset = LanguagePair('en', 'vi', en_val_path, vi_val_path, 'Language_Objects', 10)

In [17]:
temp_dataset[10]

[[2, 208, 596, 8, 597, 16, 29, 598, 27, 1],
 [2, 595, 596, 8, 597, 20, 33, 598, 8, 1],
 10,
 10]

In [25]:
print([temp_dataset.source_lang_obj.index2word[x] for x in temp_dataset[177][0] ])
temp.iloc[177]

['we', 'got', 'dirty', ',', 'and', 'we', 'loved', 'it', '.', 'EOS']


source_data                          We got dirty , and we loved it .
target_data                  Chúng_tôi lem_luốc , nhưng ai cũng vui .
source_tokenized           [we, got, dirty, ,, and, we, loved, it, .]
source_len                                                         10
target_tokenized    [chúng_tôi, lem_luốc, ,, nhưng, ai, cũng, vui, .]
target_len                                                          9
source_indized            [33, 495, 3321, 8, 16, 33, 985, 253, 27, 1]
target_indized                [38, 3458, 8, 51, 186, 230, 850, 31, 1]
Name: 1081, dtype: object

In [33]:
def vocab_collate_func(batch, MAX_LEN):
    source_data = []
    target_data = []
    source_len = []
    target_len = []

    for datum in batch:
        source_len.append(datum[2])
        target_len.append(datum[3])
    # padding
    for datum in batch:
        if datum[2]>MAX_LEN:
            padded_vec_s1 = np.array(datum[0])[:MAX_LEN]
        else:
            padded_vec_s1 = np.pad(np.array(datum[0]),
                                pad_width=((0,MAX_LEN - datum[2])),
                                mode="constant", constant_values=PAD_IDX)
        if datum[3]>MAX_LEN:
            padded_vec_s2 = np.array(datum[1])[:MAX_LEN]
        else:
            padded_vec_s2 = np.pad(np.array(datum[1]),
                                pad_width=((0,MAX_LEN - datum[3])),
                                mode="constant", constant_values=PAD_IDX)
        source_data.append(padded_vec_s1)
        target_data.append(padded_vec_s2)
        
    return [torch.from_numpy(np.array(source_data)), torch.from_numpy(np.array(target_data)),
            torch.from_numpy(np.array(source_len)), torch.from_numpy(np.array(target_len))]

In [30]:
from functools import partial
from torch.utils.data import DataLoader


In [38]:
temp_dataloader = DataLoader(temp_dataset, batch_size=32, collate_fn = partial(vocab_collate_func, MAX_LEN=10),
                                shuffle = True, num_workers=0)