In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torch.utils.tensorboard import SummaryWriter

from torchtext.vocab import vocab
from collections import Counter, OrderedDict
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader

In [2]:
small_data = pd.read_csv('/home/prassanna/M/DL/TEXT/seq2seq_learn/small_data.csv')
train, testtemp = train_test_split(small_data, test_size=0.3, shuffle=True)
val, test = train_test_split(testtemp, test_size=0.5, shuffle=True)

In [3]:
train

Unnamed: 0.1,Unnamed: 0,hindi,english
671,671,डिफोल्टों को शामिल करें,Include defaults
137,137,टिप्पणीः परिवर्तन सत्रांत (लोगआउट) के बाद ही प...,Note: Changes only take effect after logout.
445,445,निरपेक्ष स्थिति,Absolute position
711,711,अभिभावक में% s सूची शिशु सूची सेमेल नहीं खाती,% s index in parent does not match child index
50,50,घटक,Co _ mponent
...,...,...,...
412,412,घटना मानिटर,Event monitor
934,934,% (rolename) s सूची% (num) d पंक्ति और स्तंभ स...,% (rolename) s index% (num) d does not match r...
923,923,मूलभूत जीयूआई अनुप्रयोग पहुंचनीयता का परीक्षण ...,Tests fundamental GUI application accessibility
787,787,क्षैतिज विभाजन,Horizontal split


In [4]:
def tokenize_hi(text):
    return [tok.text for tok in spacy_hi.tokenizer(text)]

def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

In [5]:
def get_counter(tok_lang):
    counter_lang = Counter()
    for i in range(len(tok_lang)):
        counter_lang.update(tok_lang[i]) 
    return counter_lang

def build_vocab_with_spl(tok_lang):
    counter_lang = get_counter(tok_lang)
    sorted_by_freq_tuples_lang = sorted(counter_lang.items(), key=lambda x: x[1], reverse=True)
    #text as a key,index as a value
    ordered_dict_lang = OrderedDict(sorted_by_freq_tuples_lang)
    voc_lang = vocab(ordered_dict_lang, min_freq=1)
    spl_tok = ['<pad>', '<bos>', '<eos>', '<unk>']
    spl_tok_idx = [0,1,2,3]
    for (spl_tok,spl_tok_idx) in zip(spl_tok,spl_tok_idx):
        voc_lang.insert_token(spl_tok, spl_tok_idx)
    
    default_index = voc_lang['<unk>']
    voc_lang.set_default_index(default_index)
    return voc_lang

def get_vocabulary(data_lang,toknizer_lang):
    tok_lang = [(toknizer_lang(sent)) for sent in data_lang]
    voc_lang = build_vocab_with_spl(tok_lang)
    return voc_lang

In [6]:
def get_sent_index_array(sent, voc_lang):
    arr_lang = [voc_lang([word]) for word in sent]
    return arr_lang, len(arr_lang)


def get_seq_and_len_and_join_spl(data_lang, voc_lang, tokenize_lang):
    list_seq_tensor_lang = []
    len_seq_list_lang = []
    for_max = []
    for sent in data_lang:
        arr, length = get_sent_index_array(tokenize_lang(sent), voc_lang)
        list_seq_tensor_lang.append(torch.tensor(arr))
        len_seq_list_lang.append(torch.tensor(length))
        for_max.append(length)
        ## as we will add 2 tokens to evry sentence
        max_sequence = torch.LongTensor(for_max).max() + 2

    BOS_IDX = voc_lang['<bos>']
    EOS_IDX = voc_lang['<eos>']
    print(len(list_seq_tensor_lang))
    for i in range(len(list_seq_tensor_lang)):
        templist = []
        list_seq_tensor_lang[i] = torch.cat([torch.LongTensor([[BOS_IDX]]),
                                             list_seq_tensor_lang[i], torch.LongTensor([[EOS_IDX]])], dim=0)

    list_seq = list_seq_tensor_lang

    return list_seq, len_seq_list_lang, max_sequence

In [7]:
def pad_sequences(data_lang, voc_lang, tokenize_lang):
    list_seq, len_seq_list_lang, max_sequence = get_seq_and_len_and_join_spl(data_lang, voc_lang, tokenize_lang)
    flist_seq_tensors_lang = []

    for (idx, (seq, len_seq)) in enumerate(zip(list_seq, len_seq_list_lang)):
        seq_tensor = torch.zeros((1, max_sequence))
        seq_tensor[0, :len_seq + 2] = seq.squeeze(1)
        flist_seq_tensors_lang.append(seq_tensor)
    return flist_seq_tensors_lang


def make_variables_lang(data_lang, voc_lang, tokenize_lang):
    return pad_sequences(data_lang, voc_lang, tokenize_lang)


def createmydataset(data_lang1, data_lang2, voc_lang1, voc_lang2, tokenize_lang1, tokenize_lang2):
    which_dataset = []
    list_lang1_tensors = make_variables_lang(data_lang1, voc_lang1, tokenize_lang1)
    list_lang2_tensors = make_variables_lang(data_lang1, voc_lang2, tokenize_lang2)

    for (lang1_tensors, lang2_tensors) in zip(list_lang1_tensors, list_lang2_tensors):
        which_dataset.append((lang1_tensors.squeeze(), lang2_tensors.squeeze()))
        ## we have to squeeze so we will get 1 dimensional tensor
    return which_dataset

def get_loader(dataset,batch_size,num_workers):
    dl = DataLoader(ds, batch_size=batch_size,
                          shuffle=True, num_workers=num_workers, pin_memory=True)
    return dl


In [8]:
%%time

batch_size = 32
num_workers = 0

hindi = train.iloc[:,1].values
eng = train.iloc[:,2].values
hindi_tokenizer = tokenize_hi
eng_tokenizer = tokenize_eng

spacy_eng = spacy.load("en_core_web_sm")
spacy_hi = spacy.load("xx_sent_ud_sm") #multilang model


CPU times: user 2.09 s, sys: 308 ms, total: 2.39 s
Wall time: 2.88 s


In [9]:
%%time
voc_hi = get_vocabulary(hindi, hindi_tokenizer)
voc_eng = get_vocabulary(eng, eng_tokenizer)

CPU times: user 137 ms, sys: 134 ms, total: 272 ms
Wall time: 1.13 s


In [10]:
print(len(voc_eng),len(voc_hi))

396 432


In [13]:
%%time
ds = createmydataset(eng,hindi,voc_eng,voc_hi,eng_tokenizer,hindi_tokenizer)

print(len(ds))

print(ds[56][0].shape)
print(type(ds[76]))


700
700
700
torch.Size([30])
<class 'tuple'>
CPU times: user 419 ms, sys: 345 µs, total: 419 ms
Wall time: 275 ms


In [14]:
train_dl = get_loader(ds,batch_size, num_workers)

for x,y in train_dl:
    print(y.shape)
    print(x.shape)
    break

torch.Size([32, 30])
torch.Size([32, 30])
