In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
#from torchtext.datasets import Multi30k
import torchtext
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter

In [2]:
df = pd.read_csv('/home/prassanna/M/DL/TEXT/seq2seq/dataset/hindi_english_parallel.csv')

## use multi language spacy model

### spacy tokenizer is better than inbuilt in torchtext

In [3]:
spacy_hi = spacy.load("xx_sent_ud_sm") #multilang model
spacy_en = spacy.load("en_core_web_sm") 

In [4]:
def tokenize_hi(text):
    return [tok.text for tok in spacy_hi.tokenizer(text)]


def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

### with million vocabulary we will have one hot vectors of million size instaed we can use mebeddings for encoding and finding relationsjhip between words

### so without one hot, we will index the tokens in vocabulary and vector embedding we be created

In [5]:
sample = tokenize_hi('खाद्य और जल सुरक्षा; पर्यावरण की दृष्टि से जल जल पर्यावरण वहन अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें')

from torchtext.vocab import vocab
from collections import Counter, OrderedDict

counter = Counter(sample)
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True) #text as a key,index as a value
ordered_dict = OrderedDict(sorted_by_freq_tuples)
voc = vocab(ordered_dict, min_freq=1)


In [6]:
voc['सुरक्षा']

4

In [7]:
for i in sample:
    print(voc[i])

2
3
0
4
5
1
6
7
8
0
0
1
9
10
11
12
13
14
15
16
17


### convert these vocab index into tensors

### so do it inside embedding object

In [8]:
input_size = len(voc)
embedding_size = 30
embedding = nn.Embedding(input_size, embedding_size)

In [9]:
em = embedding(torch.tensor(voc[sample[5]]))
print(sample[5])
print(type(em))
print(em)
print(em.shape)


पर्यावरण
<class 'torch.Tensor'>
tensor([-0.8865,  0.6406,  1.2427, -0.2640,  0.1283, -0.6640,  0.0627,  0.1165,
         0.4013,  1.3648,  1.1717, -0.8545,  0.5106,  0.6137, -1.9844, -1.6924,
         0.2055,  0.6504, -0.5373,  0.6741, -1.4867,  1.3909,  0.5437, -0.3545,
        -0.0921,  2.6490,  0.7317, -1.0307,  1.7192,  0.7988],
       grad_fn=<EmbeddingBackward0>)
torch.Size([30])


### We can always use ptrtrained embeddings such as FASTTEXT, GLOVE etc

we will have 3d tensor -> (batch_size, sequence_size, embeding size)
we will use embedding_size = 300

so (batchsize, padded_sequence, 300)

In [5]:
df

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default
...,...,...
1561836,स्पष्टीकरण.–जहां इस उपधारा के अधीन हानि और लाभ...,स्पष्टीकरण.–जहां इस उपधारा के अधीन हानि और लाभ...
1561837,मैंने गौर किया है कि यह न केवल अपने महत्त्वपूर...,है। I note that this is a landmark meeting – n...
1561838,उन्होंने मेरे समक्ष जो प्रदर्शन किया उसमें से ...,है। In the presentations that they made before...
1561839,खाद्य और जल सुरक्षा; पर्यावरण की दृष्टि से वहन...,्त है। Issues such as food and water security;...


In [6]:
sent = df.iloc[76775, 0]

for rnn we will use sequence, batch, embed
so  (s,b,e)

In [7]:
dataset = df.iloc[1000:1005,]
dataset

Unnamed: 0,hindi,english
1000,अवधि को हाइलाइट रकें,Highlight duration
1001,पहुंचनीय आसंधि नोड को चुनते समय हाइलाइट बक्से ...,The duration of the highlight box when selecti...
1002,भराई के रंग को हाइलाइट करें,Highlight fill color
1003,हाइलाइट किया गया भराई का रंग और पारदर्शिता।,The color and opacity of the highlight fill.
1004,सीमांत (बोर्डर) के रंग को हाइलाइट करें,Highlight border color


#### 1 batch = 5 sentence

In [8]:
dataset.iloc[0,]

hindi      अवधि को हाइलाइट रकें
english      Highlight duration
Name: 1000, dtype: object

In [9]:
dataset.values[0][1]

'Highlight duration'

In [10]:
hindi = dataset.iloc[:,0].values
eng = dataset.iloc[:,1].values
hindi

array(['अवधि को हाइलाइट रकें',
       'पहुंचनीय आसंधि नोड को चुनते समय हाइलाइट बक्से की अवधि. ',
       'भराई के रंग को हाइलाइट करें',
       'हाइलाइट किया गया भराई का रंग और पारदर्शिता। ',
       'सीमांत (बोर्डर) के रंग को हाइलाइट करें'], dtype=object)

In [11]:
type(hindi)

numpy.ndarray

In [12]:
tok_hindi =[(tokenize_hi(sent)) for sent in hindi]
tok_hindi


[['अवधि', 'को', 'हाइलाइट', 'रकें'],
 ['पहुंचनीय',
  'आसंधि',
  'नोड',
  'को',
  'चुनते',
  'समय',
  'हाइलाइट',
  'बक्से',
  'की',
  'अवधि',
  '.'],
 ['भराई', 'के', 'रंग', 'को', 'हाइलाइट', 'करें'],
 ['हाइलाइट', 'किया', 'गया', 'भराई', 'का', 'रंग', 'और', 'पारदर्शिता', '।'],
 ['सीमांत', '(', 'बोर्डर', ')', 'के', 'रंग', 'को', 'हाइलाइट', 'करें']]

In [13]:
len(tok_hindi)

5

In [14]:
from torchtext.vocab import vocab
from collections import Counter, OrderedDict

def get_counter(data):
    counter = Counter()
    for i in range(len(data)):
        c = Counter(data[i]) 
        counter += c
    return counter


In [15]:
def get_counter2(data):
    counter = Counter()
    for i in range(len(data)):
        counter.update(data[i]) 
    return counter

In [16]:
counter2 = get_counter(tok_hindi)
print(counter2)

Counter({'हाइलाइट': 5, 'को': 4, 'रंग': 3, 'अवधि': 2, 'भराई': 2, 'के': 2, 'करें': 2, 'रकें': 1, 'पहुंचनीय': 1, 'आसंधि': 1, 'नोड': 1, 'चुनते': 1, 'समय': 1, 'बक्से': 1, 'की': 1, '.': 1, 'किया': 1, 'गया': 1, 'का': 1, 'और': 1, 'पारदर्शिता': 1, '।': 1, 'सीमांत': 1, '(': 1, 'बोर्डर': 1, ')': 1})


In [17]:
counter = get_counter(tok_hindi)
print(counter)

Counter({'हाइलाइट': 5, 'को': 4, 'रंग': 3, 'अवधि': 2, 'भराई': 2, 'के': 2, 'करें': 2, 'रकें': 1, 'पहुंचनीय': 1, 'आसंधि': 1, 'नोड': 1, 'चुनते': 1, 'समय': 1, 'बक्से': 1, 'की': 1, '.': 1, 'किया': 1, 'गया': 1, 'का': 1, 'और': 1, 'पारदर्शिता': 1, '।': 1, 'सीमांत': 1, '(': 1, 'बोर्डर': 1, ')': 1})


In [18]:
counter.items()

dict_items([('अवधि', 2), ('को', 4), ('हाइलाइट', 5), ('रकें', 1), ('पहुंचनीय', 1), ('आसंधि', 1), ('नोड', 1), ('चुनते', 1), ('समय', 1), ('बक्से', 1), ('की', 1), ('.', 1), ('भराई', 2), ('के', 2), ('रंग', 3), ('करें', 2), ('किया', 1), ('गया', 1), ('का', 1), ('और', 1), ('पारदर्शिता', 1), ('।', 1), ('सीमांत', 1), ('(', 1), ('बोर्डर', 1), (')', 1)])

In [19]:
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
print(sorted_by_freq_tuples)

[('हाइलाइट', 5), ('को', 4), ('रंग', 3), ('अवधि', 2), ('भराई', 2), ('के', 2), ('करें', 2), ('रकें', 1), ('पहुंचनीय', 1), ('आसंधि', 1), ('नोड', 1), ('चुनते', 1), ('समय', 1), ('बक्से', 1), ('की', 1), ('.', 1), ('किया', 1), ('गया', 1), ('का', 1), ('और', 1), ('पारदर्शिता', 1), ('।', 1), ('सीमांत', 1), ('(', 1), ('बोर्डर', 1), (')', 1)]


In [20]:
ordered_dict = OrderedDict(sorted_by_freq_tuples)
print(ordered_dict)

OrderedDict([('हाइलाइट', 5), ('को', 4), ('रंग', 3), ('अवधि', 2), ('भराई', 2), ('के', 2), ('करें', 2), ('रकें', 1), ('पहुंचनीय', 1), ('आसंधि', 1), ('नोड', 1), ('चुनते', 1), ('समय', 1), ('बक्से', 1), ('की', 1), ('.', 1), ('किया', 1), ('गया', 1), ('का', 1), ('और', 1), ('पारदर्शिता', 1), ('।', 1), ('सीमांत', 1), ('(', 1), ('बोर्डर', 1), (')', 1)])


In [21]:
v = vocab(ordered_dict, min_freq=1)
v['रंग']


2

In [22]:
def build_vocab(data):
    counter = get_counter(data)
    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True) 
    #text as a key,index as a value
    ordered_dict = OrderedDict(sorted_by_freq_tuples)
    voc = vocab(ordered_dict, min_freq=1)
    return voc

In [23]:
voc = build_vocab(tok_hindi)
print(len(voc))
print(voc['बक्से'])
print(type(voc['चुनते']))

26
13
<class 'int'>


# --------------------------------------------------------------------

In [24]:
def get_sent_indexarray(sent):
    arr = [voc([word]) for word in sent]
    return arr, len(arr)

In [25]:
def get_seq_len(data):
    tok_data = [get_sent_indexarray(tokenize_hi(sent)) for sent in data]
    seq_vec = [i[0] for i in tok_data]
    seq_len = torch.LongTensor([i[1] for i in tok_data])
    return seq_vec, seq_len

In [26]:
seq_vec, seq_len = get_seq_len(hindi)

In [27]:
len(seq_vec)

5

In [43]:
seq_vec

[[[3], [1], [0], [7]],
 [[8], [9], [10], [1], [11], [12], [0], [13], [14], [3], [15]],
 [[4], [5], [2], [1], [0], [6]],
 [[0], [16], [17], [4], [18], [2], [19], [20], [21]],
 [[22], [23], [24], [25], [5], [2], [1], [0], [6]]]

In [44]:
seq_len

tensor([ 4, 11,  6,  9,  9])

In [45]:
seq_vec[0]

[[3], [1], [0], [7]]

In [98]:
torch.LongTensor(seq_vec[0]).squeeze()

tensor([3, 1, 0, 7])

# ------------------------------------------------------------

In [53]:
seq_tensor = torch.zeros((len(seq_vec), seq_len.max())).long()
seq_tensor

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [73]:
int(seq_len[0].numpy())

4

In [104]:
seq_tensor[0,:4]

tensor([0, 0, 0, 0])

In [65]:
torch.tensor([[0,0,0,0]]).shape

torch.Size([1, 4])

In [30]:
def pad_sequences(seq_vec, seq_len):
    seq_tensor = torch.zeros((len(seq_vec), seq_len.max())).long()
    for idx, (seq_vec, seq_len) in enumerate(zip(seq_vec, seq_len)):
        seq_tensor[idx,:seq_len] = torch.LongTensor(seq_vec[idx])
    return seq_tensor

In [31]:
pad_sequences(seq_vec, seq_len)

tensor([[3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0],
        [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9],
        [2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0],
        [4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0],
        [5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0]])

In [102]:
def make_variables_hindi(data):
    seq_vec, seq_len = get_seq_len(data)
    return pad_sequences(seq_vec, seq_len)

In [103]:
make_variables_hindi(hindi)

tensor([[ 3,  1,  0,  7,  0,  0,  0,  0,  0,  0,  0],
        [ 8,  9, 10,  1, 11, 12,  0, 13, 14,  3, 15],
        [ 4,  5,  2,  1,  0,  6,  0,  0,  0,  0,  0],
        [ 0, 16, 17,  4, 18,  2, 19, 20, 21,  0,  0],
        [22, 23, 24, 25,  5,  2,  1,  0,  6,  0,  0]])

In [106]:
variable = make_variables_hindi(hindi)
variable.shape

torch.Size([5, 11])

In [108]:
variable.dtype

torch.int64

## 5 = batch size
## 11 = sequence size

### after embedding we will get (5,11,300)

In [None]:
def pad_sequences(vectorized_seqs, seq_lengths, countries):
    seq_tensor = torch.zeros((len(vectorized_seqs), seq_lengths.max())).long()
    for idx, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lengths)):
        seq_tensor[idx, :seq_len] = torch.LongTensor(seq)

    # Sort tensors by their length
    seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
    seq_tensor = seq_tensor[perm_idx]

    # Also sort the target (countries) in the same order
    target = countries2tensor(countries)
    if len(countries):
        target = target[perm_idx]

    # Return variables
    # DataParallel requires everything to be a Variable
    return create_variable(seq_tensor), \
        create_variable(seq_lengths), \
        create_variable(target)