In [1]:
import sklearn
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
#from torchtext.datasets import Multi30k
import torchtext
from torchtext.legacy.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter


In [2]:
# Load the Hindi-English parallel corpus (columns: hindi, english) into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook cannot be
# re-run on another machine until the location is made configurable.
df = pd.read_csv('/home/prassanna/M/DL/TEXT/seq2seq/dataset/hindi_english_parallel.csv')

In [3]:
df

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default
...,...,...
1561836,स्पष्टीकरण.–जहां इस उपधारा के अधीन हानि और लाभ...,स्पष्टीकरण.–जहां इस उपधारा के अधीन हानि और लाभ...
1561837,मैंने गौर किया है कि यह न केवल अपने महत्त्वपूर...,है। I note that this is a landmark meeting – n...
1561838,उन्होंने मेरे समक्ष जो प्रदर्शन किया उसमें से ...,है। In the presentations that they made before...
1561839,खाद्य और जल सुरक्षा; पर्यावरण की दृष्टि से वहन...,्त है। Issues such as food and water security;...


## Use the multi-language spaCy model

In [4]:
# Baseline tokenizer obtained from torchtext with `None` as the tokenizer name.
# The recorded output of the next cell shows plain whitespace splitting:
# punctuation stays attached to the preceding word (e.g. 'सुरक्षा;').
# NOTE(review): the name `tokenize_hi` is redefined further down by a spaCy-based function.
tokenize_hi = torchtext.data.utils.get_tokenizer(None, language='hi')

In [5]:
tokenize_hi(" खाद्य और जल सुरक्षा; पर्यावरण की दृष्टि से वहन.")

['खाद्य', 'और', 'जल', 'सुरक्षा;', 'पर्यावरण', 'की', 'दृष्टि', 'से', 'वहन.']

### The spaCy tokenizer works better here than the one built into torchtext (it splits punctuation off the words)

In [6]:
# Load the spaCy pipelines once; their tokenizers are used by the helper functions below.
spacy_hi = spacy.load("xx_sent_ud_sm")  # multi-language model, used here for Hindi text
spacy_en = spacy.load("en_core_web_sm")  # small English model

In [11]:
def tokenize_hi(text):
    """Tokenize Hindi text with the tokenizer of the `spacy_hi` pipeline.

    Args:
        text: the string to split.

    Returns:
        A list with the text of each token, in order.
    """
    hindi_doc = spacy_hi.tokenizer(text)
    return [token.text for token in hindi_doc]


def tokenize_en(text):
    """Tokenize English text with the tokenizer of the `spacy_en` pipeline.

    Args:
        text: the string to split.

    Returns:
        A list with the text of each token, in order.
    """
    english_doc = spacy_en.tokenizer(text)
    return [token.text for token in english_doc]

In [59]:
def vectorize_hi(text):
    """Return the `.vector` attribute of every token the `spacy_hi` tokenizer yields for `text`."""
    hindi_doc = spacy_hi.tokenizer(text)
    return [token.vector for token in hindi_doc]


hindi_snippet = 'द्य और जल सु'
tokenize_hi(hindi_snippet)  # result is discarded: a cell only displays its last expression
vectorize_hi(hindi_snippet)

[array([], dtype=float32),
 array([], dtype=float32),
 array([], dtype=float32),
 array([], dtype=float32)]

#### If you use the en_core_web_lg model, the pre-trained word vectors are used as features in the model, on the way to calculating the vectors that are stored in doc.tensor. The en_core_web_sm model doesn't have pre-trained vectors.

In [58]:
# Call the loaded `spacy_hi` pipeline on a short English string and look at the token vectors.
doc = spacy_hi('my name')
doc[0].vector  # not displayed: a cell only shows the value of its last expression
doc[1].vector  # displayed below; the recorded output is an empty float32 array

array([], dtype=float32)

### So for Hindi, the multi-language model does not provide pretrained word vectors.

### So we have to build the vectors ourselves

In [77]:
# Tokenize one Hindi sentence and build a frequency-ordered vocabulary from it.
sample = tokenize_hi('खाद्य और जल सुरक्षा; पर्यावरण की दृष्टि से जल जल पर्यावरण वहन अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें')
# Import the factory under an alias: the name `vocab` is bound to an object at the
# end of this cell, and must not overwrite the function that is called here.
from torchtext.vocab import vocab as build_vocab

from collections import Counter, OrderedDict
counter = Counter(sample)
# (token, count) pairs, most frequent first; tokens with equal counts keep first-seen order.
sorted_by_freq_tuples = counter.most_common()
ordered_dict = OrderedDict(sorted_by_freq_tuples)
# The most frequent token receives index 0.
vocab_freq = build_vocab(ordered_dict, min_freq=1)


from torchtext.vocab import Vocab
# NOTE(review): `Vocab` is handed the raw token list here. The outputs recorded in this
# notebook show len(vocab) == 21 (one entry per token occurrence, duplicates included)
# and vocab[0] returning a token string, so this object behaves like the token list and
# not like a de-duplicated vocabulary. Confirm that this is intended.
vocab = Vocab(sample)

In [78]:
print(len(vocab_freq), len(vocab))

18 21


In [90]:
print(vocab_freq['पर्यावरण'], vocab_freq['पहुंचनीयता'],vocab_freq['सुरक्षा'],vocab_freq['खाद्य'], vocab_freq['जल'] )

1 13 4 2 0


In [91]:
vocab_freq[';']

5

In [82]:
vocab[0][4]

'य'

In [84]:
sample[0]

'खाद्य'

In [93]:
sample[0][2]

'द'

In [21]:
# Collect whatever `vocab` stores at each integer position into a plain Python list.
all_tokens = [vocab[position] for position in range(len(vocab))]

In [22]:
all_tokens

['खाद्य',
 'और',
 'जल',
 'सुरक्षा',
 ';',
 'पर्यावरण',
 'की',
 'दृष्टि',
 'से',
 'वहन',
 'अपने',
 'अनुप्रयोग',
 'को',
 'पहुंचनीयता',
 'व्यायाम',
 'का',
 'लाभ',
 'दें']

In [127]:
def token_index(token):
    """Return the position of `token` in the global `all_tokens` list.

    Raises:
        ValueError: if `token` is not in `all_tokens` (propagated from `list.index`).
    """
    position = all_tokens.index(token)
    return position

In [128]:
token_index('व्यायाम')

14

In [140]:
token_index('खाद्य')

0

### With a vocabulary of a million words, each one-hot vector would have a million dimensions; instead we can use embeddings, which encode tokens compactly and capture relationships between words

### So instead of one-hot encoding, we index the tokens in the vocabulary, and an embedding vector will be created for each index

In [96]:
# Tokenize the sample sentence again and build the frequency-ordered vocabulary `voc`.
sample = tokenize_hi('खाद्य और जल सुरक्षा; पर्यावरण की दृष्टि से जल जल पर्यावरण वहन अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें')

from torchtext.vocab import vocab
from collections import Counter, OrderedDict

counter = Counter(sample)
# (token, count) pairs, most frequent first; tokens with equal counts keep first-seen order.
sorted_by_freq_tuples = counter.most_common()
ordered_dict = OrderedDict(sorted_by_freq_tuples)
# Every token is kept (min_freq=1); indices follow the order of `ordered_dict`.
voc = vocab(ordered_dict, min_freq=1)


In [109]:
# Print the vocabulary index of every token of the sample, one per line.
for token in sample:
    print(voc[token])

2
3
0
4
5
1
6
7
8
0
0
1
9
10
11
12
13
14
15
16
17


### Convert these vocabulary indices into tensors

In [111]:
# Demonstration: entries of the `voc` object cannot be overwritten with tensors,
# because the object does not support item assignment.
# The expected TypeError is caught and printed, so the failure stays visible while
# "Restart & Run All" can continue past this cell.
try:
    for token in sample:
        voc[token] = torch.tensor(voc[token])
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: 'Vocab' object does not support item assignment

### So convert the index to a tensor at the point where it is passed to the embedding layer

In [102]:
# Embedding table with one row per entry of `voc`; every row is a 30-dimensional vector.
embedding_size = 30
input_size = len(voc)
embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)

In [115]:
# Look up the embedding vector of the first token of the sample and inspect it.
first_token_index = torch.tensor(voc[sample[0]])
em = embedding(first_token_index)
# Show the type, the values and the shape, each on its own line.
for item in (type(em), em, em.shape):
    print(item)


<class 'torch.Tensor'>
tensor([ 0.2938, -1.6488, -0.0622, -0.8910,  0.4180,  0.1570, -2.9281, -0.7466,
        -0.6867, -0.2636, -0.7993,  1.9872, -0.2783, -0.2079,  2.1681, -1.0402,
         1.4449, -1.2026,  1.6475, -0.1707, -0.8598,  0.4058,  0.0160, -0.8408,
         0.2830, -0.2891, -0.1552, -0.0115,  0.6332, -1.2350],
       grad_fn=<EmbeddingBackward0>)
torch.Size([30])


### We can always use pretrained embeddings such as fastText, GloVe, etc.