In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torch.utils.tensorboard import SummaryWriter

In [2]:
df = pd.read_csv('/home/prassanna/M/DL/TEXT/seq2seq/dataset/hindi_english_parallel.csv')

In [3]:
small_data = df.iloc[:1000,:]

In [125]:
small_data.to_csv('small_data.csv')

In [126]:
small = pd.read_csv('/home/prassanna/M/DL/TEXT/seq2seq/small_data.csv')
small

Unnamed: 0.1,Unnamed: 0,hindi,english
0,0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default
...,...,...,...
995,995,एक्सेर्साइसर,Accerciser
996,996,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
997,997,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
998,998,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [None]:
# train_test_split(X, y, test_size=0.3, shuffle=True)
# train = small_data.sample(frac=0.7)

In [19]:
train, testtemp = train_test_split(small_data, test_size=0.3, shuffle=True)

In [20]:
val , test = train_test_split(testtemp, test_size=0.5, shuffle=True)

#### now we have train,val, test

In [24]:
spacy_hi = spacy.load("xx_sent_ud_sm") #multilang model
spacy_en = spacy.load("en_core_web_sm") 

def tokenize_hi(text):
    return [tok.text for tok in spacy_hi.tokenizer(text)]


def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [26]:
dataset = train

In [32]:
from torchtext.vocab import vocab
from collections import Counter, OrderedDict

hindi = dataset.iloc[:,0].values
eng = dataset.iloc[:,1].values

tok_hindi =[(tokenize_hi(sent)) for sent in hindi]
tok_eng =[(tokenize_en(sent)) for sent in eng]


def get_counter(tok_data):
    counter = Counter()
    for i in range(len(tok_data)):
        counter.update(tok_data[i]) 
    return counter

def build_vocab(tok_data):
    counter = get_counter(tok_data)
    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True) 
    #text as a key,index as a value
    ordered_dict = OrderedDict(sorted_by_freq_tuples)
    voc = vocab(ordered_dict, min_freq=1)
    return voc

In [54]:
def get_sent_indexarray(sent, voc_lang):
    arr = [voc_lang([word]) for word in sent]
    return arr, len(arr)

def get_seq_len(data):
    tok_data_l = [get_sent_indexarray(tokenize_hi(sent), voc_lang) for sent in data]
    seq_vec = [i[0] for i in tok_data_l]
    seq_len = torch.LongTensor([i[1] for i in tok_data_l])
    return seq_vec, seq_len

In [55]:
def pad_sequences(seq_vec, seq_len):
    seq_tensor = torch.zeros((len(seq_vec), seq_len.max())).long()
    for idx, (seq_vec, seq_len) in enumerate(zip(seq_vec, seq_len)):
        seq_tensor[idx,:seq_len] = torch.LongTensor(seq_vec).squeeze()
    return seq_tensor

def make_variables_lang(data):
    seq_vec, seq_len = get_seq_len(data)
    return pad_sequences(seq_vec, seq_len)

In [39]:
voc = build_vocab(tok_hindi)
voc.get_stoi()

{'मूल': 423,
 'प्लगिन': 421,
 'विभाजन': 420,
 'मुख्य': 418,
 'लेकिन': 417,
 'H': 416,
 'छिपाएँ': 414,
 'बिना': 413,
 'बच्चों': 412,
 'संयोजन': 411,
 'hotkey': 410,
 'ऊँचाई': 409,
 'Start': 407,
 'विभाजित': 405,
 '©': 403,
 'तेज': 401,
 'प्रारंभEnd': 399,
 'संस्करण': 396,
 'काम': 394,
 'गणना': 388,
 'बाल': 387,
 'कोशिकाlt;/bgt': 386,
 'क्षैतिज': 419,
 'दें': 385,
 'व्यायाम': 383,
 'त्रुटियां': 381,
 'निगम': 380,
 'आईबीएम': 379,
 '2006,2007': 378,
 'ओफसेट': 373,
 'अंतः': 372,
 'सारांशः': 371,
 'पूरा': 370,
 'संशोधनीय': 406,
 '>': 369,
 '<': 367,
 'विशिष्ट': 366,
 'खो': 363,
 'वर्तमान': 361,
 'कर': 360,
 'निचले': 357,
 'खाती': 356,
 'A': 354,
 'विचरक': 353,
 'अनुप्रयोगः': 346,
 'साथ%': 345,
 'उसके': 344,
 'मिटाने': 342,
 'करनेयोग्य': 340,
 'pluginviews': 338,
 'खाली': 334,
 'इसे': 331,
 'आप': 330,
 'क्या': 329,
 'होगा': 328,
 'आपको': 327,
 'लाभ': 384,
 'समाधान': 326,
 'पाया': 324,
 'दिखाएं': 335,
 'देख': 323,
 'मौजूद': 322,
 'तो': 319,
 'वह': 318,
 'सकती': 317,
 'bgt': 315,
 'lt': 314,
 '

In [44]:
voc.insert_token( '<bos>',1 ) #'<bos>', '<eos>','<unk>',<pad>

In [45]:
len(voc)

427

In [46]:
voc.get_stoi()

{'मूल': 425,
 'प्लगिन': 423,
 'विभाजन': 422,
 'मुख्य': 420,
 'लेकिन': 419,
 'H': 418,
 'छिपाएँ': 416,
 'बिना': 415,
 'बच्चों': 414,
 'संयोजन': 413,
 'hotkey': 412,
 'ऊँचाई': 411,
 'Start': 409,
 'विभाजित': 407,
 '©': 405,
 'तेज': 403,
 'प्रारंभEnd': 401,
 'संस्करण': 398,
 'काम': 396,
 'गणना': 390,
 'बाल': 389,
 'कोशिकाlt;/bgt': 388,
 'क्षैतिज': 421,
 'दें': 387,
 'व्यायाम': 385,
 'त्रुटियां': 383,
 'निगम': 382,
 'आईबीएम': 381,
 '2006,2007': 380,
 'ओफसेट': 375,
 'अंतः': 374,
 'सारांशः': 373,
 'पूरा': 372,
 'संशोधनीय': 408,
 '>': 371,
 '<': 369,
 'विशिष्ट': 368,
 'खो': 365,
 'वर्तमान': 363,
 'कर': 362,
 'निचले': 359,
 'खाती': 358,
 'A': 356,
 'विचरक': 355,
 'अनुप्रयोगः': 348,
 'साथ%': 347,
 'उसके': 346,
 'मिटाने': 344,
 'करनेयोग्य': 342,
 'pluginviews': 340,
 'खाली': 336,
 'इसे': 333,
 'आप': 332,
 'क्या': 331,
 'होगा': 330,
 'आपको': 329,
 'लाभ': 386,
 'समाधान': 328,
 'पाया': 326,
 'दिखाएं': 337,
 'देख': 325,
 'मौजूद': 324,
 'तो': 321,
 'वह': 320,
 'सकती': 319,
 'bgt': 317,
 'lt': 316,
 '

In [58]:
s,l = get_seq_len(hindi)

In [60]:
s[0]

[[78], [19], [78]]

In [91]:
from torchtext.vocab import Vocab
def full_vocab(tok_data):
    counter = get_counter(tok_data)
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [92]:
vocabulary = full_vocab(tok_hindi)

TypeError: __init__() got an unexpected keyword argument 'specials'

In [70]:
len(vocabulary)

116

In [71]:
vocabulary.get_stoi()

{'।': 115,
 'V': 111,
 'T': 110,
 'P': 108,
 'N': 107,
 'M': 106,
 'F': 103,
 'A': 101,
 '?': 100,
 '>': 99,
 '<': 98,
 '7': 97,
 '-': 94,
 ')': 93,
 '&': 91,
 'ृ': 90,
 'k': 87,
 'f': 85,
 'E': 83,
 'H': 104,
 '@': 82,
 '1': 95,
 ',': 81,
 'ऊ': 78,
 '/': 72,
 'ओ': 70,
 'v': 69,
 'p': 68,
 'w': 66,
 'c': 65,
 'x': 75,
 '.': 63,
 '%': 60,
 'श': 26,
 'R': 109,
 'ज': 23,
 'ञ': 114,
 'I': 105,
 'च': 21,
 'म': 20,
 'त': 14,
 'प': 13,
 'स': 9,
 'ु': 24,
 'ँ': 77,
 'फ': 52,
 'द': 28,
 'इ': 36,
 'b': 58,
 'र': 6,
 'S': 84,
 ';': 74,
 'ट': 22,
 'क': 8,
 '<bos>': 2,
 '6': 96,
 'ठ': 89,
 'ै': 59,
 'ो': 16,
 '<eos>': 3,
 '(': 92,
 '0': 64,
 '्': 5,
 '_': 112,
 'औ': 88,
 'ः': 51,
 'य': 15,
 'ा': 4,
 'ौ': 80,
 '़': 79,
 '2': 73,
 'ध': 34,
 '<unk>': 0,
 'C': 102,
 'घ': 67,
 'ि': 7,
 'छ': 71,
 'न': 10,
 'व': 19,
 'ष': 40,
 'े': 12,
 'ए': 27,
 'ं': 11,
 'ह': 25,
 'थ': 55,
 '©': 113,
 'ग': 29,
 'a': 32,
 'e': 41,
 'ब': 30,
 'अ': 31,
 'ल': 17,
 'आ': 33,
 'r': 53,
 't': 35,
 '<pad>': 1,
 'ई': 37,
 'ड': 38

In [84]:
vocabulary['<eos>']

3

In [81]:
voc.insert_token('<eos>','<unk>',2,3)

TypeError: insert_token() takes 3 positional arguments but 5 were given

In [94]:
voc.insert_token('<eos>', 2)

### we can add bos eos unk  etc by concatenation inside seq-len function

In [95]:
BOS_IDX = voc['<bos>']
EOS_IDX = voc['<eos>']

In [97]:
s[0]

[[78], [19], [78]]

In [100]:
new_s0 = torch.cat([torch.tensor([BOS_IDX]), torch.tensor(s[0]), torch.tensor([EOS_IDX])], dim=0)

RuntimeError: Tensors must have same number of dimensions: got 1 and 2

In [101]:
new_s0 = torch.cat([torch.tensor([[BOS_IDX]]), torch.tensor(s[0]), torch.tensor([[EOS_IDX]])], dim=0)

In [102]:
new_s0

tensor([[ 1],
        [78],
        [19],
        [78],
        [ 2]])

In [103]:
new_s0.squeeze() #while padding required

tensor([ 1, 78, 19, 78,  2])

In [107]:
def mydataset(eng, hindi):
    train_dataset = []
    list_eng_tensors = make_variables_lang(eng)
    list_hi_tensors = make_variables_lang(hindi)
    for (eng_tensors, hi_tensors) in zip(list_eng_tensors, list_hi_tensors):
        data.append((eng_tensor, hi_tensor))
    return train_dataset

 ### but here out my variable function creates whole tensor of entire dat. we want tuples of single input tensors of eng hin (see nect notebook)

In [None]:
# def mycollate(data_batch):
#     eng_batch, hi_batch = [], []
#     for (de_item, en_item) in data_batch:
        
#     return de_batch, en_batch

In [111]:
from torch.utils.data import DataLoader
train_dl = DataLoader(train_dataset, batch_size, shuffle=True, num_workers =8, pin_memory=True)

NameError: name 'train_dataset' is not defined

In [112]:
ds = mydataset(eng, hindi)

RuntimeError: Token Preferences not found and default index is not set

#### we have yet to make eng vocab

In [116]:
arrrr = get_sent_indexarray(tokenize_hi('हाइलाइट किया गया भराई का रंग और पारदर्शिता।'))

In [117]:
arrrr

([[10], [36], [37], [43], [11], [24], [23], [178], [27]], 9)

In [119]:
z = make_variables_lang(hindi)

In [114]:
voc['होगा']

328