In [1]:
import torch
from torch import nn
import numpy as np
import pandas as pd
import torch.nn.functional as F
import copy
import math

In [38]:
if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"

device = torch.device(dev)

In [215]:
class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
    
    def forward(self, x):
        return self.embed(x)

In [216]:
embed = Embedder(10014, 512)
print(embed(torch.from_numpy(np.array([[1,2,3]]))))

tensor([[[ 0.9016, -1.3919,  0.5617,  ...,  0.4047, -0.1180,  0.1834],
         [-0.9618, -0.7749, -0.2524,  ..., -0.7699, -0.3608,  0.1308],
         [-0.0676,  2.2262,  0.7730,  ..., -1.1165,  2.6093, -0.0454]]],
       device='cuda:0', grad_fn=<EmbeddingBackward0>)


In [217]:
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)
    
    def attention(self, q, k, v, d_k, mask = None, dropout = None):
        scores = torch.matmul(q, k.transpose(-2, -1))/math.sqrt(d_k)
        if mask is not None:
            mask = mask.unsqueeze(1)
#             print("mask.shape : {}".format(mask.shape))
#             print("scores.shape : {}".format(scores.shape))
            scores = scores.masked_fill(mask == 0, -1e9)
        scores = F.softmax(scores, dim = -1)
        if dropout is not None:
            scores = dropout(scores)
        output = torch.matmul(scores, v)
        return output
    
    def forward(self, q, k, v, mask = None):
        bs = q.size(0)
#         print("bs : {}".format(bs))
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
#         print("q.shape : {} || k.shape : {} || v.shape : {}".format(q.shape, k.shape, v.shape))
        q = q.transpose(1,2)
        k = k.transpose(1,2)
        v = v.transpose(1,2)
#         print("q.shape : {} || k.shape : {} || v.shape : {}".format(q.shape, k.shape, v.shape))
        scores = self.attention(q, k, v, self.d_k, mask, self.dropout)
#         print("scores.shape : {}".format(scores.shape))
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        output = self.out(concat)
        return output

In [218]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__() 
        # We set d_ff as a default to 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)
    
    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

In [219]:
class Norm(nn.Module):
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
        self.size = d_model
        # create two learnable parameters to calibrate normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps
    
    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True))/(x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

In [220]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm1 = Norm(d_model)
        self.norm2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    
    def forward(self, x, mask ):
#         print(x.shape)
        x1 = self.norm1(x)
#         print(x1.shape)
        temp = self.attn(x1, x1, x1, mask)
#         print(temp.shape)
        x = x + self.dropout1(temp)
        x2 = self.norm2(x)
        x = x + self.dropout2(self.ff(x2))
        return x

In [221]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm1 = Norm(d_model)
        self.norm2 = Norm(d_model)
        self.norm3 = Norm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.attn1 = MultiHeadAttention(heads, d_model)
        self.attn2 = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
    
    def forward(self, x, src_outputs, src_mask, tgt_mask):
        x1 = self.norm1(x)
        x = x + self.dropout1(self.attn1(x1, x1, x1, tgt_mask))
        x2 = self.norm2(x)
        x = x + self.dropout2(self.attn2(src_outputs, src_outputs, x2, src_mask))
        x3 = self.norm3(x)
        x = x + self.dropout3(self.ff(x3))
        return x

In [222]:
def getClones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)]).cuda()

In [223]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.layers = getClones(EncoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
    
    def forward(self, src, mask):
        x = self.embed(src)
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)

In [224]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.layers = getClones(DecoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
    
    def forward(self, tgt, src_outputs, src_mask, tgt_mask):
        x = self.embed(tgt)
        for i in range(self.N):
            x = self.layers[i](x, src_outputs, src_mask, tgt_mask)
        return self.norm(x)

In [225]:
class architecture(nn.Module):
    def __init__(self, src_vocab, tgt_vocab, d_model, Ne1, Ne2, Nd1, heads):
        super().__init__()
        self.encoder1 = Encoder(src_vocab, d_model, Ne1, heads)
        self.encoder2 = Encoder(src_vocab, d_model, Ne2, heads)
        self.decoder1 = Decoder(tgt_vocab, d_model, Nd1, heads)
        self.out = nn.Linear(d_model, tgt_vocab)
    
    def forward(self, src1, src2, tgt, src_mask1, src_mask2, tgt_mask, src_mask3, index_list):
        src_outputs1 = self.encoder1(src1, src_mask1)
#         print("source 1 completed")
        src_outputs2 = self.encoder2(src2, src_mask2)
#         print("source 1 completed")
        src_ouputs = []
        for i, index in enumerate(index_list):
            if(index == -1):
                src_ouputs.append(torch.cat([src_outputs1[i], src_outputs2[i]]))
            else:
                src_ouputs.append(torch.cat([src_outputs1[i, :index], src_outputs2[i], src_outputs1[i, index:]]))
            src_ouputs[i] = src_ouputs[i][:src_outputs1.shape[1]]
        src_ouputs = torch.stack(src_ouputs)
#         print("sources stacking completed")
        tgt_output = self.decoder1(tgt, src_ouputs, src_mask3, tgt_mask)
#         print("target side completed")
        output = self.out(tgt_output)
        return output

In [226]:
src_data = open("train.src").read().strip().split('\n')
tgt_data = open("train.tgt").read().strip().split('\n')
joint_vocab_freq = {}
for data in src_data:
    tokens = data.split(" ")
    for token in tokens:
        if token in joint_vocab_freq.keys():
            joint_vocab_freq[token] = joint_vocab_freq[token] + 1
        else:
            joint_vocab_freq[token] = 1
for data in tgt_data:
    tokens = data.split(" ")
    for token in tokens:
        if token in joint_vocab_freq.keys():
            joint_vocab_freq[token] = joint_vocab_freq[token] + 1
        else:
            joint_vocab_freq[token] = 1
sorted_joint_vocab_freq = sorted(joint_vocab_freq.items(), key=lambda x:x[1], reverse=True)
sorted_joint_vocab_freq = dict(sorted_joint_vocab_freq)
joint_vocab = list(sorted_joint_vocab_freq.keys())
joint_vocab.insert(0, "<END>")
joint_vocab.insert(0, "<START>")
joint_vocab.insert(0, "<PAD>")
joint_vocab.insert(0, "<UNK>")
#print(sorted_joint_vocab_freq)

In [227]:
print(joint_vocab)

['<UNK>', '<PAD>', '<START>', '<END>', '+', 'iti', 'api', 'eva', 'ca', 'a@@', 'na', 'ya@@', 'A@@', 'ma@@', 'M_@@', 'yA@@', 'vi@@', 'ka@@', 'mA@@', 'ra@@', 'pra@@', 'H', 'va@@', 'r@@', 'M', 'ya', 'yam', 'tu', 'ta@@', 't@@', 'e', 'vA@@', 'e@@', 'sva@@', 'sa@@', 's@@', 'yaH', 'm', 'kA@@', 'tat@@', 'nA@@', ':', 'asya', 'rA@@', 'na@@', 'an@@', 'saH', 'ni@@', 'o_@@', 'da@@', 'pa@@', 'la@@', 'yat', 'd@@', 'Sca', 'a', 'S@@', 'aH', 'tat', 'saM@@', 'tad@@', 'yA', 'Sa@@', 'pA@@', 'A', 'as@@', 'kaH', 'am', 'Ba@@', 'vA', 'ti@@', 'm@@', 'kam', 'l@@', 'ga@@', 'I@@', 'mapi', 'meva', 'tA@@', '_@@', 'tam', 'sya', 'n@@', 'ena', 'sA@@', 'k@@', 'it@@', 'ram', 'upa@@', 'ja@@', 'anu@@', 'su@@', 'ana@@', 't', 'yat@@', 'u@@', 'dA@@', 'n', '', 'taH', 'ita@@', 'BAva@@', 'tatra', 'h@@', 'iva', 'miti', 'j@@', 'ye@@', 'yAm', 'i@@', 'D@@', 'p@@', 'mat@@', 'cA@@', 'R@@', 'man@@', 'ityarTaH', 'yAH', 'sa_@@', 'yaM', 'raH', 'Ra@@', 'An@@', 'Bi@@', 'sam@@', 'z@@', 'pari@@', 'lA@@', 'Adi@@', 'BA@@', 'di@@', 'o@@', 'asti',

In [228]:
from sklearn.model_selection import train_test_split

In [229]:
embed = Embedder(len(joint_vocab), 512)
joint_vocab_index_dict = {}
for index, word in enumerate(joint_vocab):
    joint_vocab_index_dict[word] = index

# all_data = zip(list_a, list_b)
non_test_src_data, test_src_data, non_test_tgt_data, test_tgt_data = train_test_split(src_data, tgt_data, test_size=0.2, shuffle=True)
train_src_data, valid_src_data, train_tgt_data, valid_tgt_data = train_test_split(non_test_src_data, non_test_tgt_data, test_size=0.2, shuffle=True)

In [230]:
print(joint_vocab_index_dict)

{'<UNK>': 0, '<PAD>': 1, '<START>': 2, '<END>': 3, '+': 4, 'iti': 5, 'api': 6, 'eva': 7, 'ca': 8, 'a@@': 9, 'na': 10, 'ya@@': 11, 'A@@': 12, 'ma@@': 13, 'M_@@': 14, 'yA@@': 15, 'vi@@': 16, 'ka@@': 17, 'mA@@': 18, 'ra@@': 19, 'pra@@': 20, 'H': 21, 'va@@': 22, 'r@@': 23, 'M': 24, 'ya': 25, 'yam': 26, 'tu': 27, 'ta@@': 28, 't@@': 29, 'e': 30, 'vA@@': 31, 'e@@': 32, 'sva@@': 33, 'sa@@': 34, 's@@': 35, 'yaH': 36, 'm': 37, 'kA@@': 38, 'tat@@': 39, 'nA@@': 40, ':': 41, 'asya': 42, 'rA@@': 43, 'na@@': 44, 'an@@': 45, 'saH': 46, 'ni@@': 47, 'o_@@': 48, 'da@@': 49, 'pa@@': 50, 'la@@': 51, 'yat': 52, 'd@@': 53, 'Sca': 54, 'a': 55, 'S@@': 56, 'aH': 57, 'tat': 58, 'saM@@': 59, 'tad@@': 60, 'yA': 61, 'Sa@@': 62, 'pA@@': 63, 'A': 64, 'as@@': 65, 'kaH': 66, 'am': 67, 'Ba@@': 68, 'vA': 69, 'ti@@': 70, 'm@@': 71, 'kam': 72, 'l@@': 73, 'ga@@': 74, 'I@@': 75, 'mapi': 76, 'meva': 77, 'tA@@': 78, '_@@': 79, 'tam': 80, 'sya': 81, 'n@@': 82, 'ena': 83, 'sA@@': 84, 'k@@': 85, 'it@@': 86, 'ram': 87, 'upa@@': 88

In [231]:
def transformWords(name, joint_vocab_index_dict, src_data, tgt_data):
    with open("transformed/" + name + "_transformed_src.txt", "w") as fwritesrc, open("transformed/" + name+"_transformed_tgt.txt", "w") as fwritetgt:
        updated_src_data = []
        updated_tgt_data = []
        tgt_writing_content = ""
        for index1, line in enumerate(tgt_data):
            updated_tgt_data.append("<START>" + " " + line + " " + "<END>")
            updated_tgt_data[index1] = updated_tgt_data[index1].split(" ")
            tgt_writing_content = str(updated_tgt_data[index1])
            for index2, element in enumerate(updated_tgt_data[index1]):
                if(updated_tgt_data[index1][index2] in joint_vocab_index_dict.keys()):
                    updated_tgt_data[index1][index2] = joint_vocab_index_dict[updated_tgt_data[index1][index2]]
                else:
                    updated_tgt_data[index1][index2] = joint_vocab_index_dict["<UNK>"]
            tgt_writing_content = tgt_writing_content + "||" + str(updated_tgt_data[index1]) + "\n"
            fwritetgt.write(tgt_writing_content)
#             updated_tgt_data[index1] = torch.Tensor(updated_tgt_data[index1])
        src_writing_content = ""
        for index1, line in enumerate(src_data):
            updated_src_data.append(line.split(" "))
            src_writing_content = str(updated_src_data[index1])
            for index2, element in enumerate(updated_src_data[index1]):
                if(updated_src_data[index1][index2] in joint_vocab_index_dict.keys()):
                    updated_src_data[index1][index2] = joint_vocab_index_dict[updated_src_data[index1][index2]]
                else:
                    updated_src_data[index1][index2] = joint_vocab_index_dict["<UNK>"]
            src_writing_content = src_writing_content + "||" + str(updated_src_data[index1]) + "\n"
            fwritesrc.write(src_writing_content)
#             updated_src_data[index1] = torch.Tensor(updated_src_data[index1])
        return updated_src_data, updated_tgt_data

In [232]:
train_src_data, train_tgt_data = transformWords("train", joint_vocab_index_dict, train_src_data, train_tgt_data)
test_src_data, test_tgt_data = transformWords("test", joint_vocab_index_dict, test_src_data, test_tgt_data)
valid_src_data, valid_tgt_data = transformWords("valid", joint_vocab_index_dict, valid_src_data, valid_tgt_data)

In [233]:
train_df = pd.DataFrame({'src': train_src_data, 'tgt': train_tgt_data})
test_df = pd.DataFrame({'src': test_src_data, 'tgt': test_tgt_data})
valid_df = pd.DataFrame({'src': valid_src_data, 'tgt': valid_tgt_data})

In [234]:
train_df.to_csv("train_data.csv", index=False)
test_df.to_csv("test_data.csv", index=False)
valid_df.to_csv("valid_data.csv", index=False)

In [235]:
from torch.utils.data import Dataset, DataLoader
import ast
from torch.nn.utils.rnn import pad_sequence
import time 
from torch.autograd import Variable

In [236]:
epochs = 30
lr = 0.00005
betas = (0.9, 0.98)
d_model = 512
Ne1 = 2
Ne2 = 2
Nd1 = 2
heads = 1
checkpoint = 2
embed = Embedder(len(joint_vocab), d_model)

In [237]:
class customDataset(Dataset):
    def __init__(self, csv_file):
        self.content = pd.read_csv(csv_file)
#         print(self.content['src'])
        self.content[['src1', 'src2']] = self.content['src'].str.split(', 4, ', expand=True)
        self.content['src1'] = self.content['src1'] + "]"
        self.content['src2'] = "[" + self.content['src2']
        self.source_texts1 = self.content['src1']
        self.source_texts2 = self.content['src2']
#         print(self.content['src1'])
#         print(self.content['src2'])
        self.target_texts = self.content['tgt']

    def __len__(self):
        return len(self.content)

    def __getitem__(self, idx):
#         print(type(self.content))
        src_data1 = ast.literal_eval(self.source_texts1[idx])
        src_data2 = ast.literal_eval(self.source_texts2[idx])
        tgt_data = ast.literal_eval(self.target_texts[idx])
#         print(src_data, tgt_data)
        return torch.tensor(src_data1).to(torch.int64), torch.tensor(src_data2).to(torch.int64), torch.tensor(tgt_data).to(torch.int64)

In [238]:
class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
    
    def __call__(self, batch):
        batch_size = len(batch)
        source1 = [item[0] for item in batch]
#         source1 = pad_sequence(source1, batch_first=False, padding_value = self.pad_idx)
#         source1 = embed(source1)
        source2 = [item[1] for item in batch]
#         source2 = pad_sequence(source2, batch_first=False, padding_value = self.pad_idx)
#         source2 = embed(source2)    
        target = [item[2] for item in batch] 
#         target = pad_sequence(target, batch_first=False, padding_value = self.pad_idx)
        final = source1 + source2 + target
        #pad them using pad_sequence method from pytorch. 
        final = pad_sequence(final, batch_first=False, padding_value = self.pad_idx)
#         print(final.shape)
        source1 = final[:, :batch_size]
        source2 = final[:, batch_size:(2*batch_size)]
        target = final[:, (2*batch_size):(3*batch_size)]
#         target = embed(target)
        return source1, source2, target

In [239]:
class MyCollate1:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
    
    def __call__(self, batch):
        source1 = [item[0] for item in batch]
        source1 = pad_sequence(source1, batch_first=False, padding_value = self.pad_idx)
#         source1 = embed(source1)
        source2 = [item[1] for item in batch]
        source2 = pad_sequence(source2, batch_first=False, padding_value = self.pad_idx)
#         source2 = embed(source2)    
        target = [item[2] for item in batch] 
        #pad them using pad_sequence method from pytorch. 
        target = pad_sequence(target, batch_first=False, padding_value = self.pad_idx)
#         target = embed(target)
        return source1, source2, target

In [240]:
def get_train_loader(dataset, joint_vocab_index_dict, batch_size, num_workers=0, shuffle=True, pin_memory=True): #increase num_workers according to CPU
    pad_idx = joint_vocab_index_dict["<PAD>"]
    loader = DataLoader(dataset, batch_size = batch_size, num_workers = num_workers,
                        shuffle=shuffle, pin_memory=pin_memory, collate_fn = MyCollate(pad_idx=pad_idx)) #MyCollate class runs __call__ method by default
    return loader

def get_valid_loader(dataset, joint_vocab_index_dict, batch_size, num_workers=0, shuffle=True, pin_memory=True):
    pad_idx = joint_vocab_index_dict["<PAD>"]
    loader = DataLoader(dataset, batch_size = batch_size, num_workers = num_workers,
                        shuffle=shuffle, pin_memory=pin_memory, collate_fn = MyCollate(pad_idx=pad_idx))
    return loader

In [241]:
# def get_train_loader1(dataset, joint_vocab_index_dict, batch_size, num_workers=0, shuffle=True, pin_memory=True): #increase num_workers according to CPU
#     pad_idx = joint_vocab_index_dict["<PAD>"]
#     loader = DataLoader(dataset, batch_size = batch_size, num_workers = num_workers,
#                         shuffle=shuffle, pin_memory=pin_memory, collate_fn = MyCollate1(pad_idx=pad_idx)) #MyCollate class runs __call__ method by default
#     return loader

In [242]:
train_dataset = customDataset("train_data.csv")
test_dataset = customDataset("test_data.csv")
valid_dataset = customDataset("valid_data.csv")

In [243]:
train_tuple = zip(train_src_data, train_tgt_data)
test_tuple = zip(test_src_data, test_tgt_data)
valid_tuple = zip(valid_src_data, valid_tgt_data)

In [244]:
train_dataloader = get_train_loader(train_dataset, joint_vocab_index_dict, batch_size = 256, shuffle = True, pin_memory=False)
valid_dataloader = get_valid_loader(valid_dataset, joint_vocab_index_dict, batch_size = 256, shuffle = True, pin_memory=False)

In [245]:
# train_dataloader1 = get_train_loader1(train_dataset, joint_vocab_index_dict, batch_size = 256, shuffle = True)

In [246]:
# for i, (x1, x2, y) in enumerate(train_dataloader1):
#     print("{} || {} || {}".format(x1.shape, x2.shape, y.shape))

In [247]:
for i, (x1, x2, y) in enumerate(train_dataloader):
    print("{} || {} || {}".format(x1.shape, x2.shape, y.shape))

torch.Size([18, 256]) || torch.Size([18, 256]) || torch.Size([18, 256])
torch.Size([20, 256]) || torch.Size([20, 256]) || torch.Size([20, 256])
torch.Size([16, 256]) || torch.Size([16, 256]) || torch.Size([16, 256])
torch.Size([31, 256]) || torch.Size([31, 256]) || torch.Size([31, 256])
torch.Size([16, 256]) || torch.Size([16, 256]) || torch.Size([16, 256])
torch.Size([15, 256]) || torch.Size([15, 256]) || torch.Size([15, 256])
torch.Size([14, 256]) || torch.Size([14, 256]) || torch.Size([14, 256])
torch.Size([16, 256]) || torch.Size([16, 256]) || torch.Size([16, 256])
torch.Size([18, 256]) || torch.Size([18, 256]) || torch.Size([18, 256])
torch.Size([17, 256]) || torch.Size([17, 256]) || torch.Size([17, 256])
torch.Size([14, 256]) || torch.Size([14, 256]) || torch.Size([14, 256])
torch.Size([15, 256]) || torch.Size([15, 256]) || torch.Size([15, 256])
torch.Size([16, 256]) || torch.Size([16, 256]) || torch.Size([16, 256])
torch.Size([23, 256]) || torch.Size([23, 256]) || torch.Size([23

torch.Size([15, 256]) || torch.Size([15, 256]) || torch.Size([15, 256])
torch.Size([20, 256]) || torch.Size([20, 256]) || torch.Size([20, 256])
torch.Size([27, 256]) || torch.Size([27, 256]) || torch.Size([27, 256])
torch.Size([18, 256]) || torch.Size([18, 256]) || torch.Size([18, 256])
torch.Size([28, 256]) || torch.Size([28, 256]) || torch.Size([28, 256])
torch.Size([20, 256]) || torch.Size([20, 256]) || torch.Size([20, 256])
torch.Size([16, 256]) || torch.Size([16, 256]) || torch.Size([16, 256])
torch.Size([15, 256]) || torch.Size([15, 256]) || torch.Size([15, 256])
torch.Size([15, 256]) || torch.Size([15, 256]) || torch.Size([15, 256])
torch.Size([15, 256]) || torch.Size([15, 256]) || torch.Size([15, 256])
torch.Size([15, 256]) || torch.Size([15, 256]) || torch.Size([15, 256])
torch.Size([26, 256]) || torch.Size([26, 256]) || torch.Size([26, 256])
torch.Size([21, 256]) || torch.Size([21, 256]) || torch.Size([21, 256])
torch.Size([17, 256]) || torch.Size([17, 256]) || torch.Size([17

In [248]:
def getMasks(src1, src2, tgt, src):
    src_mask1_1 = (src1 != joint_vocab_index_dict["<PAD>"]).unsqueeze(-2) + 0
    src_mask1_2 = src_mask1_1.transpose(-2, -1)
    src_mask1 = ~torch.matmul(src_mask1_2, src_mask1_1).type(torch.bool)
    
    src_mask2_1 = (src2 != joint_vocab_index_dict["<PAD>"]).unsqueeze(-2) + 0
    src_mask2_2 = src_mask2_1.transpose(-2, -1)
    src_mask2 = ~torch.matmul(src_mask2_2, src_mask2_1).type(torch.bool)
    
    tgt_mask1_1 = (tgt != joint_vocab_index_dict["<PAD>"]).unsqueeze(-2) + 0
    tgt_mask1_2 = tgt_mask1_1.transpose(-2, -1)
    tgt_mask1 = torch.matmul(tgt_mask1_2, tgt_mask1_1).type(torch.bool)
    np_mask = np.triu(np.ones((tgt.size(0), tgt.size(1), tgt.size(1))), k=1).astype('uint8')
    np_mask =  Variable(torch.from_numpy(np_mask) == 1)
    tgt_mask1 = tgt_mask1 & np_mask
    
    src_mask3_1 = (src != joint_vocab_index_dict["<PAD>"]).unsqueeze(-2) + 0
    src_mask3_2 = src_mask3_1.transpose(-2, -1)
    src_mask3 = ~torch.matmul(src_mask3_2, src_mask3_1).type(torch.bool)
    for i in range(src_mask3.size(0)):
        for j in range(src_mask3.size(1)):
            for k in range(src_mask3.size(2)):
                if(abs(j-k) > 1):
                    src_mask3[i][j][k] = True
    return src_mask1, src_mask2, tgt_mask1, src_mask3

In [249]:
model = architecture(src_vocab=len(joint_vocab), tgt_vocab=len(joint_vocab), d_model=d_model, Ne1=Ne1, Ne2=Ne2, Nd1=Nd1, heads=heads)
# model = model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=betas, eps=1e-9)

In [250]:
model.train()

architecture(
  (encoder1): Encoder(
    (embed): Embedder(
      (embed): Embedding(5092, 512)
    )
    (layers): ModuleList(
      (0): EncoderLayer(
        (norm1): Norm()
        (norm2): Norm()
        (attn): MultiHeadAttention(
          (q_linear): Linear(in_features=512, out_features=512, bias=True)
          (k_linear): Linear(in_features=512, out_features=512, bias=True)
          (v_linear): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (ff): FeedForward(
          (linear_1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear_2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (norm1): Norm()
        (n

In [251]:
start = time.time()
for epoch in range(epochs):
    total_loss = 0
    if checkpoint > 0:
        torch.save(model.state_dict(), 'checkpoints/model_weights')
    counter = 0
    total_batches = 0
    for i, (src1, src2, tgt) in enumerate(train_dataloader):
        src1 = src1.transpose(0, 1)
        src2 = src2.transpose(0, 1)
        tgt = tgt.transpose(0, 1)
        src = []
        index_list = []
        for i in range(src1.size(0)):
            indexes = (src1[i] == joint_vocab_index_dict["<PAD>"]).nonzero(as_tuple=False)
            if(indexes.shape[0] == 0):
                index_list.append(-1)
                src.append(torch.cat([src1[i], src2[i]]))
            else:
                index = indexes[0][0]
                index_list.append(index)
                src.append(torch.cat([src1[i, :index], src2[i], src1[i, index:]]))
            src[i] = src[i][:src1.size(1)]
        src = torch.stack(src)
        tgt_input = tgt[:, :-1]
        src_mask1, src_mask2, tgt_mask1, src_mask3 = getMasks(src1, src2, tgt, src)
        src1 = src1.type(torch.LongTensor)
        src2 = src2.type(torch.LongTensor)
        tgt = tgt.type(torch.LongTensor)
        preds = model(src1, src2, tgt, src_mask1, src_mask2, tgt_mask1, src_mask3, index_list)
        ys = tgt[:, 1:].contiguous().view(-1)
        preds_just = preds.view(-1, preds.size(-1))
#         print(preds_just.shape)
        preds = preds[:, :preds.shape[1] - 1, :]
        preds_reshaped = []
        for i in range(preds.shape[0]):
            for j in range(preds.shape[1]):
                preds_reshaped.append(preds[i, j, :])
        preds_reshaped = torch.stack(preds_reshaped)
#         print("preds_reshaped.shape : {}".format(preds_reshaped.shape))
#         print("ys.shape : {}".format(ys.shape))
        optimizer.zero_grad()
        loss = F.cross_entropy(preds_reshaped, ys, ignore_index=joint_vocab_index_dict["<PAD>"])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_batches = total_batches + 1
        print("total time till : {} || epoch : {} || batch : {} || loss : {}".format((time.time() - start)//60, epoch + 1, total_batches, loss.item()))
        counter = counter + 1
        if(counter == 10):
            print("total time till : {} || epoch : {} || loss : {}".format((time.time() - start)//60, epoch + 1, total_loss/counter))
            total_loss = 0
            counter = 0

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)

In [None]:
print((torch.tensor([2418,  722,  755,   85, 2380,   11,  236, 1345, 3325, 1642]) == 1).nonzero(as_tuple=False).shape[0])

In [None]:
src, tgt = next(iter(train_dataloader))
print(src[:10])
src = src.transpose(0, 1)
tgt = tgt.transpose(0, 1)
# print(src.transpose(-2, -1).shape)
src_mask = (src != joint_vocab_index_dict["<PAD>"])
print(src_mask[:10])

In [None]:
from torch.autograd import Variable 
src_mask2_1 = (src != joint_vocab_index_dict["<PAD>"]).unsqueeze(-2) + 0
src_mask2_2 = src_mask2_1.transpose(-2, -1)
src_mask2 = ~torch.matmul(src_mask2_2, src_mask2_1).type(torch.bool)
for i in range(src_mask2.size(0)):
    for j in range(src_mask2.size(1)):
        for k in range(src_mask2.size(2)):
            if(abs(j-k) > 1):
                src_mask2[i][j][k] = True
print(src_mask2[0])
# src_mask2_1 = (src != joint_vocab_index_dict["<PAD>"]).unsqueeze(-2) + 0
# src_mask2_2 = src_mask2_1.transpose(-2, -1)
# src_mask2 = torch.matmul(src_mask2_2, src_mask2_1).type(torch.bool)
# np_mask = np.triu(np.ones((src_mask2.size(0), src_mask2.size(1), src_mask2.size(1))), k=2).astype('uint8')
# np_mask = Variable(torch.from_numpy(np_mask) == 0)
# src_mask2 = src_mask2 & np_mask
# np_mask = np.tril(np.ones((src_mask2.size(0), src_mask2.size(1), src_mask2.size(1))), k=2).astype('uint8')
# np_mask = Variable(torch.from_numpy(np_mask) == 0)
# src_mask2 = src_mask2 & np_mask
# print(src_mask2)