## Import Libraries and Packages



In [1]:
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import pickle

# from torchtext.legacy.data import Field, BucketIterator,TabularDataset
import random
import re
from torchtext import data

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
#!pip install -U nltk
import nltk
import sys
#nltk.download('wordnet')

from nltk.tokenize import sent_tokenize
import nltk.data
#nltk.download('punkt')
from nltk.tokenize import word_tokenize

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## Load and Preprocess dataset

### Load dataset

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
dataset = pd.read_csv("../input/proj-dataset/proj_dataset.csv" ,index_col=0).reset_index(drop=True)

In [6]:
dataset = dataset[(dataset["labels"]=="FRENCH") | (dataset["labels"]=="SPANISH")]

In [7]:
dataset

Unnamed: 0,source,target,labels
0,Whilst those of a left-wing persuasion define ...,Nous avons donc besoin de ces mesures et le ca...,FRENCH
1,In the interinstitutional agreement on the fin...,". (EN) Actualmente, la Comisión está aplicando...",SPANISH
2,A great deal of funding has been freed up for ...,Pero aunque se reservan el poder de decidir no...,SPANISH
3,"We must all do this together, at all levels of...",Yo también tengo en mi poder una versión más l...,SPANISH
4,The European Union must contribute towards the...,"Pour la présidence française, cette propositio...",FRENCH
...,...,...,...
299995,"We have high taxation, which I used to be in f...",Cette obligation est déjà stipulée dans le tex...,FRENCH
299996,Other directives will follow and I believe tha...,Si nous exécutons ce programme en faisant preu...,FRENCH
299997,Nothing short of a cessation of the conflict w...,Debemos elaborar un plan de recursos humanos a...,SPANISH
299998,I understand that Members who are not taking p...,"Pour les hommes politiques, la pratique qui co...",FRENCH


#### Split into train and dev set

In [8]:
train_data, dev_data = train_test_split(dataset, test_size=0.2, shuffle=False)

dev_data.to_csv("dev_set.csv", index=None)
train_data.to_csv("train_set.csv",index=None)  

In [9]:
# train_data = pd.read_csv('../input/notebook0bb169eae2/train_set.csv')
# dev_data =  pd.read_csv('../input/notebook0bb169eae2/dev_set.csv')

### Preprocess data

In [10]:
en_short_forms_dict ={"'ll":" will",
                      "'re":" are",
                      "i'm":"i am",
                      "'ve":" have",
                      "\'ve" :" have",
                      "\'s":"'s",
                      "\'ll":" will",
                      "\'re":" are",
                      "n\'t":"n't" ,
                      " y'all":" you all",
                       " i\'m":" i am",
                      "'em":"them",
                      "can't":"can not",
                      "won't":"will not",
                      "cannot":"can not",
                       "isn't" :"is not",
                       "aren't":"are not",
                      "wouldn't":"would not",
                      "shouldn't":"should not",
                      "couldn't":"could not",
                      "wasn't":"was not",
                      "weren't":"were not",
                      "hasn't":"has not",
                      "hadn't":"had not",
                      "haven't":"have not",
                      "'ii":" will",
                      "fuckin'":"funcking"   
                     }



def tgt_tokenizer(label, sentence):
  language = ""
  if label in ['HUMOROUS', 'NON-HUMOROUS']:
     language = 'english'
  elif label =='FRENCH':
     language = 'french'
  elif label == 'SPANISH':
     language = 'spanish'
  regex = re.compile(r'[@_!♫♪#$%^&*(.,)<>?/\|}{~:;-]')
  sentence = regex.sub('',sentence)                  
  sentence = sentence.lower()   
  for key in en_short_forms_dict:
    sentence= sentence.replace(key, en_short_forms_dict[key])           #replaces short forms with full forms 
  token_list = word_tokenize(sentence,language=language)                                  #tokenization
  return token_list 

In [11]:
class Vocab:
   def __init__(self):
     self.word2index = {"<unk>":0 , "<sos>":1, "<eos>":2, "<pad>":3}
     self.index2word = {0:"<unk>" , 1:"<sos>", 2:"<eos>", 3:"<pad>"}
     self.vocab_size = len(self.word2index)
     self.word_count = {"<unk>":1 , "<sos>":1, "<eos>":1, "<pad>":1}

   def add_to_vocab(self,token_list):
     for token in token_list:
        if token not in self.word2index:        #add to vocab only if its not already present 
           ind = len(self.word2index)
           self.word2index[token] = ind
           self.index2word[ind] = token
           self.vocab_size += 1
           self.word_count[token] = 1
        else:
           self.word_count[token] += 1  

In [12]:
train_data

Unnamed: 0,source,target,labels
0,Whilst those of a left-wing persuasion define ...,Nous avons donc besoin de ces mesures et le ca...,FRENCH
1,In the interinstitutional agreement on the fin...,". (EN) Actualmente, la Comisión está aplicando...",SPANISH
2,A great deal of funding has been freed up for ...,Pero aunque se reservan el poder de decidir no...,SPANISH
3,"We must all do this together, at all levels of...",Yo también tengo en mi poder una versión más l...,SPANISH
4,The European Union must contribute towards the...,"Pour la présidence française, cette propositio...",FRENCH
...,...,...,...
240023,I therefore hope that proposed Amendment No 19...,"L'étiquetage permet, grâce à la transparence q...",FRENCH
240026,"According to the Rules of Procedure, you are n...",La formation du personnel intégrant les déléga...,FRENCH
240027,We can put a figure on both our employment obj...,Esta tendencia se pone igualmente de manifiest...,SPANISH
240028,"In France, 95% of elected representatives are ...","Là-bas, le détachement chargé de rendre les ho...",FRENCH


In [13]:
token_columns = {"src_token_indices":[], "label_token_indices":[],"tgt_token_indices":[]}

src_field = Vocab()
tgt_field = Vocab()
for i in range(len(train_data)):
   
   temp_src = train_data['source'].iloc[i]
   temp_tgt = train_data['target'].iloc[i]
   temp_label = train_data['labels'].iloc[i]

   #tokenisation
   src_tokens = tgt_tokenizer(temp_label, temp_src)
   tgt_tokens = tgt_tokenizer(temp_label, temp_tgt)

   #add to vocab
   src_field.add_to_vocab(src_tokens)
   tgt_field.add_to_vocab(tgt_tokens)

   #convert tokens to indices and add <sos> at the beginning ang <eos> at the end
   src_indices = [src_field.word2index['<sos>']] + [src_field.word2index[token] if token in src_field.word2index else src_field.word2index["<unk>"] for token in src_tokens] + [src_field.word2index['<eos>']]
   tgt_indices = [tgt_field.word2index['<sos>']] + [tgt_field.word2index[token] if token in tgt_field.word2index else tgt_field.word2index["<unk>"] for token in tgt_tokens] + [tgt_field.word2index['<eos>']]
   
   #add to dictionary
   token_columns["src_token_indices"].append(src_indices)
   token_columns["tgt_token_indices"].append(tgt_indices)


train_data["src_token_indices"] =  token_columns["src_token_indices"]
train_data["tgt_token_indices"] =  token_columns["tgt_token_indices"]

#calculate the no of tokens in src
train_data["len_of_src"] = train_data["src_token_indices"].apply(len)
train_data["len_of_tgt"] = train_data["tgt_token_indices"].apply(len)
train_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,source,target,labels,src_token_indices,tgt_token_indices,len_of_src,len_of_tgt
0,Whilst those of a left-wing persuasion define ...,Nous avons donc besoin de ces mesures et le ca...,FRENCH,"[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 7, 14, 1...","[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...",47,56
1,In the interinstitutional agreement on the fin...,". (EN) Actualmente, la Comisión está aplicando...",SPANISH,"[1, 29, 25, 41, 42, 43, 25, 44, 45, 46, 37, 47...","[1, 52, 53, 45, 54, 55, 56, 57, 58, 59, 60, 61...",57,30
2,A great deal of funding has been freed up for ...,Pero aunque se reservan el poder de decidir no...,SPANISH,"[1, 7, 65, 66, 6, 67, 68, 69, 70, 71, 72, 73, ...","[1, 74, 75, 76, 77, 69, 78, 8, 79, 80, 81, 82,...",31,20
3,"We must all do this together, at all levels of...",Yo también tengo en mi poder una versión más l...,SPANISH,"[1, 80, 87, 88, 89, 90, 91, 13, 88, 92, 6, 25,...","[1, 88, 89, 90, 52, 91, 78, 92, 93, 94, 95, 8,...",15,24
4,The European Union must contribute towards the...,"Pour la présidence française, cette propositio...",FRENCH,"[1, 25, 93, 94, 87, 95, 96, 25, 97, 98, 6, 25,...","[1, 29, 45, 103, 104, 105, 106, 107, 108, 109,...",39,19
...,...,...,...,...,...,...,...
240023,I therefore hope that proposed Amendment No 19...,"L'étiquetage permet, grâce à la transparence q...",FRENCH,"[1, 140, 365, 1089, 109, 308, 307, 206, 9459, ...","[1, 5678, 3019, 121, 296, 45, 3024, 635, 28573...",22,34
240026,"According to the Rules of Procedure, you are n...",La formation du personnel intégrant les déléga...,FRENCH,"[1, 1156, 53, 25, 752, 6, 432, 545, 32, 137, 1...","[1, 45, 2072, 124, 549, 527, 214, 16630, 47, 2...",14,17
240027,We can put a figure on both our employment obj...,Esta tendencia se pone igualmente de manifiest...,SPANISH,"[1, 80, 111, 708, 7, 1344, 43, 995, 458, 1673,...","[1, 167, 3823, 76, 6418, 10781, 8, 4043, 52, 4...",33,42
240028,"In France, 95% of elected representatives are ...","Là-bas, le détachement chargé de rendre les ho...",FRENCH,"[1, 29, 2283, 6048, 6, 1268, 2345, 32, 6420, 1...","[1, 20675, 12, 26104, 3534, 8, 2709, 214, 5501...",11,28


In [14]:
# save Vocab object
with open('src_field.pkl','wb') as pkl:
    pickle.dump(src_field, pkl)

with open('tgt_field.pkl','wb') as pkl:
    pickle.dump(tgt_field, pkl)

In [15]:
# load Vocab object
# with open('../input/notebook0bb169eae2/src_field.pkl','rb') as pkl:
#     src_field = pickle.load(pkl)

# with open('../input/notebook0bb169eae2/tgt_field.pkl','rb') as pkl:
#     tgt_field = pickle.load(pkl)

In [16]:
max_src_len = max(train_data["len_of_src"])
max_tgt_len = max(train_data["len_of_tgt"])

In [17]:
def add_padding(token_list,length,field):
    while len(token_list)<length:
       token_list.append(field.word2index["<pad>"])
    return token_list

In [18]:
train_data["src_token_indices"] = train_data["src_token_indices"].apply(add_padding, args=(max_src_len,src_field)) 
train_data["tgt_token_indices"] = train_data["tgt_token_indices"].apply(add_padding, args=(max_tgt_len,tgt_field)) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [19]:
train_src = np.vstack(list(train_data["src_token_indices"]))
train_tgt = np.vstack(list(train_data["tgt_token_indices"]))

In [20]:
classes = ['FRENCH', 'SPANISH']
onehot_encoder = OneHotEncoder(categories=[classes])
train_labels = onehot_encoder.fit_transform(np.array(train_data['labels']).reshape(-1, 1)).toarray()

In [21]:
train_src.shape

(160000, 259)

In [22]:
### Need to Edit!

## Hyperparameters
d_model = 256
src_pad_idx = src_field.word2index["<pad>"]
nhead = 4
src_vocab_size = src_field.vocab_size
tgt_vocab_size = tgt_field.vocab_size
num_encoder_layers = 2
num_decoder_layers = 2

dim_feedforward = 512
dropout = 0.2

n_cond_label = len(classes)

lr = 0.001
batch_size = 2
epochs = 20

In [23]:
class Dataset(torch.utils.data.Dataset):
  def __init__(self, src, tgt, labels):
    self.src = src
    self.tgt = tgt
    self.labels = labels

  def __len__(self):
    return self.src.shape[0]
  
  def __getitem__(self, index):
    return self.src[index], self.tgt[index], self.labels[index]

In [24]:
train_dataset = Dataset(train_src, train_tgt, train_labels)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [25]:
# latent: [batch, emb_dim, d_model]
# label : [batch, no_of_labels]
# new_latent: [batch, emb_dim, d_model+no_of_labels]

def latent_space_concat(latent, label):
  new_label = label.unsqueeze(2).transpose(1, 2)
  new_label = new_label.expand(-1, latent.size(1), -1)
  new_latent = torch.cat((latent, new_label), dim=2)
  return new_latent

In [26]:
# example

a = torch.arange(2*3*4).reshape(2, 3, 4)
b = torch.arange(6).reshape(-1, 3)
c = latent_space_concat(a, b)

print("a:::",a)
print("b:::",b)
print("c:::",c)
print(a.shape, b.shape, c.shape)

a::: tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])
b::: tensor([[0, 1, 2],
        [3, 4, 5]])
c::: tensor([[[ 0,  1,  2,  3,  0,  1,  2],
         [ 4,  5,  6,  7,  0,  1,  2],
         [ 8,  9, 10, 11,  0,  1,  2]],

        [[12, 13, 14, 15,  3,  4,  5],
         [16, 17, 18, 19,  3,  4,  5],
         [20, 21, 22, 23,  3,  4,  5]]])
torch.Size([2, 3, 4]) torch.Size([2, 3]) torch.Size([2, 3, 7])


In [27]:
'''
__init__() params:
  d_model             : feature dimension
  nhead               : number of heads in multiheadattention
  n_cond_label        : number of conditioning labels
  num_encoder_layers  : number of encoders
  num_decoder_layers  : number of decoders
  dim_feedforward     : FNN dimension
  dropout             : p value of dropout

forward() params:
  src                   : sequence to encoder stack
  tgt                   : sequence to decoder stack
  labels                : conditioning labels
  tgt_mask              : mask for tgt sequence
  src_key_padding_mask  : mask for src keys per batch
'''

class ModifiedTransformer(nn.Module):
  def __init__(self, d_model = 512, nhead = 8, n_cond_label = 1,
                     num_encoder_layers = 6, num_decoder_layers = 6,
                     dim_feedforward = 2048, dropout = 0.1):
    super(ModifiedTransformer, self).__init__()

    self.d_model = d_model

    encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, 'relu')
    encoder_norm = nn.LayerNorm(d_model)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

    decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, 'relu')
    decoder_norm = nn.LayerNorm(d_model)
    self.decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)

    self.fnn1 = nn.Linear(d_model + n_cond_label, d_model*2)
    self.fnn2 = nn.Linear(d_model*2, d_model)
    self._reset_parameters()


  def forward(self, src, tgt, labels, tgt_mask = None, src_key_padding_mask = None):
    test = "number of batches in src and tgt must be equal"
    assert src.size(1) == tgt.size(1), test

    test = "number of features in src and tgt, must be equal to d_model"
    assert src.size(2) == self.d_model and tgt.size(2) == self.d_model, test

    test = "number of batches in src and labels must be equal"
    assert src.size(1) == labels.size(1), test

    memory = self.encoder(src, src_key_padding_mask=src_key_padding_mask)

    new_memory = latent_space_concat(memory.transpose(0, 1), labels.T)
    fnn_out = self.fnn1(new_memory)
    fnn_out = F.leaky_relu(fnn_out)
    fnn_out = self.fnn2(fnn_out)
    reduced_memory = F.leaky_relu(fnn_out)

    output = self.decoder(tgt, reduced_memory, tgt_mask=tgt_mask)
    return output


  def generate_square_subsequent_mask(self, sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


  def _reset_parameters(self):
    for p in self.parameters():
      if p.dim() > 1:
        nn.init.xavier_uniform_(p)

In [28]:
def get_PE(max_len, d_model):
  position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
  div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
  pe = torch.empty(max_len, d_model)
  pe[:, 0::2] = torch.sin(position * div_term)
  pe[:, 1::2] = torch.cos(position * div_term)
  pe = pe.unsqueeze(0).transpose(0, 1)
  return pe

In [29]:
class Model(nn.Module):
  def __init__(self, d_model, src_pad_idx, nhead,
               src_vocab_size, tgt_vocab_size, num_encoder_layers, num_decoder_layers,
               n_cond_label, max_src_len, max_tgt_len, dim_feedforward, dropout):
    super(Model, self).__init__()

    self.PE = get_PE(max(max_src_len, max_tgt_len), d_model)
    self.src_embedding = nn.Embedding(src_vocab_size, d_model)
    self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
    
    self.transformer = ModifiedTransformer(d_model, nhead, n_cond_label, 
                                           num_encoder_layers, num_decoder_layers, dim_feedforward, dropout)
    self.fnn_out = nn.Linear(d_model, tgt_vocab_size)
    self.max_src_len = max_src_len
    self.max_tgt_len = max_tgt_len

    self.d_model = d_model
    self.src_pad_idx = src_pad_idx


  def forward(self, src, tgt, labels):
    src, tgt, labels = src.T, tgt.T, labels.T
    src_seq_len, src_N = src.shape
    tgt_seq_len, tgt_N = tgt.shape
    label_feature_len, label_N = labels.shape

    src_PE = self.PE[:src_seq_len, :].to(src.device)
    tgt_PE = self.PE[:tgt_seq_len, :].to(src.device)

    emb_src = self.src_embedding(src) + src_PE
    emb_tgt = self.tgt_embedding(tgt) + tgt_PE

    src_key_padding_mask = (src.transpose(0, 1) == self.src_pad_idx)
    tgt_mask = self.transformer.generate_square_subsequent_mask(tgt_seq_len).to(src.device)
    tran_out = self.transformer(emb_src, emb_tgt, labels, src_key_padding_mask=src_key_padding_mask, tgt_mask=tgt_mask)
    return self.fnn_out(tran_out.transpose(0, 1))

In [30]:
model = Model(d_model, src_pad_idx, nhead, src_vocab_size, tgt_vocab_size, num_encoder_layers, num_decoder_layers,
              n_cond_label, max_src_len, max_tgt_len, dim_feedforward, dropout)
# model.load_state_dict(torch.load("../input/notebook0bb169eae2/model14.pth"))
model = model.to(device)

    
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=src_pad_idx)


In [31]:
for epoch in range(epochs):
  losses = np.empty(len(train_loader), dtype=np.float32)
  for idx,(src,tgt,labels) in enumerate(train_loader):
    optimizer.zero_grad()
 
    src = torch.as_tensor(src, dtype=torch.int64).to(device)
    tgt = torch.as_tensor(tgt, dtype=torch.int64).to(device)
    labels = torch.as_tensor(labels, dtype=torch.float32).to(device)
 
    predicts = model(src, tgt[:, :-1], labels)
    predicts = F.log_softmax(predicts, dim=2)
    predicts = predicts.reshape(-1, predicts.shape[2])
    loss = criterion(predicts, tgt[:, 1:].reshape(-1))
    loss.backward()
    losses[idx] = loss.item()
    
    optimizer.step()
    #break
  torch.save(model.state_dict(), f'model{epoch}.pth')
  print('{}: {:.4f}'.format(epoch, losses.mean()))
  #break

0: 7.4302


1: 7.4912


2: 7.5220


3: 7.5342


In [None]:
torch.save(model.state_dict(), 'model.pth')