In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from preprocessing import *
from torch import nn
import torch

In [None]:
# Build the project preprocessor, have it read the 'dataset' path, and then
# produce the paired input/target tensors used by every later cell.
# NOTE(review): read_data is called before create_tensors; presumably the
# latter depends on the former having run -- confirm in preprocessing.py.
pp = PreProcessor()
pp.read_data('dataset')
input_tensor, output_tensor = pp.create_tensors()

In [None]:
# pp.splitters maps each source CSV name to a (start, end) pair; the pairs are
# used below as slice bounds into the combined input/output tensors.
b_s, b_e = pp.splitters['bengali.csv']
h_s, h_e = pp.splitters['hindi.csv'  ]
t_s, t_e = pp.splitters['telugu.csv' ]

# Per-language views of the combined tensors (b = Bengali, h = Hindi, t = Telugu).
b_input, b_output = input_tensor[b_s:b_e], output_tensor[b_s:b_e]
h_input, h_output = input_tensor[h_s:h_e], output_tensor[h_s:h_e]
t_input, t_output = input_tensor[t_s:t_e], output_tensor[t_s:t_e]

# Number of rows belonging to each language.
b, h, t  =  (b_e-b_s), (h_e-h_s), (t_e-t_s)


def _split_train_test_val(data, n):
	"""Cut `data` along its first axis into contiguous 80% / 10% / 10% parts.

	Args:
		data: Sliceable sequence or tensor holding one language's rows.
		n: Row count the percentages are computed from.

	Returns:
		A ``(train, test, val)`` tuple: rows ``[0, 0.8n)``, ``[0.8n, 0.9n)``
		and ``[0.9n, end)`` respectively.
	"""
	train_end, test_end = int(0.8*n), int(0.9*n)
	return data[:train_end], data[train_end:test_end], data[test_end:]


# Every split goes through the same helper so a language's slices are always
# taken from its own tensor with its own length.  The previous hand-written
# slices drew the Hindi train/test sets from the Bengali tensors and started
# the Telugu test slice at the Bengali 80% mark.
b_input_train, b_input_test, b_input_val    = _split_train_test_val(b_input, b)
h_input_train, h_input_test, h_input_val    = _split_train_test_val(h_input, h)
t_input_train, t_input_test, t_input_val    = _split_train_test_val(t_input, t)

b_output_train, b_output_test, b_output_val = _split_train_test_val(b_output, b)
h_output_train, h_output_test, h_output_val = _split_train_test_val(h_output, h)
t_output_train, t_output_test, t_output_val = _split_train_test_val(t_output, t)

In [None]:
# Constants and Hyperparameters

# Pretrained masked-LM backbone, loaded once here and shared by the model below.
xlm_roberta             = AutoModelForMaskedLM.from_pretrained('xlm-roberta-base')
# The masked-LM head emits one logit per vocabulary entry, so take the width
# from the loaded checkpoint's config (250002 for 'xlm-roberta-base') instead
# of hard-coding it; this stays correct if the checkpoint above is swapped.
xlm_roberta_output_size = xlm_roberta.config.vocab_size
# Size of the third axis of the target tensor, used as the classifier width.
num_tags                = b_output_train.shape[2]
batch_size              = 16
dropout_rate            = 0.2
# NOTE(review): presumably the padded token length of every example -- confirm
# against PreProcessor.max_length.
sequence_length         = pp.max_length

In [None]:
class Model(nn.Module):
	"""Tagging head stacked on the logits of the shared XLM-RoBERTa model.

	The forward pass is: backbone logits -> dropout -> linear projection to
	``num_tags`` -> batch normalisation -> softmax over the last axis.  All
	layer sizes are read from the notebook-level hyperparameters at
	construction time.
	"""

	def __init__(self):
		super().__init__()
		# The backbone is the module-level pretrained instance, not a fresh copy.
		self.xlm_roberta = xlm_roberta
		self.dropout     = nn.Dropout(p=dropout_rate)
		self.linear      = nn.Linear(
			in_features=xlm_roberta_output_size,
			out_features=num_tags,
		)
		# BatchNorm1d treats dim 1 of a 3-D input as its feature axis, which is
		# why it is sized by sequence_length here.
		# NOTE(review): assumes every batch is padded to sequence_length -- confirm.
		self.batch_norm  = nn.BatchNorm1d(sequence_length)
		self.softmax     = nn.Softmax(dim=-1)

	def forward(self, input):
		"""Return softmax-normalised tag scores for `input`.

		Args:
			input: Passed straight to the backbone; presumably a batch of
				token ids shaped (batch, sequence_length) -- TODO confirm.

		Returns:
			A tensor whose last axis has ``num_tags`` entries summing to 1.
		"""
		activations = self.xlm_roberta(input).logits
		# Apply the head layers in order; each one consumes the previous output.
		for layer in (self.dropout, self.linear, self.batch_norm):
			activations = layer(activations)

		return self.softmax(activations)