In [2]:
import torch
import torch.nn as nn
from torch import tensor
import numpy as np
from tqdm import tqdm
import pandas as pd
import itertools
import pickle
from transformers import AutoTokenizer, AutoModel
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device


device(type='cuda')

In [3]:
class Dataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    ##################### TODO: create two tensors one for x and the other for labels ###############################
    # padding all sentences and labels to have the same length
    max_len = max([len(sentence) for sentence in x])
    
    for i in range(len(x)):
      x[i] = torch.cat((x[i], torch.zeros(max_len - len(x[i]), 768)), 0) 
      y[i] = y[i] + [0] * (max_len - len(y[i]))
      
    self.x = torch.stack(x)
    self.y = torch.tensor(y)
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################
    return len(self.x)
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.x[idx], self.y[idx]
    ##########################################################################################

In [4]:
class Context_Model(nn.Module):
  def __init__(self, input_size=768, hidden_size=50, n_classes=100, bidirectional=False):
    """
    The constructor of our NER model
    Inputs:
    - vacab_size: the number of unique words
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(Context_Model, self).__init__()
    ####################### TODO: Create the layers of your model #######################################
    # (1) Create the embedding layer
    

    # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
    self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=bidirectional)

    # (3) Create a linear layer with number of neorons = n_classes
    self.linear = nn.Linear(hidden_size * (2 if bidirectional else 1), n_classes)
    #####################################################################################################

  def forward(self, X, hidden=None):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """

    ######################### TODO: implement the forward pass ####################################
    final_output, hidden = self.lstm(X, hidden)
    final_output = self.linear(final_output)
    ###############################################################################################
    return final_output, hidden

In [5]:
def train(model, train_dataset, n_classes, batch_size=512, epochs=5, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """

  ############################## TODO: replace the Nones in the following code ##################################
  
  # (1) create the dataloader of the training set (make the shuffle=True)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  # (2) make the criterion cross entropy loss
  criterion = torch.nn.CrossEntropyLoss()

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):

      # (4) move the train input to the device
      train_input = train_input.to(device)

      # (5) move the train label to the device
      train_label = train_label.to(device)


      # (6) do the forward pass
      output, _ = model(train_input)
      output = output.to(device)
      
      # (7) loss calculation (you need to think in this part how to calculate the loss correctly)
      # -1 is ignore 
      batch_loss = criterion(output.view(-1, n_classes), train_label.view(-1))
    

      # (8) append the batch loss to the total_loss_train
      total_loss_train += batch_loss.item()
      
      # (9) calculate the batch accuracy (just add the number of correct predictions)
      acc = (output.argmax(dim=-1) == train_label).sum().item()
      total_acc_train += acc

      # (10) zero your gradients
      optimizer.zero_grad()

      # (11) do the backward pass
      batch_loss.backward(retain_graph=True)

      # (12) update the weights with your optimizer
      optimizer.step()
      
    # epoch loss
    epoch_loss = total_loss_train / len(train_dataset)

    # (13) calculate the accuracy
    epoch_acc = total_acc_train / (len(train_dataset) * train_dataset[0][0].shape[0])

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')

  ##############################################################################################################

In [12]:
df = pd.read_csv("./processed/val.csv")

data = df["tokenized_cleaned"].apply(eval)
data = list(filter(lambda sent: len(sent) > 1, data))
id2word = {idx+1:word for idx, word in  enumerate(set(itertools.chain.from_iterable(data)))}
id2word[0] = '<PAD>'
print("id2word ...")

word2id = {word:idx for idx, word in id2word.items()}
print("word2id ...")

context_model = Context_Model(hidden_size=256, n_classes=len(word2id.keys()), bidirectional=True).to(device)

X  = pd.DataFrame()


tokenizer = AutoTokenizer.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-ca')
embedder = AutoModel.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-ca')
X[0] = [tokenizer(sent[:-1], return_tensors="pt", padding=True) for sent in data]

print("starting embedding...")

tqdm.pandas()
train_data = []
for i in range(0, len(X[0]), 500):
  batch = X[0][i:i+500].progress_apply(lambda x: embedder(**x).last_hidden_state[:,1, :])
  train_data.extend(batch)
Y = [ [word2id[word] for word in sent[1:]] for sent in data]
print("starting training...")

train_dataset = Dataset(train_data, Y, np.zeros(768))

train(context_model, train_dataset, n_classes=len(word2id.keys()), batch_size=20, epochs=3, learning_rate=0.01)

pickle.dump(context_model, open("./models/context_model_bi.pkl", "wb"))

id2word ...
word2id ...
starting embedding...


100%|██████████| 5/5 [00:00<00:00,  6.14it/s]


starting training...


100%|██████████| 1/1 [00:01<00:00,  1.78s/it]


Epochs: 1 | Train Loss: 0.9554011344909668         | Train Accuracy: 0.0017699115044247787



100%|██████████| 1/1 [00:01<00:00,  1.89s/it]


Epochs: 2 | Train Loss: 0.8444215774536132         | Train Accuracy: 0.7238938053097345



100%|██████████| 1/1 [00:01<00:00,  1.86s/it]

Epochs: 3 | Train Loss: 0.2575518608093262         | Train Accuracy: 0.7451327433628319




