In [None]:
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
import pandas as pd
import itertools
from transformers import AutoTokenizer, AutoModel
import pickle 
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
class Dataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad_x, pad_y):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    ##################### TODO: create two tensors one for x and the other for labels ###############################
    self.data = torch.tensor(x)
    self.labels = torch.tensor(y)

    max_len = max([len(sentence) for sentence in x])
    for i in range(len(x)):
      x[i] = x[i] + [pad_x] * (max_len - len(x[i]))
      y[i] = y[i] + [pad_y] * (max_len - len(y[i]))
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################
    return len(self.data)
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.data[idx], self.labels[idx]
    ##########################################################################################


In [None]:

class NER(nn.Module):
  def __init__(self, vocab_size=35181, embedding_dim=50, hidden_size=50, n_classes=12):
    """
    The constructor of our NER model
    Inputs:
    - vacab_size: the number of unique words
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(NER, self).__init__()
    ####################### TODO: Create the layers of your model #######################################
    # (1) Create the embedding layer
    self.embedding = nn.Embedding(vocab_size, embedding_dim)

    # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)

    # (3) Create a linear layer with number of neorons = n_classes
    self.linear = nn.Linear(hidden_size, n_classes)
    #####################################################################################################

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """

    final_output = None
    ######################### TODO: implement the forward pass ####################################
    embeddings = self.embedding(sentences)
    lstm_out, _ = self.lstm(embeddings)
    final_output = self.linear(lstm_out)
    
    ###############################################################################################
    return final_output

## Ml_Model building

#### RNN

In [None]:
class Torch_Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, num_layers, bidirectional=False, type='RNN', dropout=0.0):
        super(Torch_Model, self).__init__()
        self.num_layers = num_layers if not bidirectional else num_layers*2
        self.hidden_to_output_size = hidden_size if not bidirectional else hidden_size*2
        self.hidden_size = hidden_size
        if type == 'RNN':
            self.rnn = nn.RNN(input_size, hidden_size, self.num_layers,
                               bidirectional=bidirectional, batch_first=True, dropout=dropout)
        elif type == 'LSTM':
            self.rnn = nn.LSTM(input_size, hidden_size, self.num_layers,
                                bidirectional=bidirectional,batch_first=True, dropout=dropout)
        elif type == 'GRU':
            self.rnn = nn.GRU(input_size, hidden_size, self.num_layers,
                               bidirectional=bidirectional,batch_first=True, dropout=dropout)
        # -> x needs to be: (batch_size, seq, input_size)

        self.hidden_to_output = nn.Linear(self.hidden_to_output_size, num_classes)
        # self.soft_max = nn.LogSoftmax(dim=1)
        

        
    def forward(self, x, hidden=None):
        # Set initial hidden states (and cell states for LSTM)
        if hidden is None:
            if isinstance(self.rnn, nn.LSTM):
                # hidden = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
                hidden = (torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device),
                           torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device))
            else:
                hidden = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        
        
        h_out, hidden_out = self.rnn(x, hidden)
        
        h_out = h_out[:, -1, :]
        out = self.hidden_to_output(h_out)
        # out = self.soft_max(out)
        
        return out, hidden_out


#### General Model

In [None]:
class Ml_Model():
    # Hyper-parameters 
    num_epochs = 10
    # warmup_epochs = 2
    batch_size = 1
    learning_rate = 0.001
    # warmup_learning_rate = 0.01

    sequence_length = 1

    
    def __init__(self, input_size, hidden_size, num_classes, num_layers = 1, type="RNN",
                  bidirectional=False, optimizer="Adam", loss="CrossEntropy", dropout=0.0,
                  word2idx=None):
        # TODO: instantiate model
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.model = None
        self.loss = None

        self.word2idx = word2idx

        self.model = Torch_Model(input_size, hidden_size, num_classes, num_layers,
                                    bidirectional=bidirectional, dropout=dropout, type=type).to(device)

        if optimizer == "Adam":
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=Ml_Model.learning_rate)
        elif optimizer == "SGD":
            self.optimizer = torch.optim.SGD(self.model.parameters(), lr=Ml_Model.learning_rate)
        
        if loss == "CrossEntropy":
            self.loss = nn.CrossEntropyLoss()
        elif loss == "MSE":
            self.loss = nn.MSELoss()

        self.tokenizer = AutoTokenizer.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-ca')
        self.embedder = AutoModel.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-ca')
        

    def train(self, data):
        # TODO: train model GENERIC
        # self.model.fit(data, labels)
        X = [sent[:-1] for sent in data]
        # TODO: padding error
        Y = [self.word2idx[word] for sent in data for word in sent[1:]]
        Y = torch.tensor(Y)
        data_tokenized = self.tokenizer(X, return_tensors="pt", padding=True, truncation=True, max_length=512)
        data_embedded = self.embedder(**data_tokenized)
        X = data_embedded.last_hidden_state[:, 1:-1, :].detach().numpy()
        PAD = np.zeros(X.shape[2])

        data = Dataset(X, Y, PAD, self.word2idx['<PAD>'])
        train_dataloader = torch.utils.data.DataLoader(data, batch_size=Ml_Model.batch_size, shuffle=True).to(device)


        if isinstance(self.model, Torch_Model):

            for epoch in range(Ml_Model.num_epochs):

                for train_input, train_label in tqdm(train_dataloader):
                    total_loss_train = 0

                    words = train_input.to(device)

                    label = train_label.to(device)
                    # Forward pass
                    outputs, _ = self.model(words)
                    self.loss = self.loss(outputs.view(-1, self.num_classes), label.view(-1))
                    
                    total_loss_train += self.loss.item()


                    # Backward and optimize
                    self.optimizer.zero_grad()
                    self.loss.backward()
                    self.optimizer.step()
                
                epoch_loss = total_loss_train / len(train_dataloader)
                
                print(f'Epochs: {epoch + 1} | Train Loss: {epoch_loss}')
    
    # def predict(self, data):
    #     # TODO: run model and return predicted output
    #     if isinstance(self.model, Torch_Model):
    #         with torch.no_grad():
    #             words = data.reshape(-1, self.sequence_length, self.input_size).to(device)
    #             outputs = self.model(words)
    #             # max returns (value ,index)
    #             _, predicted = torch.max(outputs.data, 1)
    #             return predicted

    # def test(self, data, labels):
    #     # TODO: calculate score and print them
    #     if isinstance(self.model, RNN):
    #         with torch.no_grad():
    #             n_correct = 0
    #             n_samples = 0
    #             for i in range(0, len(data), batch_size):
    #                 words = data[i:i+batch_size]
    #                 words = words.reshape(-1, sequence_length, input_size).to(device)
    #                 labels = labels[i:i+batch_size].to(device)
    #                 outputs = self.model(words)
    #                 # max returns (value ,index)
    #                 _, predicted = torch.max(outputs.data, 1)
    #                 n_samples += labels.size(0)
    #                 n_correct += (predicted == labels).sum().item()

    #             acc = 100.0 * n_correct / n_samples
    #             print(f'Accuracy of the network on the 10000 test images: {acc} %')


In [None]:

df = pd.read_csv("./processed/val.csv")
data = df["tokenized_diacritics_removed"].apply(eval)
# data = list(filter(lambda sent: len(sent) > 2 and min([len(word) for word in  sent]) > 1, data))
id2word = {idx:word for idx, word in  enumerate(set(itertools.chain.from_iterable(data)))}
word2id = {word:idx for idx, word in id2word.items()}

letter_model = Ml_Model(input_size=768, hidden_size=512, num_classes=len(word2id.keys()), type="LSTM", word2idx=word2id)

letter_model.train(list(data))

In [None]:
letter_model.predict(torch.tensor([0,1,2,3,4,5], dtype=torch.float32))

## Classification Model

In [None]:
class Tashkeel_Model():

    def __init__(self, model_type=None):
        # TODO: create model and pre processing module

        self.model = Ml_Model()
        # self.preprocessor = Preprocessing()
        pass
    
    def load_data(self, df):
        self.data = df["features"]
        self.labels = df["labels"]
        
        pass
    
    def preprocess_data(self):
        self.preprocessor.process(self.data)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Sample data (replace this with your dataset)
sentences = [
    "This is a sample sentence.",
    "Another example of a sentence.",
    "PyTorch RNN modeling is fascinating."
]

# Tokenizing and converting sentences to numerical representations (indices)
word_to_idx = {}  # Your word to index mapping
idx_to_word = {}  # Your index to word mapping

# Assuming you have a function to tokenize your sentences and create word-to-index mapping
def tokenize_sentence(sentence):
    # Tokenize your sentence and convert to indices based on word_to_idx mapping
    # Return a list of indices
    pass

# Convert sentences to indices
indexed_sentences = [tokenize_sentence(sentence) for sentence in sentences]

# Define an RNN language model
class NextWordPredictor(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(NextWordPredictor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output)
        return output, hidden

    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_dim)

# Instantiate the model, loss function, and optimizer
vocab_size = len(word_to_idx)  # Define your vocabulary size
embedding_dim = 128
hidden_dim = 128
model = NextWordPredictor(vocab_size, embedding_dim, hidden_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for sentence in indexed_sentences:
        optimizer.zero_grad()
        inputs = torch.tensor(sentence[:-1], dtype=torch.long).unsqueeze(0)  # Input sequence
        targets = torch.tensor(sentence[1:], dtype=torch.long)  # Target sequence

        hidden = model.init_hidden(1)
        outputs, _ = model(inputs, hidden)
        loss = criterion(outputs.view(-1, vocab_size), targets)

        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}')


# DONT LOOK UP

In [1]:
import torch
import torch.nn as nn
from torch import tensor
import numpy as np
from tqdm import tqdm
import pandas as pd
import itertools
from transformers import AutoTokenizer, AutoModel
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [2]:
class Dataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    ##################### TODO: create two tensors one for x and the other for labels ###############################
    # padding all sentences and labels to have the same length
    max_len = max([len(sentence) for sentence in x])
    
    for i in range(len(x)):
      x[i] = torch.cat((x[i], torch.zeros(max_len - len(x[i]), 768)), 0) 
      y[i] = y[i] + [0] * (max_len - len(y[i]))
      
    self.x = torch.stack(x)
    self.y = torch.tensor(y)
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################
    return len(self.x)
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.x[idx], self.y[idx]
    ##########################################################################################

In [3]:
class Model(nn.Module):
  def __init__(self, input_size=768, hidden_size=50, n_classes=100, bidirectional=False):
    """
    The constructor of our NER model
    Inputs:
    - vacab_size: the number of unique words
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(Model, self).__init__()
    ####################### TODO: Create the layers of your model #######################################
    # (1) Create the embedding layer
    

    # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
    self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=bidirectional)

    # (3) Create a linear layer with number of neorons = n_classes
    self.linear = nn.Linear(hidden_size * (2 if bidirectional else 1), n_classes)
    #####################################################################################################

  def forward(self, X):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """

    final_output = None
    ######################### TODO: implement the forward pass ####################################
    final_output, hidden = self.lstm(X)
    final_output = self.linear(final_output)
    ###############################################################################################
    return final_output, hidden

In [4]:
def train(model, train_dataset, n_classes, batch_size=512, epochs=5, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """

  ############################## TODO: replace the Nones in the following code ##################################
  
  # (1) create the dataloader of the training set (make the shuffle=True)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  # (2) make the criterion cross entropy loss
  criterion = torch.nn.CrossEntropyLoss()

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):

      # (4) move the train input to the device
      train_input = train_input.to(device)

      # (5) move the train label to the device
      train_label = train_label.to(device)


      # (6) do the forward pass
      output, _ = model(train_input)
      output = output.to(device)
      
      # (7) loss calculation (you need to think in this part how to calculate the loss correctly)
      # -1 is ignore 
      batch_loss = criterion(output.view(-1, n_classes), train_label.view(-1))
    

      # (8) append the batch loss to the total_loss_train
      total_loss_train += batch_loss.item()
      
      # (9) calculate the batch accuracy (just add the number of correct predictions)
      acc = (output.argmax(dim=-1) == train_label).sum().item()
      total_acc_train += acc

      # (10) zero your gradients
      optimizer.zero_grad()

      # (11) do the backward pass
      batch_loss.backward(retain_graph=True)

      # (12) update the weights with your optimizer
      optimizer.step()
      
    # epoch loss
    epoch_loss = total_loss_train / len(train_dataset)

    # (13) calculate the accuracy
    epoch_acc = total_acc_train / (len(train_dataset) * train_dataset[0][0].shape[0])

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')

  ##############################################################################################################

In [5]:
def evaluate(model, test_dataset, batch_size=512):
  """
  This function takes a NER model and evaluates its performance (accuracy) on a test data
  Inputs:
  - model: a NER model
  - test_dataset: dataset of type NERDataset
  """
  ########################### TODO: Replace the Nones in the following code ##########################

  # (1) create the test data loader
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

  # GPU Configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()

  total_acc_test = 0
  
  # (2) disable gradients
  with torch.no_grad():

    for test_input, test_label in tqdm(test_dataloader):
      # (3) move the test input to the device
      test_label = test_label.to(device)

      # (4) move the test label to the device
      test_input = test_input.to(device)

      # (5) do the forward pass
      output, _ = model(test_input)

      # accuracy calculation (just add the correct predicted items to total_acc_test)
      acc = (output.argmax(dim=-1) == test_label).sum().item()
      total_acc_test += acc
    
    # (6) calculate the over all accuracy
    total_acc_test /= (len(test_dataset) * test_dataset[0][0].shape[0])
  ##################################################################################################

  
  print(f'\nTest Accuracy: {total_acc_test}')

In [6]:
df = pd.read_csv("./processed/val.csv")
df.head()

Unnamed: 0,0,tokenized,tokenized_cleaned,cleaned,diacritics,word_with_context
0,قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ .,"['قَوْلُهُ', 'وَلَا', 'تُكْرَهُ', 'ضِيَافَتُهُ']","['قوله', 'ولا', 'تكره', 'ضيافته']",قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ,"[[0, 6, 2, 2], [0, 0, 14], [2, 6, 0, 2], [4, 0...","[('قوله', tensor([[-5.7860e-05, 7.6122e-01, ..."
1,الْفَرْقُ الثَّالِثُ وَالثَّلَاثُونَ بَيْنَ قَ...,"['الْفَرْقُ', 'الثَّالِثُ', 'وَالثَّلَاثُونَ',...","['الفرق', 'الثالث', 'والثلاثون', 'بين', 'قاعدة...",الْفَرْقُ الثَّالِثُ وَالثَّلَاثُونَ بَيْنَ قَ...,"[[14, 6, 0, 6, 2], [14, 14, 14, 14, 4, 2], [0,...","[('الفرق', tensor([[-8.2446e-06, 7.6140e-01, ..."
2,قَوْلُهُ وَهُوَ أَيْ الْبَيْعُ بِالْمَعْنَى ال...,"['قَوْلُهُ', 'وَهُوَ', 'أَيْ', 'الْبَيْعُ', 'ب...","['قوله', 'وهو', 'أي', 'البيع', 'بالمعنى', 'الث...",قَوْلُهُ وَهُوَ أَيْ الْبَيْعُ بِالْمَعْنَى ال...,"[[0, 6, 2, 2], [0, 2, 0], [0, 6], [14, 6, 0, 6...","[('قوله', tensor([[-5.7860e-05, 7.6122e-01, ..."
3,إذْ الْمُقَابَلَةُ لَا تَصْدُقُ عَلَى الْعَقْد...,"['إذْ', 'الْمُقَابَلَةُ', 'لَا', 'تَصْدُقُ', '...","['إذ', 'المقابلة', 'لا', 'تصدق', 'على', 'العقد...",إذْ الْمُقَابَلَةُ لَا تَصْدُقُ عَلَى الْعَقْد...,"[[14, 6], [14, 6, 2, 0, 14, 0, 0, 2], [0, 14],...","[('إذ', tensor([[-2.3214e-05, 7.6133e-01, 4...."
4,وَقَدْ يُجْعَلُ كَلَامُهُ عَلَى حَذْفِ مُضَافٍ...,"['وَقَدْ', 'يُجْعَلُ', 'كَلَامُهُ', 'عَلَى', '...","['وقد', 'يجعل', 'كلامه', 'على', 'حذف', 'مضاف',...",وَقَدْ يُجْعَلُ كَلَامُهُ عَلَى حَذْفِ مُضَافٍ...,"[[0, 0, 6], [2, 6, 0, 2], [0, 0, 14, 2, 2], [0...","[('وقد', tensor([[-3.2933e-04, 7.6013e-01, 7..."


In [7]:
df = pd.read_csv("./processed/val.csv")

data = df["tokenized_cleaned"].apply(eval)
data = list(filter(lambda sent: len(sent) > 1, data))
id2word = {idx+1:word for idx, word in  enumerate(set(itertools.chain.from_iterable(data)))}
id2word[0] = '<PAD>'
print("id2word ...")

word2id = {word:idx for idx, word in id2word.items()}
print("word2id ...")

context_model = Model(hidden_size=256, n_classes=len(word2id.keys()), bidirectional=True).to(device)

X  = pd.DataFrame()


tokenizer = AutoTokenizer.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-ca')
embedder = AutoModel.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-ca')
X[0] = [tokenizer(sent[:-1], return_tensors="pt", padding=True) for sent in data]

print("starting embedding...")

tqdm.pandas()
train_data = X[0].progress_apply(lambda x: embedder(**x).last_hidden_state[:,1, :]) 
Y = [ [word2id[word] for word in sent[1:]] for sent in data]
print("starting training...")

train_dataset = Dataset(train_data, Y, np.zeros(768))

train(context_model, train_dataset, n_classes=len(word2id.keys()), batch_size=20, epochs=3, learning_rate=0.01)

pickle.dump(context_model, open("./models/context_model_bi.pkl", "wb"))

id2word ...
word2id ...
starting embedding...


  9%|▉         | 608/6492 [02:57<28:39,  3.42it/s]  


RuntimeError: [enforce fail at alloc_cpu.cpp:80] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1867776 bytes.

In [None]:
df = pd.read_csv("./processed/val.csv")

data = df["tokenized_diacritics_removed"].apply(eval)
data = data[1000: 1250]
data = list(filter(lambda sent: len(sent) > 1, data))
id2word = {idx+1:word for idx, word in  enumerate(set(itertools.chain.from_iterable(data)))}
id2word[0] = '<PAD>'
word2id = {word:idx for idx, word in id2word.items()}


X = [sent[:-1] for sent in data]
print("starting embedding...")

X = [ embedder(**tokenizer(sent, return_tensors="pt", padding=True, truncation=True, max_length=512)).last_hidden_state[:,1, :] for sent in X]
Y = [ [word2id[word] for word in sent[1:]] for sent in data]
print("starting training...")

train_dataset = Dataset(X, Y, np.zeros(768))

evaluate(context_model, train_dataset, batch_size=20)

In [None]:

pickle.dump(context_model, open("./models/context_model.pkl", "wb"))
pickle.dump(word2id, open("./models/word2id.pkl", "wb"))
pickle.dump(id2word, open("./models/id2word.pkl", "wb"))


# Tashkeel Model

In [None]:
class Dataset(torch.utils.data.Dataset):

  def __init__(self, x, e, y, pad=None):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    ##################### TODO: create two tensors one for x and the other for labels ###############################
    # padding all sentences and labels to have the same length
    # max_len = max([len(sentence) for sentence in x])
    # for i in range(len(x)):
      # x[i] = x[i] + [pad] * (max_len - len(x[i]))  
      # y[i] = y[i] + [14] * (max_len - len(y[i]))
    self.x = torch.tensor(x)
    self.e = torch.tensor(e)
    print(e.shape, self.e.shape)
    self.y = torch.tensor(y)
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################
    return len(self.x)
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.x[idx], self.e[idx], self.y[idx]
    ##########################################################################################

In [None]:
def load_data(data):
    diacritic2id = pickle.load(open("./assets/diacritic2id.pickle", "rb"))
    arabic_letters = list(pickle.load(open("./assets/arabic_letters.pickle", "rb")))

    data = data.apply(lambda sent: [[arabic_letters.index(letter) for letter in word] for word in sent])
    labels = df["diacritics"].apply(eval)

    data = data.explode().to_list()
    labels = labels.explode().to_list()
    train_data = zip(data, labels)

    train_data = list(filter(lambda sent: isinstance(sent[0], list), train_data))

    data, labels = zip(*train_data) 

    data = [list(word) for word in data]
    labels = [list(diacritic) for diacritic in labels]

    # Convert words and labels to numerical sequences
    max_word_len = max(len(word) for word in data)
    X = np.zeros((len(data), max_word_len, len(arabic_letters)+1), dtype=np.float32)
    y = np.zeros((len(data), max_word_len), dtype=np.int64)

    print(X.shape, y.shape)
    for i, word in enumerate(data):
        for j, char in enumerate(word):
            X[i, j, char] = 1
            y[i, j] = labels[i][j]


    train_data = Dataset(X, y)

    return train_data

In [None]:
df = pd.read_csv("./processed/val.csv")
data = df["tokenized_diacritics_removed"].apply(eval)

load_data(data)


In [None]:
context_model = Model(input_size=37, hidden_size=512, n_classes=15, bidirectional=True).to(device)

In [None]:
train(context_model, train_data, n_classes=len(diacritic2id.keys()), batch_size=100, epochs=10, learning_rate=0.005)

In [None]:
def predict(model, data):

    # (1) create the test data loader

    # GPU Configuration
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    # (2) disable gradients
    with torch.no_grad():
        data = torch.tensor(data).to(device)
        output, _ = model(data)

    
    ##################################################################################################
    return output.argmax(dim=-1).cpu().numpy()
    

In [None]:
sent = "الفرق الثالث والثلاثون بين قاعدة تقدم الحكم عل"
sent = sent.split()
sent = [[arabic_letters.index(letter) for letter in word] for word in sent]

# Convert words and labels to numerical sequences
max_word_len = max(len(word) for word in sent)
X = np.zeros((len(sent), max_word_len, len(arabic_letters)+1), dtype=np.float32)
y = np.zeros((len(sent), max_word_len), dtype=np.int64)

print(X.shape, y.shape)
for i, word in enumerate(sent):
    for j, char in enumerate(word):
        X[i, j, char] = 1
        # y[i, j] = labels[i][j]


pred = predict(context_model, X)

In [None]:
id2diacritic = {idx:diacritic for diacritic, idx in diacritic2id.items()}
gomla = ""
for i, word in enumerate(sent):
    for j, char in enumerate(word):
        gomla += arabic_letters[char] + id2diacritic[pred[i][j]]
        
    gomla += " "
gomla

In [None]:
pickle.dump(context_model, open("./models/tashkeel_model.pkl", "wb"))

In [None]:
data = pd.read_csv("./processed/val.csv")
data.head()

In [None]:
labels = data["diacritics"].apply(eval)
data = data["word_with_context"].apply(lambda x: eval(x))

In [None]:
def load_data(data, labels):
    diacritic2id = pickle.load(open("./assets/diacritic2id.pickle", "rb"))
    arabic_letters = list(pickle.load(open("./assets/arabic_letters.pickle", "rb")))

    data = data.apply(lambda sent: [([arabic_letters.index(letter) for letter in word[0]], word[1]) for word in sent])

    data = data.explode().to_list()
    labels = labels.explode().to_list()
    train_data = zip(data, labels)
    
    cleaned_data = []
    for word in train_data:
        try:
            if isinstance(word[0][0], list):
                cleaned_data.append(word)
        except:
            pass
    # train_data = list(filter(lambda sent: tensor(sent[0][0]))), train_data))
    data, labels = zip(*cleaned_data)

    data = [(list(word[0]),word[1]) for word in data]
    labels = [list(diacritic) for diacritic in labels]

    # Convert words and labels to numerical sequences
    max_word_len = max(len(word[0]) for word in data)
    X = torch.zeros((len(data), max_word_len, len(arabic_letters)+1), dtype=torch.float32).cpu()
    y = torch.zeros((len(data), max_word_len), dtype=torch.int64).cpu()

    for i, word in enumerate(data):
        for j, char in enumerate(word):
            try:
                X[i, j, tensor(char, dtype=torch.long)] = 1
                y[i, j] = labels[i][j]
            except Exception as e:
                y[i, j] = 14


    E = torch.cat([word[1] for word in data])

    train_data = Dataset(X, E, y)

    return train_data    

In [None]:
train_data = load_data(data, labels)

In [None]:
train_data[0][1].shape

In [None]:
tashkeel_model = Model(input_size=37, hidden_size=128, n_classes=15).to(device)
tashkeel_model

In [None]:
def train(model, train_dataset, n_classes, batch_size=512, epochs=5, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """

  ############################## TODO: replace the Nones in the following code ##################################
  
  # (1) create the dataloader of the training set (make the shuffle=True)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  # (2) make the criterion cross entropy loss
  criterion = torch.nn.CrossEntropyLoss()

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, hidden, train_label in tqdm(train_dataloader):
      # (4) move the train input to the device
      train_input = train_input.to(device)

      # (5) move the train label to the device
      train_label = train_label.to(device)


      # (6) do the forward pass
      output, _ = model(train_input, (hidden.unsqueeze(0), torch.zeros_like(hidden.unsqueeze(0))))
      output = output.to(device)
      
      # (7) loss calculation (you need to think in this part how to calculate the loss correctly)
      # -1 is ignore 
      batch_loss = criterion(output.view(-1, n_classes), train_label.view(-1))
    

      # (8) append the batch loss to the total_loss_train
      total_loss_train += batch_loss.item()
      
      # (9) calculate the batch accuracy (just add the number of correct predictions)
      acc = (output.argmax(dim=-1) == train_label).sum().item()
      total_acc_train += acc

      # (10) zero your gradients
      optimizer.zero_grad()

      # (11) do the backward pass
      batch_loss.backward(retain_graph=True)

      # (12) update the weights with your optimizer
      optimizer.step()
      
    # epoch loss
    epoch_loss = total_loss_train / len(train_dataset)

    # (13) calculate the accuracy
    epoch_acc = total_acc_train / (len(train_dataset) * train_dataset[0][0].shape[0])

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')

  ##############################################################################################################

In [None]:
class Model(nn.Module):
  def __init__(self, input_size=768, hidden_size=50, n_classes=100, bidirectional=False):
    """
    The constructor of our NER model
    Inputs:
    - vacab_size: the number of unique words
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(Model, self).__init__()
    ####################### TODO: Create the layers of your model #######################################
    # (1) Create the embedding layer
    

    # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
    self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=bidirectional)

    # (3) Create a linear layer with number of neorons = n_classes
    self.linear = nn.Linear(hidden_size * (2 if bidirectional else 1), n_classes)
    #####################################################################################################

  def forward(self, X, hidden=None):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """

    final_output = None
    ######################### TODO: implement the forward pass ####################################
    final_output, hidden = self.lstm(X, hidden)
    final_output = self.linear(final_output)
    ###############################################################################################
    return final_output, hidden

In [None]:
train(tashkeel_model, train_data, n_classes=15, batch_size=200, epochs=10, learning_rate=0.002)

In [None]:
pickle.dump(tashkeel_model, open("./models/tashkeel_model_context.pkl", "wb"))

In [None]:
import pickle
import torch.nn as nn


class Model(nn.Module):
    def __init__(self, input_size=768, hidden_size=50, n_classes=100, bidirectional=False):
        """
        The constructor of our NER model
        Inputs:
        - vacab_size: the number of unique words
        - embedding_dim: the embedding dimension
        - n_classes: the number of final classes (tags)
        """
        super(Model, self).__init__()

        self.lstm = nn.LSTM(input_size, hidden_size,
                            batch_first=True, bidirectional=bidirectional)

        self.linear = nn.Linear(
            hidden_size * (2 if bidirectional else 1), n_classes)

    def forward(self, X):
        """
        This function does the forward pass of our model
        Inputs:
        - sentences: tensor of shape (batch_size, max_length)

        Returns:
        - final_output: tensor of shape (batch_size, max_length, n_classes)
        """

        final_output, hidden = self.lstm(X)
        final_output = self.linear(final_output)

        return final_output, hidden




class Context_Model(nn.Module):
    def __init__(self, input_size=768, hidden_size=50, n_classes=100, bidirectional=False):
        """
        The constructor of our NER model
        Inputs:
        - vacab_size: the number of unique words
        - embedding_dim: the embedding dimension
        - n_classes: the number of final classes (tags)
        """
        super(Context_Model, self).__init__()

        self.lstm = nn.LSTM(input_size, hidden_size,
                            batch_first=True, bidirectional=bidirectional)

        self.linear = nn.Linear(
            hidden_size * (2 if bidirectional else 1), n_classes)

    def forward(self, X):
        """
        This function does the forward pass of our model
        Inputs:
        - sentences: tensor of shape (batch_size, max_length)

        Returns:
        - final_output: tensor of shape (batch_size, max_length, n_classes)
        """

        final_output, hidden = self.lstm(X)
        final_output = self.linear(final_output)

        return final_output, hidden




In [None]:
context_model = pickle.load(open("./models/context_model.pkl", "rb"))
new_context_model = Context_Model()
new_context_model.__dict__ = context_model.__dict__
# new_context_model.load_state_dict(context_model.state_dict())

In [None]:
pickle.dumps(new_context_model, open("./models/context_model_new.pkl", "wb"))

In [None]:
type(new_context_model)