In [1]:
import torch
import torch.nn as nn
from torch import tensor
import numpy as np
from tqdm import tqdm
import pandas as pd
import itertools
from transformers import AutoTokenizer, AutoModel
import pickle
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

# Dataset

In [2]:
class Dataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad=None):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    # padding all sentences and labels to have the same length
    x = list(x)
    y = list(y)
    max_len = max([len(sentence) for sentence in x])
    for i in range(len(x)):
      x[i] = torch.cat((tensor(x[i]), torch.zeros((max_len - len(x[i]), 53))))
      y[i] = tensor(y[i] + [15] * (max_len - len(y[i])))
    self.x = torch.stack(x)
    self.y = torch.stack(y)

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    return len(self.x)

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    return self.x[idx], self.y[idx]

In [3]:
def load_data(data, labels):

    data = data.explode().to_list()
    labels = labels.explode().to_list()
    train_data = zip(data, labels)
    
    cleaned_data = []
    for word in train_data:
        try:
            if isinstance(word[0], list):
                cleaned_data.append(word)
        except:
            pass
    data, labels = zip(*cleaned_data)

    train_data = Dataset(data, labels)

    return train_data    

# Model 

#### Model 1

In [4]:
class Model(nn.Module):
  def __init__(self, input_size=768, hidden_size=50, n_classes=100, bidirectional=False):
    super(Model, self).__init__()

    self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=bidirectional)

    self.linear = nn.Linear(hidden_size * (2 if bidirectional else 1), n_classes)

  def forward(self, X, hidden=None):

    final_output, hidden = self.lstm(X, hidden)
    final_output = self.linear(final_output)

    return final_output, hidden

In [5]:
def train(model, train_dataset, n_classes, batch_size=512, epochs=5, learning_rate=0.01):
  # (1) create the dataloader of the training set (make the shuffle=True)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  # (2) make the criterion cross entropy loss
  criterion = torch.nn.CrossEntropyLoss()

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):
      # (4) move the train input to the device
      train_input = train_input.to(device)

      # (5) move the train label to the device
      train_label = train_label.to(device)


      # (6) do the forward pass
      output, _ = model(train_input)
      output = output.to(device)
      
      # (7) loss calculation (you need to think in this part how to calculate the loss correctly)
      # -1 is ignore 
      batch_loss = criterion(output.view(-1, n_classes), train_label.view(-1))
    

      # (8) append the batch loss to the total_loss_train
      total_loss_train += batch_loss.item()
      
      # (9) calculate the batch accuracy (just add the number of correct predictions)
      acc = (output.argmax(dim=-1) == train_label).sum().item()
      total_acc_train += acc

      # (10) zero your gradients
      optimizer.zero_grad()

      # (11) do the backward pass
      batch_loss.backward(retain_graph=True)

      # (12) update the weights with your optimizer
      optimizer.step()
      
    # epoch loss
    epoch_loss = total_loss_train / len(train_dataset)

    # (13) calculate the accuracy
    epoch_acc = total_acc_train / (len(train_dataset) * train_dataset[0][0].shape[0])

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')

  ##############################################################################################################

In [6]:
def eval_letter_model(model, eval_dataset, batch_size=512):

    test_dataloader = torch.utils.data.DataLoader(
        eval_dataset, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()


    total_acc_test = 0

    with torch.no_grad():
        for test_input, test_label in tqdm(test_dataloader):

            test_label = test_label.to(device)


            test_input = test_input.to(device)

            output, _ = model(test_input)


            acc = (output.argmax(dim=-1) == test_label).sum().item()
            total_acc_test += acc

    print(
        f'\nTest Accuracy: {total_acc_test/(len(eval_dataset) * eval_dataset[0][0].shape[0])}')


#### Training

In [7]:
data = pd.read_csv("./processed/train_shwya.csv")

labels = data["diacritics"].apply(eval)
data = data["features"].apply(eval)


In [8]:
print("Creating Dataset...")
train_data = load_data(data, labels)


Creating Dataset...


In [14]:
tashkeel_model = Model(input_size=36+15+2, hidden_size=512, n_classes=16).to(device)


train(tashkeel_model, train_data, n_classes=16, batch_size=200, epochs=3, learning_rate=0.05)

100%|██████████| 30/30 [00:01<00:00, 29.48it/s]


Epochs: 1 | Train Loss: 0.008321050292980282         | Train Accuracy: 0.6466409136714814



100%|██████████| 30/30 [00:00<00:00, 105.39it/s]


Epochs: 2 | Train Loss: 0.0031591397946338184         | Train Accuracy: 0.7668122270742358



100%|██████████| 30/30 [00:00<00:00, 96.95it/s]

Epochs: 3 | Train Loss: 0.002592827024818628         | Train Accuracy: 0.823782331205912






In [15]:
train(tashkeel_model, train_data, n_classes=16, batch_size=500, epochs=1, learning_rate=0.05)


100%|██████████| 12/12 [00:00<00:00, 21.83it/s]

Epochs: 1 | Train Loss: 0.0018052597813931414         | Train Accuracy: 0.7711454484380249






In [142]:
pickle.dump(tashkeel_model, open("./models/demo.pickle", "wb"))

In [None]:
# tashkeel_model = pickle.load(open("./models/demo.pickle", "rb"))
# tashkeel_model

In [None]:
data = pd.read_csv("./processed/train_sample.csv")

labels = data["diacritics"].apply(eval)
data = data["features"].apply(eval)

print("Creating Dataset...")
train_data = load_data(data, labels)

eval_letter_model(tashkeel_model, train_data, batch_size=500)

#### Prediction

In [7]:
class Pred_Dataset(torch.utils.data.Dataset):

  def __init__(self, x, pad=None):
    x = list(x)
    max_len = max([len(sentence) for sentence in x])
    for i in range(len(x)):
      x[i] = torch.cat((tensor(x[i], dtype=torch.float32), torch.zeros((max_len - len(x[i]), 53))))
    self.x = torch.stack(x)

  def __len__(self):
    return len(self.x)

  def __getitem__(self, idx):
    return self.x[idx]

In [8]:
def load_pred_data(data):

    data = data.explode().to_list()
    
    cleaned_data = []
    for word in data:
        try:
            if isinstance(word[0], list):
                cleaned_data.append(word)
        except:
            pass
    data = cleaned_data

    train_data = Pred_Dataset(data)

    return train_data    

In [9]:
def pred_letter_model(model, eval_dataset):

    test_dataloader = torch.utils.data.DataLoader(
        eval_dataset, batch_size=1)

    out_preds = []
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    with torch.no_grad():
        for test_input in tqdm(test_dataloader):

            test_input = test_input.to(device)

            output, _ = model(test_input)

            outputs = output.argmax(dim=-1)

            for i in range(len(test_input[0])):
                if test_input[0][i].sum() > 0:
                    out_preds.append(outputs[0][i].item())

    return out_preds

In [27]:
data = pd.read_csv("./processed/testing.csv")

data = data["features"].apply(eval)
test_data = load_pred_data(data)



In [35]:
predictions = pred_letter_model(tashkeel_model, test_data)
len(predictions)

100%|██████████| 104788/104788 [05:01<00:00, 347.23it/s]


417359

In [36]:
submission = pd.DataFrame(predictions, columns=["label"])
submission["ID"] = submission.index
submission[submission.label == 15]


Unnamed: 0,label,ID
161,15,161
7027,15,7027
7058,15,7058
7682,15,7682
7757,15,7757
...,...,...
398865,15,398865
404551,15,404551
415906,15,415906
415979,15,415979


In [213]:
submission.to_csv("./submission/test6.csv", index=False)
# submission.head()

## Demo

In [20]:
from code.preprocessing import PreProcess_sent

input_sent = "قوله ولا تكره ضيافته"

preprocessed, tokenized = PreProcess_sent(input_sent)

test_data = load_pred_data(preprocessed)

tashkeel_model = pickle.load(open("./models/full_accuracy.pickle", "rb"))

predictions = pred_letter_model(tashkeel_model, test_data)
len(predictions)

100%|██████████| 4/4 [00:00<00:00, 210.95it/s]


17

In [21]:
diacritic2id = pickle.load(open("./assets/diacritic2id.pickle", "rb"))
id2diacritic = {v: k for k, v in diacritic2id.items()}
tokenized_one_arr = tokenized.explode().to_list()

sentence_diacritized = ""
i = 0
for token in tokenized_one_arr:
    for letter in token:
        sentence_diacritized += letter + id2diacritic[predictions[i]]
        i += 1
    sentence_diacritized += " "

sentence_diacritized

'قَوَلهِ وَلاَ تَكْرٌهِ ضَيْاَفِتُّهُ '