In [1]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
%cd ./drive/My Drive/Narrator Disambiguation

Mounted at /content/drive
/content/drive/My Drive/Narrator Disambiguation


In [2]:
!pip install farasapy
!pip install pyarabic
!git clone https://github.com/aub-mind/arabert
!pip install transformers

fatal: destination path 'arabert' already exists and is not an empty directory.


In [3]:
import torch
import math
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import sys

In [4]:
from transformers import BertTokenizer, BertModel, BertConfig
from arabert.preprocess import ArabertPreprocessor

In [5]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Read the test sanads, one per line. The context manager closes the file even
# when reading fails; the encoding is explicit because the text is Arabic.
with open('Large dataset/sanads.test', 'r', encoding='utf-8') as f:
  sanads = f.readlines()

In [7]:
# Load labels: every line holds comma-separated integer labels.
# The context manager closes the file even if reading fails.
with open('Large dataset/labels.test', 'r', encoding='utf-8') as f:
  lines = f.readlines()
labels_no_padding = [[int(tok) for tok in line.split(',')] for line in lines]

# All labels in file order, as one flat list
labels_flat = [l for labels_ls in labels_no_padding for l in labels_ls]

# Pad labels: every row is extended to 10 entries with padding_label
padding_label = 18298
labels = [li+[padding_label]*(10-len(li)) for li in labels_no_padding]
labels = torch.tensor(labels)

In [8]:
n_sanads = len(sanads)  # number of lines read from the sanads file
max_l=300  # fixed (padded) token length per sanad used by the tuned-model preprocessing
emb_size = 768  # input feature size of the classification layers
n_categories = 18299  # output size of the classification layers; padding_label (18298) equals n_categories - 1
print(n_sanads)

27056


In [9]:
# AraBERT v2 preprocessor, tokenizer and encoder. output_hidden_states=True is
# requested because later cells read the hidden states from the model output.
arabert_prep = ArabertPreprocessor(model_name='bert-base-arabertv2')
tokenizer = BertTokenizer.from_pretrained('aubmindlab/bert-base-arabertv2')
model = BertModel.from_pretrained('aubmindlab/bert-base-arabertv2',output_hidden_states=True)
# Inference only: eval() puts the dropout layers in evaluation mode
model.eval()
model.to(device)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(64000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

# **Frozen AraBERT**

In [10]:
# Calculate contextual embeddings for each name in the loaded sanads
# (the sanads were read from 'Large dataset/sanads.test').
# Result: names_embed, a flat list with one averaged embedding per name.

names_embed = []
c=0
for s in sanads:
  # Progress report every 1000 sanads
  if c % 1000 == 0:
    print("Finished ", c)
  
  c+=1

  # Preprocessing and Tokenization
  sanad_prep = arabert_prep.preprocess(s)
  sanad_wrapped = tokenizer.cls_token + sanad_prep + tokenizer.sep_token
  tokens = tokenizer.tokenize(sanad_wrapped)
  indexed_tokens = torch.LongTensor([tokenizer.convert_tokens_to_ids(tokens)]).to(device)

  # Run sanad through model
  with torch.no_grad(): 
    outputs = model(indexed_tokens)
    # Last entry of outputs[2], first (only) sequence of the batch: one vector per token.
    # Presumably outputs[2] is the tuple of hidden states enabled by output_hidden_states=True.
    embs = outputs[2][-1][0]
  
  # Calculate start index of each name in the sanad
  # names are separated by the word فاصل
  # A separator only counts when the following token does not start with '+' or '#'.
  indices = [i for i, x in enumerate(tokens) if x == 'فاصل' and (tokens[i+1][0] != '+') and (tokens[i+1][0] != '#')]

  # name_embs collects one row per word of the current name;
  # l counts the pieces accumulated into the word currently being built.
  name_embs = torch.tensor([]).to(device)
  l=1
  for i,t in enumerate(tokens):
    # Skip the first and last token (the wrapped cls / sep tokens)
    if i == 0 or i == len(tokens) - 1:
      continue
    if i in indices:
      # Separator reached: store the mean over the words of the finished name
      names_embed += [torch.mean(name_embs,0)]
      name_embs = torch.tensor([]).to(device)
      l=1

    # Tokens that have '+' in them are prefixes and are not included
    elif '+' not in t:
      # For OOV average the embeddings of the word parts
      if t.startswith('##'):
        if len(name_embs) != 0:
          # Add the continuation piece onto the current word's running sum
          name_embs[-1] += embs[i]
        else:
          name_embs = torch.cat((name_embs,embs[i].unsqueeze(0)),0)

        l+=1
      else:
        # A new word starts: turn the previous word's running sum into an average
        if l > 1:
          name_embs[-1] /= l
          l = 1
        name_embs = torch.cat((name_embs,embs[i].unsqueeze(0)),0)
        # NOTE(review): the division above only runs when another word of the same
        # name follows. If a name ends with '##' pieces, its last word stays a sum
        # when torch.mean is taken. Looks unintended; confirm against the code that
        # produced the training features before changing it.

  # The last name of the sanad has no trailing separator
  names_embed += [torch.mean(name_embs,0)]
    

print(len(names_embed))
print(len(names_embed[0]))

Finished  0
Finished  1000
Finished  2000
Finished  3000
Finished  4000
Finished  5000
Finished  6000
Finished  7000
Finished  8000
Finished  9000
Finished  10000
Finished  11000
Finished  12000
Finished  13000
Finished  14000
Finished  15000
Finished  16000
Finished  17000
Finished  18000
Finished  19000
Finished  20000
Finished  21000
Finished  22000
Finished  23000
Finished  24000
Finished  25000
Finished  26000
Finished  27000
150904
768


In [11]:
def batch_iter(names, labels, batch_size, shuffle=True):
    """Yield ``(names_batch, labels_batch)`` pairs moved to ``device``.

    names      -- indexable collection of equally shaped tensors
    labels     -- indexable collection of labels aligned with ``names``
    batch_size -- number of items per batch (the last batch may be smaller)
    shuffle    -- when true, visit the items in an order shuffled with ``np.random``
    """
    n_items = len(names)
    n_batches = math.ceil(n_items / batch_size)

    order = list(range(n_items))
    if shuffle:
        np.random.shuffle(order)

    for batch_no in range(n_batches):
        start = batch_no * batch_size
        picked = order[start:start + batch_size]

        # Stack the selected embeddings and collect their labels
        names_b = torch.stack([names[j] for j in picked])
        labels_b = torch.tensor([labels[j] for j in picked])

        yield names_b.to(device), labels_b.to(device)


In [12]:
# Build the linear classifier and restore its saved weights
model_save_path = 'Models/frozen-AraBERT-large.bin'
classifier = nn.Linear(emb_size, n_categories).to(device)
classifier.load_state_dict(torch.load(model_save_path, map_location=device))

<All keys matched successfully>

In [13]:
from sklearn.metrics import f1_score

def get_f1(model, X, y):
  """Return ``(micro_f1, macro_f1)`` of ``model`` on the embeddings ``X``.

  model -- callable mapping a batch of embeddings to per-class scores
  X     -- sequence of embedding tensors, one per name
  y     -- sequence of integer labels aligned with ``X``

  Uses the two-collection ``batch_iter(names, labels, ...)``; a later cell
  redefines ``batch_iter`` with a different signature, so run this before it.
  """
  b_size = 500

  target = []
  predicted = []
  # Evaluation only: no autograd graph is needed for the forward passes
  with torch.no_grad():
    for xs, ts in batch_iter(X, y, b_size, shuffle=False):
      zs = model(xs)
      # Index of the highest score per row, shaped like the targets
      pred = zs.max(1, keepdim=True)[1].view_as(ts)
      target += ts.tolist()
      predicted += pred.tolist()

  score1 = f1_score(target, predicted, average='micro')
  score2 = f1_score(target, predicted, average='macro')

  return score1,score2

In [14]:
# (micro-F1, macro-F1) of the loaded linear classifier over the flat list of name embeddings
get_f1(classifier, names_embed, labels_flat)

(0.7752876000636166, 0.6800067932157435)

In [15]:
def get_SER(model, X, y):
  """Return the sanad error rate of ``model`` in percent.

  A sanad counts as correct only when every one of its names is predicted
  correctly.

  model -- callable mapping a (n_names, emb) tensor to per-class scores
  X     -- flat sequence of name embeddings, ordered sanad by sanad
  y     -- one list of integer labels per sanad; the i-th label overall
           belongs to ``X[i]``
  """
  correct, total = 0, len(y)

  i = 0
  # Evaluation only: skip autograd bookkeeping for the forward passes
  with torch.no_grad():
    for labels in y:
      xs = []
      for _ in labels:
        xs.append(X[i])
        i += 1
        # Progress report every 2000 names
        if i % 2000 == 0:
          print(i)

      xs = torch.stack(xs)
      # Keep the targets on the same device as the embeddings instead of
      # depending on a global device variable
      ts = torch.tensor(labels, device=xs.device)
      zs = model(xs)
      pred = zs.max(1, keepdim=True)[1]

      # The product is 1 only if every name in this sanad matched
      correct += pred.eq(ts.view_as(pred)).prod().item()

  return 100 - 100*(correct / total)

In [16]:
# Sanad error rate (%) of the loaded linear classifier; prints a progress counter every 2000 names
get_SER(classifier, names_embed, labels_no_padding)

2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
46000
48000
50000
52000
54000
56000
58000
60000
62000
64000
66000
68000
70000
72000
74000
76000
78000
80000
82000
84000
86000
88000
90000
92000
94000
96000
98000
100000
102000
104000
106000
108000
110000
112000
114000
116000
118000
120000
122000
124000
126000
128000
130000
132000
134000
136000
138000
140000
142000
144000
146000
148000
150000


73.44766410408042

# **Tuned**

In [17]:
# Preprocessing: turn every sanad into a fixed-width row of token ids plus an
# attention mask, and remember where the name separators sit.
padding_token = tokenizer.pad_token
indexed_tokens = torch.zeros((n_sanads, max_l), dtype=torch.long)
attention_mask = torch.zeros((n_sanads, max_l), dtype=torch.long)
an_indices = []

co = 0
for s_num in range(n_sanads):
  # Progress report every 1000 sanads
  co = s_num + 1
  if co % 1000 == 0:
    print(co)

  sanad_connected = arabert_prep.preprocess(sanads[s_num])
  sanad_wrapped = tokenizer.cls_token + sanad_connected + tokenizer.sep_token

  tokenized_text = tokenizer.tokenize(sanad_wrapped)
  n_tokens = len(tokenized_text)
  tokenized_text_padded = tokenized_text + [padding_token] * (max_l - n_tokens)

  # Separator positions (a separator followed by a '+' token is ignored),
  # closed off with the token count as an end marker
  indices = [pos for pos, tok in enumerate(tokenized_text)
             if tok == "فاصل" and tokenized_text[pos + 1][0] != '+']
  indices.append(n_tokens)
  an_indices.append(indices)

  token_id_row = tokenizer.convert_tokens_to_ids(tokenized_text_padded)
  indexed_tokens[s_num] = torch.tensor(token_id_row, dtype=torch.long)
  # Real tokens get 1; the row was created as zeros, so the padding stays 0
  attention_mask[s_num, :n_tokens] = 1


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000


In [18]:
# Calculate lengths of names in the sanads: one row of 10 divisors per sanad
names_lens = []
for idxs in an_indices:
  first = [idxs[0]]
  # Distance between consecutive boundaries, minus one
  gaps = [nxt - cur - 1 for cur, nxt in zip(idxs, idxs[1:])]
  # Unused slots get length 1
  filler = [1] * (10 - len(idxs))
  names_lens.append(first + gaps + filler)

names_lens = torch.tensor(names_lens)
print(names_lens.shape)


torch.Size([27056, 10])


In [19]:
# Generate masks to extract names from sanads
# For every sanad this builds 10 rows of max_l zeros/ones, one row per name slot;
# the trailing unsqueeze(-1) adds a last axis of size 1.
# NOTE(review): indices[-2] needs at least two entries per sanad — confirm every sanad has a separator.
mask_tensor = torch.tensor([
         # First name: position 0 is excluded, ones up to (not including) the first boundary
         [[0] + [1]*(indices[0]-1) + [0]*(max_l-indices[0])] +
         # Middle names: ones from one boundary up to (not including) the next
         [[0]*indices[i] + [1]*(indices[i+1]-indices[i]) + [0]*(max_l-indices[i+1]) for i in range(len(indices)-2)] +
         # Last name: ones from the last separator up to the final boundary
         [[0]*indices[-2] + [1]*(indices[-1]-indices[-2]) + [0]*(max_l-indices[-1]-1) + [0]] +
         # Unused name slots: all zeros
         [[0]*max_l for j in range(10 -len(indices))]
         
 for indices in an_indices]).unsqueeze(-1)

print(mask_tensor.shape)

torch.Size([27056, 10, 300, 1])


In [20]:
def batch_iter(token_ids, attention_mask, names_masks, names_lens, labels, batch_size, shuffle=True):
    """Yield aligned mini-batches of the five input tensors, moved to ``device``.

    Every argument except ``batch_size`` and ``shuffle`` is indexed along its
    first axis with the same positions. Each yielded item is the tuple
    ``(token_ids, attention_mask, names_masks, names_lens, labels)`` for one
    batch; the last batch may be smaller than ``batch_size``.
    """
    n_rows = token_ids.shape[0]
    n_batches = math.ceil(n_rows / batch_size)

    order = list(range(n_rows))
    if shuffle:
        np.random.shuffle(order)

    sources = (token_ids, attention_mask, names_masks, names_lens, labels)
    for batch_no in range(n_batches):
        start = batch_no * batch_size
        picked = order[start:start + batch_size]

        # Gather the selected rows of every source first, then move them to the device
        gathered = [torch.stack([src[j] for j in picked]) for src in sources]
        yield tuple(part.to(device) for part in gathered)


In [21]:
# Model

class ClassifyNarrator(nn.Module):
    """AraBERT encoder plus a linear layer that classifies every name slot of a sanad.

    emb_size    -- size of the token embeddings fed to the linear layer
    output_size -- number of classes
    max_l       -- padded token length of every input sequence
    """
    def __init__(self, emb_size, output_size, max_l):
       
        super(ClassifyNarrator, self).__init__()

        self.max_l = max_l
        self.emb_size = emb_size

        self.bert = BertModel.from_pretrained('aubmindlab/bert-base-arabertv2',output_hidden_states=True)
        self.i2o = nn.Linear(emb_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, tokens_tensor, attention_mask, names_masks, names_lens):
        """Return log-probabilities of shape (batch * n_names, output_size).

        tokens_tensor  -- (batch, max_l) token ids
        attention_mask -- (batch, max_l) attention mask passed to the encoder
        names_masks    -- (batch, n_names, max_l, 1) 0/1 masks selecting each name's tokens
        names_lens     -- (batch, n_names) divisors used to average each name's tokens
        """
        batch_size = tokens_tensor.shape[0]
        # Number of name slots comes from the mask rather than a hard-coded 10,
        # so other slot counts work too.
        n_names = names_masks.shape[1]

        brt_outputs = self.bert(tokens_tensor, attention_mask=attention_mask)
        # Last entry of the hidden-state outputs: (batch, max_l, emb_size)
        embs = brt_outputs[2][-1]

        # Broadcast the embeddings against every name mask, zeroing tokens outside the name
        names_emb = torch.mul(embs.unsqueeze(1), names_masks).view(batch_size*n_names,self.max_l,self.emb_size)

        # Sum over the token axis and divide by the supplied lengths
        emb_sum = torch.sum(names_emb,1)
        emb_avg = torch.div(emb_sum, names_lens.view(batch_size*n_names,1))

        output = self.i2o(emb_avg) 
        
        output = self.softmax(output)        

        return output


In [22]:
# Build the AraBERT-based classifier and restore its saved weights
model_save_path = 'Models/tuned-AraBERT-large.bin'
classifier = ClassifyNarrator(emb_size, n_categories, max_l).to(device)
classifier.load_state_dict(torch.load(model_save_path, map_location=device))

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [23]:
from sklearn.metrics import f1_score

def get_f1(model, token_ids, attention_mask, names_masks,names_lens, y):
  """Return ``(micro_f1, macro_f1)`` of ``model`` over all non-padding labels.

  token_ids, attention_mask, names_masks, names_lens -- model inputs, batched
      along their first axis by ``batch_iter``
  y -- label tensor aligned with the inputs; entries equal to
       ``padding_label`` are excluded from the scores

  The model is put in eval mode for the computation and switched back to
  training mode afterwards if it was training, even when an error occurs.
  """
  was_training = model.training
  model.eval()

  b_size = 400

  target = []
  predicted = []

  try:
    with torch.no_grad():
      # No shuffling: F1 does not depend on the order, and skipping the
      # shuffle leaves the global NumPy random state untouched.
      for token_ids_b, attention_mask_b, names_masks_b,names_lens_b, ts in batch_iter(token_ids,
                                                                         attention_mask,
                                                                         names_masks,
                                                                         names_lens,
                                                                         y,
                                                                         b_size,
                                                                         shuffle=False):

        zs = model(token_ids_b, attention_mask_b, names_masks_b, names_lens_b)
        # Flatten targets and predictions without assuming 10 labels per row
        ts = ts.view(-1)
        pred = zs.max(1, keepdim=True)[1].view_as(ts)

        keep = ts != padding_label
        target += ts[keep].tolist()
        predicted += pred[keep].tolist()
  finally:
    if was_training:
      model.train()

  score1 = f1_score(target, predicted, average='micro')
  score2 = f1_score(target, predicted, average='macro')

  return score1,score2

In [24]:
# (micro-F1, macro-F1) of the loaded ClassifyNarrator model, ignoring padding labels
get_f1(classifier,indexed_tokens,attention_mask,mask_tensor,names_lens,labels)

(0.835471558076658, 0.7884138044909654)

In [25]:
def get_accuracy(model, token_ids, attention_mask, names_masks,names_lens, y):
  """Return the sanad error rate in percent (an error rate, despite the name).

  A row of ``y`` counts as correct when every entry that is not
  ``padding_label`` is predicted correctly; the result is
  ``100 - 100 * correct_rows / len(y)``.

  The model is put in eval mode for the computation and switched back to
  training mode afterwards if it was training, even when an error occurs.
  """
  was_training = model.training
  model.eval()

  b_size = 400

  correct, total = 0, len(y)
  try:
    with torch.no_grad():
      for token_ids_b, attention_mask_b, names_masks_b,names_lens_b, ts in batch_iter(token_ids,
                                                                       attention_mask,
                                                                       names_masks,
                                                                       names_lens,
                                                                       y,
                                                                       b_size,
                                                                       shuffle=False):

        zs = model(token_ids_b, attention_mask_b, names_masks_b, names_lens_b)
        # Shape the predictions like the targets instead of assuming 10 labels per row
        pred = zs.max(1, keepdim=True)[1].view_as(ts)

        # A row is correct when each slot is either right or padding
        row_ok = torch.logical_or(pred.eq(ts), ts==padding_label).all(dim=1)
        correct += row_ok.sum().item()
  finally:
    if was_training:
      model.train()

  return 100 - 100*(correct / total)

In [26]:
# Despite the function name, this value is 100 minus the percentage of fully correct sanads (an error rate)
get_accuracy(classifier,indexed_tokens,attention_mask,mask_tensor,names_lens,labels)

60.603932584269664