### Imports :

In [1]:
import pandas as pd

from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn import TransformerDecoder, TransformerDecoderLayer
import torch.nn.functional as F

import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np

import math
import random

import os
import re
from tqdm import tqdm

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset, Sampler

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

import time

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd drive/MyDrive/TOUNSI

/content/drive/MyDrive/TOUNSI


In [4]:
seed = 99

## Setting the seed :

In [5]:
def set_seed():
    """Set seed for reproducibility.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed()

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
device

'cuda'

## Importing data :

In [8]:
data_path = "Code/data/external/transliteration/dataset.xlsx"

In [9]:
xls = pd.ExcelFile("Code/data/external/transliteration/dataset.xlsx")
dataset = pd.read_excel(xls, "Sheet1")

known = dataset[dataset.from_source == True]

dataset = dataset[["arabizi", "arabic", "from_source"]]
dataset.columns = ["Arabize", "Arabic", "from_source"]

In [10]:
dataset

Unnamed: 0,Arabize,Arabic,from_source
0,w,و,True
1,ya,يا,True
2,fi,في,True
3,rabi,ربي,True
4,el,ال,True
...,...,...,...
16849,2ejilan,آجلا,False
16850,elhewya,الهاوية,False
16851,5altin,خالطين,False
16852,nadhra,نظرة,False


## Preprocessing, Tokenisation, Spliting :

In [11]:
# By using known, the transliteration function can check if a word is already in the dictionary and directly retrieve the Arabic version. This saves the model from recomputing the transliteration,
# which is especially useful for common words or frequently repeated text.
# It also improves accuracy by avoiding potential errors from the model on words it has previously handled correctly.
known = known[["arabizi", "arabic"]].set_index("arabizi", drop=True).arabic.to_dict()
known_idx = list(known.keys())

In [12]:
# Calculate the maximum length of the Arabizi text in the dataset : This will help set the maximum input sequence length for the model
in_max = dataset.apply(lambda x: len(str(x.Arabize)), axis=1).max()

# Calculate the maximum length of the "Arabic" text in the dataset : The "+ 2" accounts for the sos and eos tokens
out_max = dataset.apply(lambda x: len(x.Arabic), axis=1).max() + 2

# Define token values for padding, end of sequence, and start of sequence
pad_token = 0  # Token used to pad sequences to the same length
eos_token = 2  # Token indicating the end of a sequence
sos_token = 1  # Token indicating the start of a sequence

In [13]:
def preprocess(a):

    x = a.copy()

    def filter_letters_arabizi(word):

        word = word.replace("$", "s")
        word = word.replace("å", "a")
        word = word.replace("é", "e")
        word = word.replace("ê", "e")
        word = word.replace("ÿ", "y")
        word = word.replace("ą", "a")
        word = word.replace("ī", "i")
        word = word.replace("\n", "")
        word = word.replace("′", "'")

        return word

    x.Arabize = filter_letters_arabizi(str(x.Arabize))
    x.Arabic = x.Arabic

    return x

In [14]:
dataset[["Arabize","Arabic"]] = dataset[["Arabize","Arabic"]].apply(preprocess, axis=1)

In [15]:
# Create a set of all unique tokens (words) in the Arabizi (input) text, converted to lowercase
in_tokens = set(" ".join(dataset.Arabize.values.tolist()).lower())

# Create a dictionary mapping each unique token to an integer, starting from 1 (0 is reserved for padding)
in_token_to_int = {token: (i+1) for i, token in enumerate(sorted(in_tokens))}

# Add a pad token mapping to 0, as it's often reserved for padding
in_token_to_int[0] = "<pad>"

In [16]:
# Create a set of all unique tokens (words) in the Arabic (output) text
out_tokens = set(" ".join(dataset.Arabic.values.tolist()))

# Create a dictionary mapping each unique token to an integer, starting from 3 (0, 1, and 2 are reserved for pad, sos, and eos)
out_token_to_int = {token: (i+3) for i, token in enumerate(sorted(out_tokens))}

# Add special tokens for padding, start of sequence (sos), and end of sequence (eos) to the Arabic token dictionary
out_token_to_int["<pad>"] = pad_token  # Padding token
out_token_to_int["<sos>"] = sos_token  # Start of sequence token
out_token_to_int["<eos>"] = eos_token  # End of sequence token

In [17]:
def tokenize(a):

    x = a.copy()

    # Tokenize the Arabizi text (convert each character to its integer representation)
    # Convert the Arabizi text to lowercase, then map each character to its corresponding integer from the 'in_token_to_int' dictionary
    x.Arabize = [in_token_to_int[i] for i in x.Arabize.lower()]

    # Tokenize the Arabic text, adding the start of sequence (sos) token at the beginning and the end of sequence (eos) token at the end
    # The start and end tokens are used for sequence modeling
    x.Arabic = [sos_token] + [out_token_to_int[i] for i in x.Arabic] + [eos_token]

    # Pad the Arabizi sequence to ensure it has a consistent length (in_max)
    # If the sequence is shorter than in_max, it is padded with the pad_token (0)
    x.Arabize = x.Arabize + (in_max - len(x.Arabize)) * [pad_token]

    # Pad the Arabic sequence to ensure it has a consistent length (out_max)
    # If the sequence is shorter than out_max, it is padded with the pad_token (0)
    x.Arabic = x.Arabic + (out_max - len(x.Arabic)) * [pad_token]

    # Return the row with tokenized and padded sequences
    return x

In [18]:
dataset[["Arabize","Arabic"]] = dataset[["Arabize","Arabic"]].apply(tokenize, axis=1)

In [19]:
validation = dataset.sample(frac=0.1)
train = dataset.drop(validation.index)

X_train = train.Arabize
y_train = train.Arabic

X_valid = validation.Arabize
y_valid = validation.Arabic

## Model Architecture : :

#### Positional Encoding :

Unlike models like RNNs or LSTMs, which inherently process data sequentially, Transformers process the entire sequence simultaneously. Therefore, positional encoding is used to inject information about the order of tokens in the sequence.

In [20]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=9000):
        super(PositionalEncoding, self).__init__()

        # Dropout layer for regularization during training
        self.dropout = nn.Dropout(p=dropout)

        # Learnable scaling parameter for the positional encoding
        self.scale = nn.Parameter(torch.ones(1))

        # Initialize a tensor for the positional encodings of size (max_len, d_model)
        pe = torch.zeros(max_len, d_model)

        # Create a tensor of positions from 0 to max_len-1 and reshape to (max_len, 1)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # Calculate the scaling factor for each dimension of the positional encoding
        # This is based on the formula from the original Transformer paper
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Apply sine function to even indices of the positional encoding
        pe[:, 0::2] = torch.sin(position * div_term)

        # Apply cosine function to odd indices of the positional encoding
        pe[:, 1::2] = torch.cos(position * div_term)

        # Reshape the positional encoding tensor to match the shape needed for adding it to the embeddings
        pe = pe.unsqueeze(0).transpose(0, 1)

        # Register the positional encoding tensor as a buffer (not a model parameter)
        # This means it will be saved with the model but not updated by backpropagation
        self.register_buffer('pe', pe)

    # Forward pass: add positional encoding to the input tensor
    def forward(self, x):
        # Add positional encoding to the input embeddings
        # The positional encoding is scaled by the learnable 'scale' parameter
        # Only the relevant part of 'pe' is selected based on the sequence length of 'x'
        x = x + self.scale * self.pe[:x.size(0), :]

        # Apply dropout to the resulting tensor to prevent overfitting
        return self.dropout(x)


#### Transformer model :

In [21]:
class TransformerModel(nn.Module):

    def __init__(self, intoken, outtoken, hidden, enc_layers=1, dec_layers=1, dropout=0.15, nheads=4):
        super(TransformerModel, self).__init__()
        # intoken, outtoken : specify the size of the input (Arabizi) and output (Arabic) vocabularies.
        # hidden : is the hidden dimension (the number of features in the token embeddings and internal model layers).
        # enc_layers, dec_layers : specify the number of encoder and decoder layers.
        # dropout : is the dropout rate for regularization.
        # nheads : is the number of attention heads used in each attention layer.

        # Feed-forward model size, typically 4 times the hidden size
        ff_model = hidden * 4

        # Encoder: Embedding layer for input tokens (Arabizi)
        self.encoder = nn.Embedding(intoken, hidden)
        # Positional encoding for the encoder
        self.pos_encoder = PositionalEncoding(hidden, dropout)

        # Decoder: Embedding layer for output tokens (Arabic)
        self.decoder = nn.Embedding(outtoken, hidden)
        # Positional encoding for the decoder
        self.pos_decoder = PositionalEncoding(hidden, dropout)

        # Define encoder layers and create the Transformer encoder
        encoder_layers = TransformerEncoderLayer(d_model=hidden, nhead=nheads, dim_feedforward=ff_model, dropout=dropout, activation='relu')
        self.transformer_encoder = TransformerEncoder(encoder_layers, enc_layers)

        # Define decoder layers and create the Transformer decoder
        encoder_layers = TransformerDecoderLayer(hidden, nheads, ff_model, dropout, activation='relu')
        self.transformer_decoder = TransformerDecoder(encoder_layers, dec_layers)

        # Final linear layer to map the decoder output to the output token space (Arabic tokens)
        self.fc_out = nn.Linear(hidden, outtoken)

        # Masks for different parts of the model
        self.src_mask = None
        self.trg_mask = None
        self.memory_mask = None


    # Generate a mask to prevent attending to future tokens (for autoregressive generation)
    def generate_square_subsequent_mask(self, sz, sz1=None):
        if sz1 is None:
            # Create an upper triangular matrix for masking future tokens
            mask = torch.triu(torch.ones(sz, sz), 1)
        else:
            mask = torch.triu(torch.ones(sz, sz1), 1)
        # Mask all future tokens with -inf so that they cannot be attended to
        return mask.masked_fill(mask == 1, float('-inf'))


    # Create padding mask for the encoder input
    def make_len_mask_enc(self, inp):
        # Mask padding tokens in the source input (seq_len, batch_size)
        return (inp == pad_token).transpose(0, 1)


    # Create padding mask for the decoder input
    def make_len_mask_dec(self, inp):
        # Mask padding tokens in the target input (seq_len, batch_size)
        return (inp == pad_token).transpose(0, 1)


    # Define the forward pass of the model
    def forward(self, src, trg):  # SRC: (seq_len, batch_size)

        # Generate target mask (to prevent attending to future tokens during decoding)
        if self.trg_mask is None or self.trg_mask.size(0) != len(trg):
            self.trg_mask = self.generate_square_subsequent_mask(len(trg)).to(trg.device)

        # Add padding masks for both source and target
        src_pad_mask = self.make_len_mask_enc(src)
        trg_pad_mask = self.make_len_mask_dec(trg)

        # Encoder: Process source tokens (Arabizi)
        src = self.encoder(src)  # (seq_len, batch_size, hidden)
        src = self.pos_encoder(src)  # Add positional encoding to source

        # Decoder: Process target tokens (Arabic)
        trg = self.decoder(trg)  # (seq_len, batch_size, hidden)
        trg = self.pos_decoder(trg)  # Add positional encoding to target

        # Pass the source through the encoder
        memory = self.transformer_encoder(src, None, src_pad_mask)

        # Pass the target and memory through the decoder
        output = self.transformer_decoder(tgt=trg, memory=memory, tgt_mask=self.trg_mask, memory_mask=None,
                                          tgt_key_padding_mask=trg_pad_mask, memory_key_padding_mask=src_pad_mask)

        # Output layer: Map the decoder output to the output token space (Arabic)
        output = self.fc_out(output)

        # Return the model's output
        return output


In [22]:
len(in_token_to_int)

37

In [23]:
len(out_token_to_int)

53

In [24]:
set_seed()
model = TransformerModel(len(in_token_to_int), len(out_token_to_int), 128).to(device)



#### NoamOpt :

The Noam learning rate schedule is designed to prevent the model from training too quickly at the start (which can lead to instability) and to allow the model to "warm up" for a smoother training process. <br>
The rate increases initially and then decays, ensuring that the model does not get stuck in suboptimal local minima during the later stages of training.<br>
This learning rate schedule has been shown to be effective for training large-scale Transformer models like BERT and GPT.

In [25]:
class NoamOpt:
    "Optim wrapper that implements rate."

    def __init__(self, model_size, factor, warmup, optimizer):
        # model_size: The dimension of the model (the hidden size of the model's layers (e.g., 512 or 1024) )
        # factor: A scaling factor for the learning rate
        # warmup: The number of steps during which the learning rate will increase before it starts decaying
        # optimizer: The underlying optimizer that will be used to update model parameters

        self.optimizer = optimizer  # The optimizer passed to the class (like Adam)
        self._step = 0  # Track the number of optimization steps (used for learning rate calculation)
        self.warmup = warmup  # Number of warmup steps
        self.factor = factor  # Factor used to scale the learning rate
        self.model_size = model_size  # The size of the model (usually the hidden dimension)
        self._rate = 0  # Store the current learning rate

    # Update parameters and learning rate : This method is called during each training step.
    def step(self):
        "Update parameters and rate"
        self._step += 1  # Increment step count (each call to this method is a training step)
        rate = self.rate()  # Compute the learning rate using the rate function
        for p in self.optimizer.param_groups:  # Iterate through the optimizer's parameter groups
            p['lr'] = rate  # Set the current learning rate to the computed rate
        self._rate = rate  # Update the current learning rate
        self.optimizer.step()  # Perform a step of the optimizer to update the model's parameters

    # Compute the learning rate at a given step
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:  # If no step is provided, use the current step
            step = self._step

        # The Noam learning rate schedule
        # The learning rate increases during the warmup phase, then decays as the training progresses
        return self.factor * \
            (self.model_size ** (-0.5) *  # Scale by the inverse square root of the model size
            min(step ** (-0.5), step * self.warmup ** (-1.5)))  # Apply the warmup and decay formula


#### Dataset :

In [26]:
class Arab2ArabizDS(Dataset):
    # Custom Dataset class for handling Arabic to Arabizi transliteration tasks

    def __init__(self, data, label):
        """
        Initialize the dataset with data and labels.

        Parameters:
        - data: The source data
        - label: The target labels

        The data and label are assumed to be pandas DataFrames or Series, and the class converts them to lists for easier handling later.
        """
        self.data = data.values.tolist()  # Convert source data ('Arabizi') into a list of lists
        self.labels = label.values.tolist()  # Convert label data ('Arabic') into a list of lists

        # Calculate the lengths of the source and label sequences for each item
        self.lengths_source = [len(i) for i in data]  # List of lengths for each source item (data)
        self.lengths_label = [len(i) for i in label]  # List of lengths for each label item (label)

    def __len__(self):
        """
        Return the total number of samples in the dataset.
        The length of the dataset is simply the length of the source data.
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        Return a sample from the dataset at the specified index.

        Parameters:
        - idx: The index of the sample to retrieve from the dataset.

        Returns:
        - A tuple containing:
          - The source data (Arabizi text)
          - The target label (Arabic text)
          - The length of the source sequence
          - The length of the label sequence
        """
        # Return a tuple with the data, label, and their respective lengths
        return (self.data[idx], self.labels[idx], self.lengths_source[idx], self.lengths_label[idx])


#### Data Collator :

The function data_collator_Arab2Arabiz(data) is a custom data collator designed to process batches of data for training models. The purpose of this collator is to prepare a batch of data by padding the sequences to a consistent length and converting them into tensors suitable for model input.

Padding Sequences: In natural language processing (NLP) tasks, sequences can vary in length. To feed these sequences into models like transformers, they need to be padded to the same length. This function handles the padding of sequences to the maximum length in the batch. <br>
Efficient Batch Processing: Instead of dealing with sequences of different lengths, the collator ensures that all sequences in a batch have the same length, which allows for more efficient batch processing during training.<br>
Tensor Conversion: Converts the source and label sequences into PyTorch tensors, which are the format required by neural network models.

In [27]:
def data_collator_Arab2Arabiz(data):
    # Unzip the list of tuples into separate components: words (source), labels (target), and their respective lengths (source and target sequences).
    word, label, length_source, length_label = zip(*data)

    # Find the maximum length of the source and label sequences in the batch
    tensor_dim_1 = max(length_source)  # Max length of source sequences
    tensor_dim_2 = max(length_label)   # Max length of label sequences

    # Initialize two tensors filled with pad_token to store the padded sequences
    # These tensors will have dimensions:
    # - len(word) (number of items in the batch)
    # - tensor_dim_1 (maximum length of source sequences)
    # - tensor_dim_2 (maximum length of label sequences)

    out_word = torch.full((len(word), tensor_dim_1), dtype=torch.long, fill_value=pad_token)
    label_word = torch.full((len(word), tensor_dim_2), dtype=torch.long, fill_value=pad_token)

    # Iterate through each sequence in the batch and place it into the corresponding tensor
    for i in range(len(word)):
        # Fill in the padded source sequence (out_word) for each item in the batch
        out_word[i][:len(word[i])] = torch.Tensor(word[i])

        # Fill in the padded label sequence (label_word) for each item in the batch
        label_word[i][:len(label[i])] = torch.Tensor(label[i])

    # Return the batch of padded source sequences and label sequences as a tuple
    return (out_word, label_word)


#### K-Sampler :

The class KSampler is a custom data sampler used to generate batches of data for training in deep learning models. It inherits from Sampler, which is a PyTorch class used to define how samples are drawn from a dataset. The purpose of this sampler is to shuffle and group data into batches of a specific size, but with an additional focus on maintaining a balanced order based on sequence lengths.

In [28]:
class KSampler(Sampler):
    # The constructor initializes the sampler.
    def __init__(self, data_source, batch_size):
        # Extract the lengths of each sample from the data_source (assumed to be a list of tuples).
        # The second element in each tuple is the length of the sequence.
        self.lens = [x[1] for x in data_source]

        # Store the batch size, which defines how many samples should be in each batch.
        self.batch_size = batch_size

    # The __iter__ method defines how to iterate over the dataset and return batches of data.
    def __iter__(self):
        # Create a list of indices from 0 to the length of the data source.
        idx = list(range(len(self.lens)))

        # Pair each index with its corresponding sequence length.
        arr = list(zip(self.lens, idx))

        # Shuffle the list of (length, index) pairs to randomize the order of the samples.
        random.shuffle(arr)

        # Define a chunk size (larger than a single batch, used to process data in larger chunks before batching).
        n = self.batch_size * 100

        # This list will hold the batches of indices.
        iterator = []

        # Loop through the dataset in steps of size `n` (large chunks).
        for i in range(0, len(self.lens), n):
            # Take a slice of the data, size `n`.
            dt = arr[i:i+n]

            # Sort the slice by the sequence lengths to group together sequences of similar lengths.
            dt = sorted(dt, key=lambda x: x[0])

            # Create smaller batches from the sorted data (batch size is defined by `self.batch_size`).
            for j in range(0, len(dt), self.batch_size):
                # Extract the indices from the sorted slice and store them in the iterator.
                indices = list(map(lambda x: x[1], dt[j:j+self.batch_size]))
                iterator.append(indices)

        # Shuffle the final batches to ensure randomness in the order.
        random.shuffle(iterator)

        # Return the flattened list of indices for the batches.
        # [item for sublist in iterator for item in sublist] flattens the list of batches.
        return iter([item for sublist in iterator for item in sublist])  # Flatten nested list

    # The __len__ method returns the total number of items in the dataset.
    def __len__(self):
        # Return the number of sequences in the dataset (length of `self.lens`).
        return len(self.lens)


Worker-specific Seed: This function is typically used in multi-worker data loading environments (for example, in PyTorch's DataLoader with multiple workers). By ensuring each worker gets a unique but reproducible random seed, it allows parallel data loading with the randomness needed for tasks like shuffling while maintaining deterministic behavior across runs. This is important for reproducibility in experiments.

In [29]:
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    numpy.random.seed(worker_seed)
    random.seed(worker_seed)

## Training + Validation :

In [30]:
batch_size = 32

In [31]:
train_data = Arab2ArabizDS(X_train, y_train)
train_sampler = KSampler(train_data, batch_size)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, worker_init_fn=seed_worker, collate_fn=data_collator_Arab2Arabiz)

In [32]:
valid_data = Arab2ArabizDS(X_valid, y_valid)
valid_sampler = KSampler(valid_data, batch_size)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size,worker_init_fn=seed_worker, collate_fn=data_collator_Arab2Arabiz)

In [33]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_token)
optimizer = NoamOpt(128, 1, 4000 ,optim.Adam(model.parameters(), lr=0))

In [34]:
def run_epoch(iterator):
    # Initialize total loss to accumulate the loss over the entire epoch.
    total_loss = 0

    # Iterate through the data batches provided by the iterator.
    for src, trg in iterator:
        # Transpose the source (src) and target (trg) sequences to match the model input requirements.
        # Move the data to the correct device (CPU or GPU).
        src = src.T.to(device)
        trg = trg.T.to(device)

        # Pass the source and target sequences (excluding the last token of the target) to the model.
        output = model(src, trg[:-1, :])

        # Reshape the output of the model to match the shape required for computing loss.
        # The output shape should be [batch_size * seq_len, vocab_size].
        output = output.reshape(-1, output.shape[2])

        # Zero out the gradients for the optimizer.
        optimizer.optimizer.zero_grad()

        # Compute the loss between the model's output and the target sequence (excluding the first token).
        # The target is reshaped to be a flat vector.
        loss = criterion(output, trg[1:].reshape(-1))

        # Accumulate the loss for this batch.
        total_loss += loss.item()

        # Backpropagate the loss to compute gradients.
        loss.backward()

        # Clip gradients to avoid exploding gradients (set a maximum threshold for gradient values).
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Update the model's parameters using the optimizer.
        optimizer.step()

    # Return the average loss over the entire epoch by dividing total loss by the number of batches.
    return total_loss / len(iterator)


In [35]:
def run_validation(iterator):
    # Initialize total loss to accumulate the loss over the validation epoch.
    total_loss = 0

    # Iterate through the validation batches provided by the iterator.
    for src, trg in iterator:
        # Transpose the source (src) and target (trg) sequences to match the model input requirements.
        # Move the data to the correct device (CPU or GPU).
        src = src.T.to(device)
        trg = trg.T.to(device)

        # Pass the source and target sequences (excluding the last token of the target) to the model.
        output = model(src, trg[:-1, :])

        # Reshape the output of the model to match the shape required for computing loss.
        # The output shape should be [batch_size * seq_len, vocab_size].
        output = output.reshape(-1, output.shape[2])

        # No need to compute gradients during validation, so no zero_grad and no backward pass
        # Compute the loss between the model's output and the target sequence (excluding the first token).
        # The target is reshaped to be a flat vector.
        loss = criterion(output, trg[1:].reshape(-1))

        # Accumulate the loss for this batch.
        total_loss += loss.item()

    # Return the average loss over the entire validation set by dividing total loss by the number of batches.
    return total_loss / len(iterator)


In [36]:
set_seed()
min_loss = 99  # Initialize a variable to track the minimum validation loss seen so far. A high starting value ensures the first validation loss will be smaller.

#Change model size
for i in range(100):
    # Run one epoch of training. This will process the entire training dataset and return the training loss.
    loss = run_epoch(train_dataloader)
    # Run one epoch of validation. This will process the entire validation dataset and return the validation loss.
    loss_val = run_validation(valid_dataloader)

    # Check if the current validation loss is lower than the previous minimum validation loss.
    if loss_val < min_loss:
        min_loss = loss_val # Update the minimum loss to the new lower value.
        torch.save(model, "convert_best")

    print("EPOCH %d -- %f -- Val Loss: %f" % (i, loss, loss_val))



EPOCH 0 -- 3.124180 -- Val Loss: 2.229038
EPOCH 1 -- 1.508315 -- Val Loss: 1.138979
EPOCH 2 -- 1.011463 -- Val Loss: 0.926422
EPOCH 3 -- 0.893610 -- Val Loss: 0.839303
EPOCH 4 -- 0.825758 -- Val Loss: 0.863014
EPOCH 5 -- 0.785756 -- Val Loss: 0.775873
EPOCH 6 -- 0.752526 -- Val Loss: 0.756501
EPOCH 7 -- 0.737339 -- Val Loss: 0.777047
EPOCH 8 -- 0.720738 -- Val Loss: 0.740346
EPOCH 9 -- 0.688497 -- Val Loss: 0.667755
EPOCH 10 -- 0.654240 -- Val Loss: 0.669286
EPOCH 11 -- 0.629913 -- Val Loss: 0.620962
EPOCH 12 -- 0.603226 -- Val Loss: 0.606131
EPOCH 13 -- 0.588001 -- Val Loss: 0.599294
EPOCH 14 -- 0.573811 -- Val Loss: 0.601779
EPOCH 15 -- 0.552191 -- Val Loss: 0.600437
EPOCH 16 -- 0.542038 -- Val Loss: 0.575959
EPOCH 17 -- 0.532183 -- Val Loss: 0.540318
EPOCH 18 -- 0.526410 -- Val Loss: 0.555283
EPOCH 19 -- 0.513015 -- Val Loss: 0.537879
EPOCH 20 -- 0.502924 -- Val Loss: 0.547935
EPOCH 21 -- 0.492530 -- Val Loss: 0.540649
EPOCH 22 -- 0.484088 -- Val Loss: 0.549336
EPOCH 23 -- 0.478895 

In [None]:
model = torch.load("convert_best").eval()

In [37]:
min_loss

0.43048117548789616

In [None]:
out_int_to_token = {out_token_to_int[t]:t for t in out_token_to_int}

## NOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO

In [None]:
def arabizi_2_arabic(inp):

    input_sentence = [in_token_to_int[i] for i in inp.lower()]
    preds = [sos_token]

    input_sentence = torch.Tensor(input_sentence).unsqueeze(-1).long().to(device)


    new_char = -1

    while new_char != eos_token:

        output_sentence = torch.Tensor(preds).unsqueeze(-1).long().to(device)

        src = model.pos_encoder(model.encoder(input_sentence))
        trg = model.pos_encoder(model.decoder(output_sentence))

        memory = model.transformer_encoder(src)
        output = model.transformer_decoder(tgt = trg, memory = memory)

        output = model.fc_out(output)
        new_char = output.argmax(-1)[-1, 0].item()

        preds.append(new_char)

        if len(preds) > 50:
            break


    return "".join([out_int_to_token[i] for i in preds[1:-1]])

In [None]:
train = pd.read_csv("../input/zindidd/Train.csv")[["textt", "labell"]]
train.columns = ["texts", "data_labels"]

data = train

In [None]:
def preprocess(text):    #Might use the same setting if they work to other languages (english and french)

    text = text.replace('ß',"b")
    text = text.replace('à',"a")
    text = text.replace('á',"a")
    text = text.replace('ç',"c")
    text = text.replace('è',"e")
    text = text.replace('é',"e")
    text = text.replace('$',"s")
    text = text.replace("1","")


    text = text.lower()
    text = re.sub(r'[^A-Za-z0-9 ,!?.]', '', text)


    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    text = re.sub(r'([h][h][h][h])\1+', r'\1', text)
    text = re.sub(r'([a-g-i-z])\1+', r'\1', text)  #Remove repeating characters
    text = re.sub(r' [0-9]+ ', " ", text)
    text = re.sub(r'^[0-9]+ ', "", text)

    return text

In [None]:
#Keep numbers block
def split(text):

    splits = re.findall(r"[\w']+|[?!.,]", text)

    to_be_added = []
    idx_to_be_added = []

    forbidden = ["?", "!", ".", ","] + known_idx

    for i, split in enumerate(splits):

        if split in forbidden:
            if split in known_idx:
                to_be_added.append(known[split])
            else:
                to_be_added.append(split)
            idx_to_be_added.append(i)
        #else:
        #splits[i] = splits[i][:1000]


    splits = [i for i in splits if not i in forbidden]

    return splits, to_be_added, idx_to_be_added

In [None]:
problematic = []

def convert_phrase_2(text):
    text = text.replace("0","")
    text = text.replace("6","")

    #print("\nTEXT: "+text)
    phrase, to_be_added, idx_to_be_added = split(text.lower())

    max_len_phrase = max([len(i) for i in phrase])

    input_sentence = []
    for word in phrase:
        input_sentence.append([in_token_to_int[i] for i in word] + [pad_token]*(max_len_phrase-len(word)))

    input_sentence = torch.Tensor(input_sentence).long().T.to(device)
    preds = [[sos_token] * len(phrase)]

    end_word = len(phrase) * [False]
    src_pad_mask = model.make_len_mask_enc(input_sentence)


    while not all(end_word):
        output_sentence = torch.Tensor(preds).long().to(device)

        src = model.pos_encoder(model.encoder(input_sentence))
        trg = model.pos_encoder(model.decoder(output_sentence))

        memory = model.transformer_encoder(src, None ,src_pad_mask)
        output = model.transformer_decoder(tgt = trg, memory = memory, memory_key_padding_mask = src_pad_mask)


        output = model.fc_out(output)


        output = output.argmax(-1)[-1].cpu().detach().numpy()
        preds.append(output.tolist())


        end_word = (output == eos_token) | end_word

        if len(preds) > 50:
            global problematic

            problematic.append(text)
            #print(text)
            break


    preds = np.array(preds).T
    result = []

    for word in preds:

        tmp = []
        for i in word[1:]:
            if out_int_to_token[i] == "<eos>":
                break
            tmp.append(out_int_to_token[i])

        result.append("".join(tmp))


    #Re-add removed punctuation
    for item, idx in zip(to_be_added, idx_to_be_added):

        if item == "?":
            item = "؟"
        elif item == ",":
            item = "،"

        result.insert(idx, item)


    result = " ".join(result)

    return result

In [None]:
train.texts = train.texts.apply(preprocess)

In [None]:
results = []
step_size = 100

texts = train.texts.values.tolist()

for i in tqdm(range(0, len(texts), step_size)):

    out = convert_phrase_2(" lkrb3 ".join(texts[i:i+step_size]))
    splitted_sentences = [ex.lstrip().rstrip() for ex in out.split(" " + convert_phrase_2("lkrb3") + " ")]

    if len(splitted_sentences) != len(texts[i:i+step_size]):
        print("DANGER")
        break

    results.extend(splitted_sentences)

In [None]:
train["converted"] = results.copy()
train.to_csv("train_data.csv")

In [None]:
test = pd.read_csv("../input/zindidd/Test.csv")
test.textt = test.textt.apply(preprocess)

In [None]:
results = []
step_size = 50

texts = test.textt.values.tolist()

for i in tqdm(range(0, len(texts), step_size)):

    out = convert_phrase_2(" lkrb3 ".join(texts[i:i+step_size]))
    splitted_sentences = [ex.lstrip().rstrip() for ex in out.split(" " + convert_phrase_2("lkrb3") + " ")]

    if len(splitted_sentences) != len(texts[i:i+step_size]):
        print("DANGER")
        break

    results.extend(splitted_sentences)

In [None]:
test["converted"] = results
test.to_csv("test_data.csv")

--------------------

In [None]:
def preprocessing_for_bert(data, tokenizer, preprocess_text, max_len=256):

    input_ids = []
    attention_masks = []
    tmp = tokenizer.encode("ab")[-1]

    for sentence in data:

        encoding = tokenizer.encode(preprocess_text(sentence))

        if len(encoding) > max_len:
            encoding = encoding[:max_len-1] + [tmp]

        in_ids = encoding
        att_mask = [1]*len(encoding)

        input_ids.append(in_ids)
        attention_masks.append(att_mask)

    return input_ids, attention_masks

In [None]:
class BertDataset(Dataset):

    def __init__(self, data, masks, label=None):

        self.data = data
        self.masks = masks

        if label != None:
            self.labels = label
        else:
            self.labels = None

        self.lengths = [len(i) for i in data]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.labels !=  None:
            return (self.data[idx], self.masks[idx], self.labels[idx], self.lengths[idx])
        else:  #For validation
            return (self.data[idx], self.masks[idx], None, self.lengths[idx])

In [None]:
def data_collator(data):

    sentence, mask, label, length = zip(*data)

    tensor_dim = max(length)

    out_sentence = torch.full((len(sentence), tensor_dim), dtype=torch.long, fill_value=pad)
    out_mask = torch.zeros(len(sentence), tensor_dim, dtype=torch.long)

    for i in range(len(sentence)):

        out_sentence[i][:len(sentence[i])] = torch.Tensor(sentence[i])
        out_mask[i][:len(mask[i])] = torch.Tensor(mask[i])

    if label[0] != None:
        return (out_sentence, out_mask, torch.Tensor(label).long())
    else:
        return (out_sentence, out_mask)

In [None]:
class KSampler(Sampler):

    def __init__(self, data_source, batch_size):
        self.lens = [x[1] for x in data_source]
        self.batch_size = batch_size

    def __iter__(self):

        idx = list(range(len(self.lens)))
        arr = list(zip(self.lens, idx))

        random.shuffle(arr)
        n = self.batch_size*100

        iterator = []

        for i in range(0, len(self.lens), n):
            dt = arr[i:i+n]
            dt = sorted(dt, key=lambda x: x[0])

            for j in range(0, len(dt), self.batch_size):
                indices = list(map(lambda x: x[1], dt[j:j+self.batch_size]))
                iterator.append(indices)

        random.shuffle(iterator)
        return iter([item for sublist in iterator for item in sublist])  #Flatten nested list

    def __len__(self):
        return len(self.lens)


In [None]:
# Create the BertClassfier class
class BertClassifier(nn.Module):

    def __init__(self, model_name, dropout, freeze_bert=False):

        super(BertClassifier, self).__init__()
        D_in, H, D_out = 768, 200, 3

        self.bert = AutoModel.from_pretrained(model_name)

        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):

        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        last_hidden_state_cls = outputs[0][:, 0, :]

        logits = self.classifier(last_hidden_state_cls)

        return logits

In [None]:
def initialize_model(model_name, epochs=4, dropout=0.1):

    bert_classifier = BertClassifier(model_name, dropout=dropout, freeze_bert=False)

    bert_classifier.to(device)

    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,
                      eps=1e-8
                      )

    total_steps = len(train_dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [None]:
loss_fn = nn.CrossEntropyLoss()

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False, fold=0, prefix=""):

    global max_acc

    print("Start training...\n")
    for epoch_i in range(epochs):

        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()

        total_loss, batch_loss, batch_counts = 0, 0, 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts +=1

            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):

                time_elapsed = time.time() - t0_batch

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

            if step%200 == 0 and step != 0 and epoch_i != 0 and epoch_i != 1:

                print("-"*70)

                if evaluation == True:

                    val_loss, val_accuracy = evaluate(model, val_dataloader)

                    if val_accuracy > max_acc:
                        max_acc = val_accuracy
                        torch.save(model, prefix + "_best_"+str(fold))
                        print("new max")


                    print(val_accuracy)

                    print("-"*70)
                print("\n")

                model.train()

        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)

        if evaluation == True:

            val_loss, val_accuracy = evaluate(model, val_dataloader)

            if val_accuracy > max_acc:
                max_acc = val_accuracy
                torch.save(model, prefix+"_best_"+str(fold))
                print("new max")

            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):

    model.eval()

    val_accuracy = []
    val_loss = []

    for batch in val_dataloader:

        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        preds = torch.argmax(logits, dim=1).flatten()

        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [None]:
def get_indices(arr, idxs):  #Helper function to get multiple indexes from a list

    output = []
    for idx in idxs:
        output.append(arr[idx])

    return output

In [None]:
#Tried these different preprocessing functions and tesed their effect on the results
#Found out that text_preprocessing_2 gives the best results for the English model
def text_preprocessing_1(text):

    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def text_preprocessing_2(text):

    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()

    text = re.sub(r'([a-g-i-z][a-g-i-z])\1+', r'\1', text)

    return text


def text_preprocessing_3(text):

    text = text.replace('ß',"b")
    text = text.replace('à',"a")
    text = text.replace('á',"a")
    text = text.replace('ç',"c")
    text = text.replace('è',"e")
    text = text.replace('é',"e")
    text = text.replace('$',"s")
    text = text.replace("1","")


    text = text.lower()
    text = re.sub(r'[^A-Za-z0-9 ,!?.]', '', text)


    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    text = re.sub(r'([h][h][h][h])\1+', r'\1', text)
    text = re.sub(r'([a-g-i-z])\1+', r'\1', text)  #Remove repeating characters
    text = re.sub(r' [0-9]+ ', " ", text)
    text = re.sub(r'^[0-9]+ ', "", text)

    return text

In [None]:
data = pd.read_csv("../input/zindidd/Train.csv")[["textt", "labell"]].iloc[1000:]
data.columns = ["texts", "data_labels"]

data.data_labels = data.data_labels.replace(0,2)  #Neutral 2, Positive 1, Negative 0
data.data_labels = data.data_labels.replace(-1,0)



X = data.texts.values
y = data.data_labels.values

preprocessed_data, masks = preprocessing_for_bert(X, tokenizer_en, text_preprocessing_2, max_len=256)
pad = tokenizer_en.pad_token_id

In [None]:
kfold = KFold(5, True, seed)
fold = 0

bests = []

for train_ids, val_ids in kfold.split(preprocessed_data):

    print("\n\tFOLD %d \n" % (fold))
    max_acc = -99

    X_train = get_indices(preprocessed_data, train_ids)
    y_train = get_indices(y, train_ids)
    train_masks = get_indices(masks, train_ids)

    X_val = get_indices(preprocessed_data, val_ids)
    y_val = get_indices(y, val_ids)
    val_masks = get_indices(masks, val_ids)


    X_val, y_val, val_masks = list(zip(*sorted(zip(X_val, y_val, val_masks), key=lambda x: len(x[0]))))  #Order the validation data for faster validation
    X_val, y_val, val_masks = list(X_val), list(y_val), list(val_masks)


    # Convert other data types to torch.Tensor
    y_train = torch.tensor(y_train)
    y_val = torch.tensor(y_val)

    # Create the DataLoader for our training set
    train_data = BertDataset(X_train, train_masks, y_train)
    train_sampler = KSampler(train_data, batch_size)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, collate_fn=data_collator)

    # Create the DataLoader for our validation set
    val_data = BertDataset(X_val, val_masks, y_val)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size, collate_fn=data_collator)


    set_seed()    # Set seed for reproducibility
    bert_classifier, optimizer, scheduler = initialize_model(model_name=model_name_en, epochs=n_epochs, dropout=0.05)
    train(bert_classifier, train_dataloader, val_dataloader, epochs=n_epochs, evaluation=True, fold=fold, prefix="en")

    fold += 1
    bests.append(max_acc)


In [None]:
bests

In [None]:
data = pd.read_csv("train_data.csv")[["converted", "data_labels"]].iloc[1000:]
data.columns = ["texts", "data_labels"]

data.data_labels = data.data_labels.replace(0,2)  #Neutral 2, Positive 1, Negative 0
data.data_labels = data.data_labels.replace(-1,0)



X = data.texts.values
y = data.data_labels.values

preprocessed_data, masks = preprocessing_for_bert(X, tokenizer_ar, lambda x: x, max_len=256)
pad = tokenizer_ar.pad_token_id

In [None]:
kfold = KFold(10, True, seed)
fold = 0

bests = []

for train_ids, val_ids in kfold.split(preprocessed_data):

    print("\n\tFOLD %d \n" % (fold))
    max_acc = -99

    X_train = get_indices(preprocessed_data, train_ids)
    y_train = get_indices(y, train_ids)
    train_masks = get_indices(masks, train_ids)

    X_val = get_indices(preprocessed_data, val_ids)
    y_val = get_indices(y, val_ids)
    val_masks = get_indices(masks, val_ids)


    X_val, y_val, val_masks = list(zip(*sorted(zip(X_val, y_val, val_masks), key=lambda x: len(x[0]))))  #Order the validation data for faster validation
    X_val, y_val, val_masks = list(X_val), list(y_val), list(val_masks)


    # Convert other data types to torch.Tensor
    y_train = torch.tensor(y_train)
    y_val = torch.tensor(y_val)

    # Create the DataLoader for our training set
    train_data = BertDataset(X_train, train_masks, y_train)
    train_sampler = KSampler(train_data, batch_size)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, collate_fn=data_collator)

    # Create the DataLoader for our validation set
    val_data = BertDataset(X_val, val_masks, y_val)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size, collate_fn=data_collator)


    set_seed()    # Set seed for reproducibility
    bert_classifier, optimizer, scheduler = initialize_model(model_name=model_name_ar, epochs=n_epochs, dropout=0)
    train(bert_classifier, train_dataloader, val_dataloader, epochs=n_epochs, evaluation=True, fold=fold, prefix="ar")

    fold += 1
    bests.append(max_acc)



In [None]:
bests

In [None]:
def bert_single_predict(model, test_dataloader):

    model.eval()

    all_logits = []

    for batch in tqdm(test_dataloader):

        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits)

    all_logits = torch.cat(all_logits, dim=0)

    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

In [None]:
def bert_ensemble_predict(sentences, models, tokenizer, preprocess, truncate=True, max_len=256):

    inputs, masks = preprocessing_for_bert(sentences, tokenizer, preprocess, max_len=max_len)


    dataset = BertDataset(inputs, masks)
    sample = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sample, batch_size=128, collate_fn=data_collator)

    preds = []

    for model in models:
        preds.append(bert_single_predict(model, dataloader))

    return preds

In [None]:
def predict_lang(lang_prefix, directory, preprocess_fn, dataset, model_name, n=1, truncate=True, max_len=256):

    print("Loading the models ....")

    global pad
    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
    pad = tokenizer.pad_token_id

    lang_models = []
    for i in range(n):
        lang_models.append(torch.load(directory + "/" + lang_prefix + "best_"+str(i), map_location=device))

    print("Inference ....")

    out = bert_ensemble_predict(dataset, lang_models, tokenizer, preprocess_fn, truncate=truncate, max_len=max_len)

    out_sum = out[0]
    for i in range(1,n):
        out_sum = out[i] + out_sum

    return out_sum

In [None]:
#Sort the list for faster inference
df = pd.read_csv("../input/zindidd/Test.csv")
df_converted = pd.read_csv("test_data.csv")

df["lens"] = df.textt.apply(len)
df = df.sort_values(by="lens").set_index("IDD", drop=True)
df_converted = df_converted.set_index("IDD", drop=True).loc[df.index]


#Convert to list
test = df.textt.tolist()
test_converted = df_converted[["converted"]].converted.tolist()

In [None]:
output_ar = predict_lang("ar_", "./", lambda x:x, test_converted, model_name_ar, n=10, truncate=True, max_len=512)

In [None]:
output_en = predict_lang("en_", "./", text_preprocessing_2, test, model_name_en, n=5, truncate=True, max_len=512)

In [None]:
df["preds"] = ((output_ar/10)*1.30+(output_en/5)).argmax(1)

df.preds = df.preds.replace(0,-1)
df.preds = df.preds.replace(2,0)

the_output = df.reset_index()[["IDD", "preds"]]
the_output.columns = ["ID", "label"]

the_output.to_csv("lessvalid_convvalid150.csv", index=False)