In [None]:
!pip install transformers



In [None]:
# Mount Google Drive at /content/drive so later cells can read from and write to paths under it.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import tarfile
import shutil
import math
import os
import math
import time
import torch
import codecs
import torch.optim
import torch.utils.data
from tqdm import tqdm
from torch import nn
from tqdm import tqdm
from random import shuffle
from itertools import groupby
from google.colab import drive
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
#@title Multi Head Attention
# Module-level device handle: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class MultiHeadAttention(nn.Module):
    """
    The Multi-Head Attention sublayer.

    Layer-norm is applied to the query input before attention (and to the key/value input as well when
    the layer is used for self-attention). The un-normalized query input is added back to the output
    as a residual connection.
    """

    def __init__(self, d_model, n_heads, d_queries, d_values, dropout, in_decoder=False):
        """
        :param d_model: size of vectors throughout the model, i.e. input and output size of this sublayer
        :param n_heads: number of attention heads
        :param d_queries: size of query vectors (and also the size of the key vectors) in each head
        :param d_values: size of value vectors in each head
        :param dropout: dropout probability
        :param in_decoder: if True, self-attention is masked so a position cannot attend to later positions
        """
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_keys = d_queries  # keys must have the same size as queries for the dot-product
        self.in_decoder = in_decoder

        # A single linear layer produces keys and values together; forward() splits its output
        self.cast_queries = nn.Linear(d_model, n_heads * d_queries)
        self.cast_keys_values = nn.Linear(d_model, n_heads * (d_queries + d_values))
        self.cast_output = nn.Linear(n_heads * d_values, d_model)
        self.softmax = nn.Softmax(dim=-1)
        self.layer_norm = nn.LayerNorm(d_model)
        self.apply_dropout = nn.Dropout(dropout)

    def forward(self, query_sequences, key_value_sequences, key_value_sequence_lengths):
        """
        Forward prop.

        :param query_sequences: the input query sequences, a tensor of size (N, query_sequence_pad_length, d_model)
        :param key_value_sequences: the sequences to be queried against, a tensor of size (N, key_value_sequence_pad_length, d_model)
        :param key_value_sequence_lengths: true lengths of the key_value_sequences, to be able to ignore pads, a tensor of size (N)
        :return: attention-weighted output sequences for the query sequences, a tensor of size (N, query_sequence_pad_length, d_model)
        """
        batch_size = query_sequences.size(0)  # batch size (N) in number of sequences
        query_sequence_pad_length = query_sequences.size(1)
        key_value_sequence_pad_length = key_value_sequences.size(1)

        # Self-attention is detected by value: queries and keys/values are the same tensor contents
        self_attention = torch.equal(key_value_sequences, query_sequences)

        # Keep the un-normalized input for the residual connection
        input_to_add = query_sequences.clone()

        # Pre-norm; keys/values are only normalized here when they are the same sequences as the queries
        query_sequences = self.layer_norm(query_sequences)  # (N, query_sequence_pad_length, d_model)
        if self_attention:
            key_value_sequences = self.layer_norm(key_value_sequences)  # (N, key_value_sequence_pad_length, d_model)

        # Project input sequences to queries, keys, values
        queries = self.cast_queries(query_sequences)  # (N, query_sequence_pad_length, n_heads * d_queries)
        # Explicit section sizes: an integer split size only yields exactly two chunks when
        # d_values <= d_keys, and breaks the unpacking below otherwise
        keys, values = self.cast_keys_values(key_value_sequences).split(
            [self.n_heads * self.d_keys, self.n_heads * self.d_values],
            dim=-1)  # (N, key_value_sequence_pad_length, n_heads * d_keys), (N, key_value_sequence_pad_length, n_heads * d_values)

        # Split the last dimension by the n_heads subspaces
        queries = queries.contiguous().view(batch_size, query_sequence_pad_length, self.n_heads, self.d_queries)  # (N, query_sequence_pad_length, n_heads, d_queries)
        keys = keys.contiguous().view(batch_size, key_value_sequence_pad_length, self.n_heads, self.d_keys)  # (N, key_value_sequence_pad_length, n_heads, d_keys)
        values = values.contiguous().view(batch_size, key_value_sequence_pad_length, self.n_heads, self.d_values)  # (N, key_value_sequence_pad_length, n_heads, d_values)

        # Merge the batch and head dimensions so that all heads are handled by a single bmm
        queries = queries.permute(0, 2, 1, 3).contiguous().view(-1, query_sequence_pad_length, self.d_queries)  # (N * n_heads, query_sequence_pad_length, d_queries)
        keys = keys.permute(0, 2, 1, 3).contiguous().view(-1, key_value_sequence_pad_length, self.d_keys)  # (N * n_heads, key_value_sequence_pad_length, d_keys)
        values = values.permute(0, 2, 1, 3).contiguous().view(-1, key_value_sequence_pad_length, self.d_values)  # (N * n_heads, key_value_sequence_pad_length, d_values)

        # Scaled dot-products
        attention_weights = torch.bmm(queries, keys.permute(0, 2, 1))  # (N * n_heads, query_sequence_pad_length, key_value_sequence_pad_length)
        attention_weights = (1. / math.sqrt(self.d_keys)) * attention_weights

        # Masks are built on the device of the scores themselves, not on a notebook-level global
        mask_device = attention_weights.device

        # MASK 1: keys that are pads
        key_positions = torch.arange(key_value_sequence_pad_length, device=mask_device)  # (key_value_sequence_pad_length)
        # Each sequence length is repeated once per head to line up with the merged (N * n_heads) dimension
        lengths_per_head = key_value_sequence_lengths.to(mask_device).repeat_interleave(self.n_heads)  # (N * n_heads)
        not_pad_in_keys = key_positions.view(1, 1, -1) < lengths_per_head.view(-1, 1, 1)  # (N * n_heads, 1, key_value_sequence_pad_length)
        # -inf evaluates to 0 under the softmax; the mask broadcasts over the query dimension.
        # Note: a sequence of length 0 masks every key, which makes the softmax return NaN for that row.
        attention_weights = attention_weights.masked_fill(~not_pad_in_keys, -float('inf'))

        # MASK 2: if this is self-attention in the decoder, keys chronologically ahead of queries
        if self.in_decoder and self_attention:
            # A position [n, i, j] is valid only if j <= i; tril() zeroes the entries with j > i
            not_future_mask = torch.ones(query_sequence_pad_length, key_value_sequence_pad_length,
                                         device=mask_device).tril().bool()  # (query_sequence_pad_length, key_value_sequence_pad_length)
            attention_weights = attention_weights.masked_fill(~not_future_mask, -float('inf'))

        # Compute softmax along the key dimension, then apply dropout to the weights
        attention_weights = self.softmax(attention_weights)  # (N * n_heads, query_sequence_pad_length, key_value_sequence_pad_length)
        attention_weights = self.apply_dropout(attention_weights)

        # Calculate sequences as the weighted sums of values based on these softmax weights
        sequences = torch.bmm(attention_weights, values)  # (N * n_heads, query_sequence_pad_length, d_values)

        # Unmerge batch and n_heads dimensions and restore original order of axes
        sequences = sequences.contiguous().view(batch_size, self.n_heads, query_sequence_pad_length,
                                                self.d_values).permute(0, 2, 1, 3)  # (N, query_sequence_pad_length, n_heads, d_values)

        # Concatenate the n_heads subspaces (each with an output of size d_values)
        sequences = sequences.contiguous().view(batch_size, query_sequence_pad_length, -1)  # (N, query_sequence_pad_length, n_heads * d_values)

        # Transform the concatenated subspace-sequences into a single output of size d_model
        sequences = self.cast_output(sequences)  # (N, query_sequence_pad_length, d_model)

        # Apply dropout and residual connection
        sequences = self.apply_dropout(sequences) + input_to_add  # (N, query_sequence_pad_length, d_model)
        return sequences


class PositionWiseFCNetwork(nn.Module):
    """
    Position-wise feed-forward sublayer: layer-norm, two linear layers with a ReLU in between,
    dropout after each linear stage, and a residual connection around the whole thing.
    """

    def __init__(self, d_model, d_inner, dropout):
        """
        :param d_model: size of the input and output vectors
        :param d_inner: size of the hidden layer between the two linear layers
        :param dropout: dropout probability
        """
        super().__init__()
        self.d_model, self.d_inner = d_model, d_inner

        self.layer_norm = nn.LayerNorm(d_model)
        self.fc1 = nn.Linear(d_model, d_inner)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(d_inner, d_model)
        self.apply_dropout = nn.Dropout(dropout)

    def forward(self, sequences):
        """
        Forward prop.

        :param sequences: input sequences, a tensor of size (N, pad_length, d_model)
        :return: transformed output sequences, a tensor of size (N, pad_length, d_model)
        """
        # Copy of the un-normalized input, added back at the end
        residual = sequences.clone()  # (N, pad_length, d_model)

        normed = self.layer_norm(sequences)  # (N, pad_length, d_model)
        hidden = self.relu(self.fc1(normed))  # (N, pad_length, d_inner)
        hidden = self.apply_dropout(hidden)
        projected = self.apply_dropout(self.fc2(hidden))  # (N, pad_length, d_model)

        return projected + residual


class Encoder(nn.Module):
    """
    The Encoder.

    Embeds the input token ids, adds positional encodings, and passes the result through a stack of
    (multi-head self-attention, position-wise feed-forward) layers followed by a final layer-norm.
    """

    def __init__(self, vocab_size, positional_encoding, d_model, n_heads, d_queries, d_values, d_inner, n_layers,
                 dropout):
        """
        :param vocab_size: size of the vocabulary, i.e. number of rows in the embedding table
        :param positional_encoding: positional encodings, sliced in forward() as [:, :pad_length, :]
        :param d_model: size of vectors throughout the model
        :param n_heads: number of heads in the multi-head attention
        :param d_queries: size of query vectors (and also the size of the key vectors) in the multi-head attention
        :param d_values: size of value vectors in the multi-head attention
        :param d_inner: an intermediate size in the position-wise FC
        :param n_layers: number of encoder layers
        :param dropout: dropout probability
        """
        super(Encoder, self).__init__()

        self.vocab_size = vocab_size
        self.positional_encoding = positional_encoding
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_inner = d_inner
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(vocab_size, d_model)
        # Positional encodings are fixed, not learned
        self.positional_encoding.requires_grad = False
        self.encoder_layers = nn.ModuleList([self.make_encoder_layer() for _ in range(n_layers)])
        self.apply_dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model)

    def make_encoder_layer(self):
        """
        Creates a single encoder layer.

        :return: a ModuleList holding [multi-head attention sublayer, position-wise FC sublayer]
        """
        encoder_layer = nn.ModuleList([MultiHeadAttention(d_model=self.d_model,
                                                          n_heads=self.n_heads,
                                                          d_queries=self.d_queries,
                                                          d_values=self.d_values,
                                                          dropout=self.dropout,
                                                          in_decoder=False),
                                       PositionWiseFCNetwork(d_model=self.d_model,
                                                             d_inner=self.d_inner,
                                                             dropout=self.dropout)])

        return encoder_layer

    def forward(self, encoder_sequences, encoder_sequence_lengths):
        """
        Forward prop.

        :param encoder_sequences: input token ids, a tensor of size (N, pad_length)
        :param encoder_sequence_lengths: true lengths of these sequences, a tensor of size (N)
        :return: encoded sequences, a tensor of size (N, pad_length, d_model)
        """
        pad_length = encoder_sequences.size(1)  # pad-length of this batch only

        # Sum scaled vocab embeddings and position embeddings. The positional encodings are moved to
        # the device of the embeddings, so the module works wherever its weights and inputs live
        # instead of depending on a notebook-level `device` global.
        embedded = self.embedding(encoder_sequences) * math.sqrt(self.d_model)  # (N, pad_length, d_model)
        positions = self.positional_encoding[:, :pad_length, :].to(embedded.device)  # (1, pad_length, d_model)
        encoder_sequences = embedded + positions  # (N, pad_length, d_model)

        # Dropout
        encoder_sequences = self.apply_dropout(encoder_sequences)  # (N, pad_length, d_model)

        # Encoder layers: self-attention followed by the position-wise FC
        for encoder_layer in self.encoder_layers:
            encoder_sequences = encoder_layer[0](query_sequences=encoder_sequences,
                                                 key_value_sequences=encoder_sequences,
                                                 key_value_sequence_lengths=encoder_sequence_lengths)  # (N, pad_length, d_model)
            encoder_sequences = encoder_layer[1](sequences=encoder_sequences)  # (N, pad_length, d_model)

        # Final layer-norm
        encoder_sequences = self.layer_norm(encoder_sequences)  # (N, pad_length, d_model)

        return encoder_sequences


class Decoder(nn.Module):
    """
    The Decoder.

    Embeds the decoder input token ids, adds positional encodings, and passes the result through a stack
    of (self-attention, cross-attention over the encoder output, position-wise feed-forward) layers,
    followed by a final layer-norm and a linear projection to vocabulary logits.
    """

    def __init__(self, vocab_size, positional_encoding, d_model, n_heads, d_queries, d_values, d_inner, n_layers, dropout):
        """
        :param vocab_size: size of the vocabulary, i.e. number of rows in the embedding table and number of logits
        :param positional_encoding: positional encodings, sliced in forward() as [:, :pad_length, :]
        :param d_model: size of vectors throughout the model
        :param n_heads: number of heads in the multi-head attention
        :param d_queries: size of query vectors (and also the size of the key vectors) in the multi-head attention
        :param d_values: size of value vectors in the multi-head attention
        :param d_inner: an intermediate size in the position-wise FC
        :param n_layers: number of decoder layers
        :param dropout: dropout probability
        """
        super(Decoder, self).__init__()

        self.vocab_size = vocab_size
        self.positional_encoding = positional_encoding
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_inner = d_inner
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(vocab_size, d_model)
        # Positional encodings are fixed, not learned
        self.positional_encoding.requires_grad = False
        self.decoder_layers = nn.ModuleList([self.make_decoder_layer() for _ in range(n_layers)])
        self.apply_dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model, vocab_size)

    def make_decoder_layer(self):
        """
        Creates a single decoder layer.

        :return: a ModuleList holding [self-attention sublayer, cross-attention sublayer, position-wise FC sublayer]
        """
        decoder_layer = nn.ModuleList([MultiHeadAttention(d_model=self.d_model,
                                                          n_heads=self.n_heads,
                                                          d_queries=self.d_queries,
                                                          d_values=self.d_values,
                                                          dropout=self.dropout,
                                                          in_decoder=True),
                                       MultiHeadAttention(d_model=self.d_model,
                                                          n_heads=self.n_heads,
                                                          d_queries=self.d_queries,
                                                          d_values=self.d_values,
                                                          dropout=self.dropout,
                                                          in_decoder=True),
                                       PositionWiseFCNetwork(d_model=self.d_model,
                                                             d_inner=self.d_inner,
                                                             dropout=self.dropout)])

        return decoder_layer

    def forward(self, decoder_sequences, decoder_sequence_lengths, encoder_sequences, encoder_sequence_lengths):
        """
        Forward prop.

        :param decoder_sequences: the target language sequences, a tensor of size (N, pad_length)
        :param decoder_sequence_lengths: true lengths of these sequences, a tensor of size (N)
        :param encoder_sequences: encoded source language sequences, a tensor of size (N, encoder_pad_length, d_model)
        :param encoder_sequence_lengths: true lengths of these sequences, a tensor of size (N)
        :return: decoded target language sequences, a tensor of size (N, pad_length, vocab_size)
        """
        pad_length = decoder_sequences.size(1)  # pad-length of this batch only, varies across batches

        # Sum scaled vocab embeddings and position embeddings. The positional encodings are moved to
        # the device of the embeddings, so the module works wherever its weights and inputs live
        # instead of depending on a notebook-level `device` global.
        embedded = self.embedding(decoder_sequences) * math.sqrt(self.d_model)  # (N, pad_length, d_model)
        positions = self.positional_encoding[:, :pad_length, :].to(embedded.device)  # (1, pad_length, d_model)
        decoder_sequences = embedded + positions  # (N, pad_length, d_model)

        # Dropout
        decoder_sequences = self.apply_dropout(decoder_sequences)

        # Decoder layers
        for decoder_layer in self.decoder_layers:
            # Self-attention over the decoder sequences
            decoder_sequences = decoder_layer[0](query_sequences=decoder_sequences,
                                                 key_value_sequences=decoder_sequences,
                                                 key_value_sequence_lengths=decoder_sequence_lengths)  # (N, pad_length, d_model)
            # Cross-attention: decoder positions query the encoder output
            decoder_sequences = decoder_layer[1](query_sequences=decoder_sequences,
                                                 key_value_sequences=encoder_sequences,
                                                 key_value_sequence_lengths=encoder_sequence_lengths)  # (N, pad_length, d_model)
            # Position-wise FC
            decoder_sequences = decoder_layer[2](sequences=decoder_sequences)  # (N, pad_length, d_model)

        # Apply layer-norm
        decoder_sequences = self.layer_norm(decoder_sequences)  # (N, pad_length, d_model)

        # Find logits over vocabulary
        decoder_sequences = self.fc(decoder_sequences)  # (N, pad_length, vocab_size)

        return decoder_sequences


class Transformer(nn.Module):
    """
    Encoder-decoder Transformer: an Encoder and a Decoder built from the same hyper-parameters,
    with the embedding tables and the output projection sharing a single weight matrix.
    """

    def __init__(self, vocab_size, positional_encoding, d_model=512, n_heads=16, d_queries=64, d_values=64,
                 d_inner=2048, n_layers=6, dropout=0.1):
        """
        :param vocab_size: size of the (shared) vocabulary
        :param positional_encoding: positional encodings up to the maximum possible pad-length
        :param d_model: size of vectors throughout the transformer model
        :param n_heads: number of heads in the multi-head attention
        :param d_queries: size of query vectors (and also the size of the key vectors) in the multi-head attention
        :param d_values: size of value vectors in the multi-head attention
        :param d_inner: an intermediate size in the position-wise FC
        :param n_layers: number of layers in the Encoder and Decoder
        :param dropout: dropout probability
        """
        super().__init__()

        # The same settings are stored on this module and handed to both halves of the network
        shared_config = dict(vocab_size=vocab_size,
                             positional_encoding=positional_encoding,
                             d_model=d_model,
                             n_heads=n_heads,
                             d_queries=d_queries,
                             d_values=d_values,
                             d_inner=d_inner,
                             n_layers=n_layers,
                             dropout=dropout)
        for attribute, value in shared_config.items():
            setattr(self, attribute, value)

        self.encoder = Encoder(**shared_config)
        self.decoder = Decoder(**shared_config)

        self.init_weights()

    def init_weights(self):
        """
        Initialize weights in the transformer model.
        """
        # Glorot uniform initialization (gain 1) needs at least two dimensions, so 1-D parameters are skipped
        weight_matrices = (param for param in self.parameters() if param.dim() > 1)
        for matrix in weight_matrices:
            nn.init.xavier_uniform_(matrix, gain=1.)

        # One matrix serves as encoder embedding, decoder embedding and logit layer
        shared_weight = self.encoder.embedding.weight
        nn.init.normal_(shared_weight, mean=0., std=math.pow(self.d_model, -0.5))
        self.decoder.embedding.weight = shared_weight
        self.decoder.fc.weight = shared_weight

        print("Model initialized.")

    def forward(self, encoder_sequences, decoder_sequences, encoder_sequence_lengths, decoder_sequence_lengths):
        """
        Forward propagation.

        :param encoder_sequences: source language sequences, a tensor of size (N, encoder_sequence_pad_length)
        :param decoder_sequences: target language sequences, a tensor of size (N, decoder_sequence_pad_length)
        :param encoder_sequence_lengths: true lengths of source language sequences, a tensor of size (N)
        :param decoder_sequence_lengths: true lengths of target language sequences, a tensor of size (N)
        :return: decoded target language sequences, a tensor of size (N, decoder_sequence_pad_length, vocab_size)
        """
        encoded = self.encoder(encoder_sequences, encoder_sequence_lengths)  # (N, encoder_sequence_pad_length, d_model)

        # The decoder sees the encoder output together with the source lengths for pad-masking
        return self.decoder(decoder_sequences, decoder_sequence_lengths, encoded,
                            encoder_sequence_lengths)  # (N, decoder_sequence_pad_length, vocab_size)


class LabelSmoothedCE(torch.nn.Module):
    """
    Cross-entropy loss with label smoothing, averaged over the non-pad positions of a batch.
    """

    def __init__(self, eps=0.1):
        """
        :param eps: smoothing co-efficient
        """
        super(LabelSmoothedCE, self).__init__()
        self.eps = eps

    def forward(self, inputs, targets, lengths):
        """
        Forward prop.

        :param inputs: decoded target language sequences, a tensor of size (N, pad_length, vocab_size)
        :param targets: gold target language sequences, a tensor of size (N, pad_length)
        :param lengths: true lengths of these sequences, to be able to ignore pads, a tensor of size (N)
        :return: mean label-smoothed cross-entropy loss, a scalar
        """
        # pack_padded_sequence expects the lengths on the CPU
        lengths = lengths.to("cpu")

        # Remove pad-positions and flatten; both tensors are packed with the same lengths, so row k of
        # the packed inputs corresponds to element k of the packed targets
        inputs = pack_padded_sequence(input=inputs, lengths=lengths, batch_first=True,
                                      enforce_sorted=False).data  # (sum(lengths), vocab_size)
        targets = pack_padded_sequence(input=targets, lengths=lengths, batch_first=True,
                                       enforce_sorted=False).data  # (sum(lengths))

        # One-hot targets. zeros_like() already places the tensor on the device of the logits, so no
        # transfer to a notebook-level `device` global is needed (it would break on any other device).
        target_vector = torch.zeros_like(inputs).scatter(dim=1, index=targets.unsqueeze(1),
                                                         value=1.)  # (sum(lengths), n_classes), one-hot
        # Move eps of the probability mass from the gold class to a uniform distribution over all classes
        target_vector = target_vector * (1. - self.eps) + self.eps / target_vector.size(1)  # (sum(lengths), n_classes), "smoothed" one-hot

        # Compute smoothed cross-entropy loss
        loss = (-1 * target_vector * F.log_softmax(inputs, dim=1)).sum(dim=1)  # (sum(lengths))

        # Compute mean loss
        return torch.mean(loss)


In [None]:
import numpy as np

# Number of token ids; the vocabMap table in this notebook assigns ids 0 through 233.
vocab_size = 234

# Define the dimensionality of the character embedding
embedding_dim = 32

# Create a random character embedding matrix of shape (vocab_size, embedding_dim).
# np.random.rand draws uniform samples from [0, 1); no seed is set in this cell, so values differ between runs.
# NOTE(review): embedding_matrix is not referenced in the visible part of the notebook - confirm it is still needed.
embedding_matrix = np.random.rand(vocab_size, embedding_dim)

In [None]:
#@title Vocab Map
vocabMap = {"<PAD>": 0, " ": 1, "<BOS>": 2, "<EOS>": 3, "ህ": 4,"ል": 5, "ም": 6, "ር": 7, "ስ": 8, "ሽ": 9, "ቅ": 10, "ብ": 11, "ቭ": 12, "ት": 13, "ች": 14, "ን": 15, "ኝ": 16, "ክ": 17, "ው": 18, "ዝ": 19, "ዥ": 20, "ይ": 21, "ድ": 22, "ጅ": 23, "ግ": 24, "ጥ": 25, "ጭ": 26, "ጵ": 27, "ጽ": 28, "ፍ": 29, "ፕ": 30, "ኧ": 31, "አ": 32, "ኡ": 33, "ኢ": 34, "ኤ": 35, "እ": 36, "ኦ": 37, "ኸ": 38, "ሀ": 39, "ሁ": 40, "ሂ": 41, "ሄ": 42, "ሆ": 43, "ለ": 44, "ሉ": 45, "ሊ": 46, "ላ": 47, "ሌ": 48, "ሎ": 49, "መ": 50, "ሙ": 51, "ሚ": 52, "ማ": 53, "ሜ": 54, "ሞ": 55, "ረ": 56, "ሩ": 57, "ሪ": 58, "ራ": 59, "ሬ": 60, "ሮ": 61, "ሰ": 62, "ሱ": 63, "ሲ": 64, "ሳ": 65, "ሴ": 66, "ሶ": 67, "ሸ": 68, "ሹ": 69, "ሺ": 70, "ሻ": 71, "ሼ": 72, "ሾ": 73, "ቀ": 74, "ቁ": 75, "ቂ": 76, "ቃ": 77, "ቄ": 78, "ቆ": 79, "በ": 80, "ቡ": 81, "ቢ": 82, "ባ": 83, "ቤ": 84, "ቦ": 85, "ቨ": 86, "ቩ": 87, "ቪ": 88, "ቫ": 89, "ቬ": 90, "ቮ": 91, "ተ": 92, "ቱ": 93, "ቲ": 94, "ታ": 95, "ቴ": 96, "ቶ": 97, "ቸ": 98, "ቹ": 99, "ቺ": 100, "ቻ": 101, "ቼ": 102, "ቾ": 103, "ነ": 104, "ኑ": 105, "ኒ": 106, "ና": 107, "ኔ": 108, "ኖ": 109, "ኘ": 110, "ኙ": 111, "ኚ": 112, "ኛ": 113, "ኜ": 114, "ኞ": 115, "ከ": 116, "ኩ": 117, "ኪ": 118, "ካ": 119, "ኬ": 120, "ኮ": 121, "ወ": 122, "ዉ": 123, "ዊ": 124, "ዋ": 125, "ዌ": 126, "ዎ": 127, "ዘ": 128, "ዙ": 129, "ዚ": 130, "ዛ": 131, "ዜ": 132, "ዞ": 133, "ዠ": 134, "ዡ": 135, "ዢ": 136, "ዣ": 137, "ዤ": 138, "ዦ": 139, "የ": 140, "ዩ": 141, "ዪ": 142, "ያ": 143, "ዬ": 144, "ዮ": 145, "ደ": 146, "ዱ": 147, "ዲ": 148, "ዳ": 149, "ዴ": 150, "ዶ": 151, "ጀ": 152, "ጁ": 153, "ጂ": 154, "ጃ": 155, "ጄ": 156, "ጆ": 157, "ገ": 158, "ጉ": 159, "ጊ": 160, "ጋ": 161, "ጌ": 162, "ጐ": 163, "ጠ": 164, "ጡ": 165, "ጢ": 166, "ጣ": 167, "ጤ": 168, "ጦ": 169, "ጨ": 170, "ጩ": 171, "ጪ": 172, "ጫ": 173, "ጬ": 174, "ጮ": 175, "ጰ": 176, "ጱ": 177, "ጲ": 178, "ጳ": 179, "ጴ": 180, "ጶ": 181, "ጸ": 182, "ጹ": 183, "ጺ": 184, "ጻ": 185, "ጼ": 186, "ጾ": 187, "ፈ": 188, "ፉ": 189, "ፊ": 190, "ፋ": 191, "ፌ": 192, "ፎ": 193, "ፐ": 194, "ፑ": 195, "ፒ": 196, "ፓ": 197, "ፔ": 198, "ፖ": 199, "ኋ": 200, "ሏ": 201, "ሟ": 202, "ሯ": 203, "ሷ": 204, "ሿ": 205, "ቋ": 206, "ቧ": 207, "ቯ": 
208, "ቷ": 209, "ቿ": 210, "ኗ": 211, "ኟ": 212, "ኳ": 213, "ዟ": 214, "ዧ": 215, "ዷ": 216, "ጇ": 217, "ጓ": 218, "ጧ": 219, "ጯ": 220, "ጷ": 221, "ጿ": 222, "ፏ": 223, "ፗ": 224, "ጔ": 225, "ኴ": 226, "ኌ": 227, "ቌ": 228, "ጒ": 229, "ኲ": 230, "ኊ": 231, "ቊ": 232, "\n": 233}

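# Reverse lookup (id -> character) used for decoding. Ids 0 (<PAD>), 2 (<BOS>) and 3 (<EOS>) are intentionally left out, so decodeString() turns them into "" and they are dropped from the output.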
idToChar = {1:" ", 4: 'ህ', 5: 'ል', 6: 'ም', 7: 'ር', 8: 'ስ', 9: 'ሽ', 10: 'ቅ', 11: 'ብ', 12: 'ቭ', 13: 'ት', 14: 'ች', 15: 'ን', 16: 'ኝ', 17: 'ክ', 18: 'ው', 19: 'ዝ', 20: 'ዥ', 21: 'ይ', 22: 'ድ', 23: 'ጅ', 24: 'ግ', 25: 'ጥ', 26: 'ጭ', 27: 'ጵ', 28: 'ጽ', 29: 'ፍ', 30: 'ፕ', 31: 'ኧ', 32: 'አ', 33: 'ኡ', 34: 'ኢ', 35: 'ኤ', 36: 'እ', 37: 'ኦ', 38: 'ኸ', 39: 'ሀ', 40: 'ሁ', 41: 'ሂ', 42: 'ሄ', 43: 'ሆ', 44: 'ለ', 45: 'ሉ', 46: 'ሊ', 47: 'ላ', 48: 'ሌ', 49: 'ሎ', 50: 'መ', 51: 'ሙ', 52: 'ሚ', 53: 'ማ', 54: 'ሜ', 55: 'ሞ', 56: 'ረ', 57: 'ሩ', 58: 'ሪ', 59: 'ራ', 60: 'ሬ', 61: 'ሮ', 62: 'ሰ', 63: 'ሱ', 64: 'ሲ', 65: 'ሳ', 66: 'ሴ', 67: 'ሶ', 68: 'ሸ', 69: 'ሹ', 70: 'ሺ', 71: 'ሻ', 72: 'ሼ', 73: 'ሾ', 74: 'ቀ', 75: 'ቁ', 76: 'ቂ', 77: 'ቃ', 78: 'ቄ', 79: 'ቆ', 80: 'በ', 81: 'ቡ', 82: 'ቢ', 83: 'ባ', 84: 'ቤ', 85: 'ቦ', 86: 'ቨ', 87: 'ቩ', 88: 'ቪ', 89: 'ቫ', 90: 'ቬ', 91: 'ቮ', 92: 'ተ', 93: 'ቱ', 94: 'ቲ', 95: 'ታ', 96: 'ቴ', 97: 'ቶ', 98: 'ቸ', 99: 'ቹ', 100: 'ቺ', 101: 'ቻ', 102: 'ቼ', 103: 'ቾ', 104: 'ነ', 105: 'ኑ', 106: 'ኒ', 107: 'ና', 108: 'ኔ', 109: 'ኖ', 110: 'ኘ', 111: 'ኙ', 112: 'ኚ', 113: 'ኛ', 114: 'ኜ', 115: 'ኞ', 116: 'ከ', 117: 'ኩ', 118: 'ኪ', 119: 'ካ', 120: 'ኬ', 121: 'ኮ', 122: 'ወ', 123: 'ዉ', 124: 'ዊ', 125: 'ዋ', 126: 'ዌ', 127: 'ዎ', 128: 'ዘ', 129: 'ዙ', 130: 'ዚ', 131: 'ዛ', 132: 'ዜ', 133: 'ዞ', 134: 'ዠ', 135: 'ዡ', 136: 'ዢ', 137: 'ዣ', 138: 'ዤ', 139: 'ዦ', 140: 'የ', 141: 'ዩ', 142: 'ዪ', 143: 'ያ', 144: 'ዬ', 145: 'ዮ', 146: 'ደ', 147: 'ዱ', 148: 'ዲ', 149: 'ዳ', 150: 'ዴ', 151: 'ዶ', 152: 'ጀ', 153: 'ጁ', 154: 'ጂ', 155: 'ጃ', 156: 'ጄ', 157: 'ጆ', 158: 'ገ', 159: 'ጉ', 160: 'ጊ', 161: 'ጋ', 162: 'ጌ', 163: 'ጐ', 164: 'ጠ', 165: 'ጡ', 166: 'ጢ', 167: 'ጣ', 168: 'ጤ', 169: 'ጦ', 170: 'ጨ', 171: 'ጩ', 172: 'ጪ', 173: 'ጫ', 174: 'ጬ', 175: 'ጮ', 176: 'ጰ', 177: 'ጱ', 178: 'ጲ', 179: 'ጳ', 180: 'ጴ', 181: 'ጶ', 182: 'ጸ', 183: 'ጹ', 184: 'ጺ', 185: 'ጻ', 186: 'ጼ', 187: 'ጾ', 188: 'ፈ', 189: 'ፉ', 190: 'ፊ', 191: 'ፋ', 192: 'ፌ', 193: 'ፎ', 194: 'ፐ', 195: 'ፑ', 196: 'ፒ', 197: 'ፓ', 198: 'ፔ', 199: 'ፖ', 200: 'ኋ', 201: 'ሏ', 202: 'ሟ', 203: 'ሯ', 204: 'ሷ', 205: 'ሿ', 206: 'ቋ', 207: 'ቧ', 208: 'ቯ', 209: 'ቷ', 210: 'ቿ', 211: 'ኗ', 
212: 'ኟ', 213: 'ኳ', 214: 'ዟ', 215: 'ዧ', 216: 'ዷ', 217: 'ጇ', 218: 'ጓ', 219: 'ጧ', 220: 'ጯ', 221: 'ጷ', 222: 'ጿ', 223: 'ፏ', 224: 'ፗ', 225: 'ጔ', 226: 'ኴ', 227: 'ኌ', 228: 'ቌ', 229: 'ጒ', 230: 'ኲ', 231: 'ኊ', 232: 'ቊ', 233: "\n"}

def encodeString(sentence: str):
  """
  Convert a sentence to a list of vocabulary ids, one id per character.

  Characters missing from vocabMap are encoded as id 1.

  :param sentence: the text to encode
  :return: list of integer ids, same length as the sentence
  :raises Exception: if the sentence contains no characters
  """
  characters = list(sentence)
  if not characters:
    raise Exception("Empty sentence")

  ids = []
  for character in characters:
    ids.append(vocabMap.get(character, 1))
  return ids

def encodeSentences(sentences, bos: bool, eos: bool):
  """
  Encode every sentence with encodeString() and optionally wrap it in <BOS>/<EOS> ids.

  :param sentences: iterable of sentences to encode
  :param bos: whether to put the <BOS> id in front of each encoded sentence
  :param eos: whether to put the <EOS> id at the end of each encoded sentence
  :return: list with one list of ids per sentence
  """
  encoded = []
  for sentence in sentences:
    ids = encodeString(sentence)
    encoded.append(addEOSBOS(ids, bos, eos))
  return encoded

def addEOSBOS(splitted: list, bos:bool, eos:bool):
  """
  Add the <BOS> id (2) at the front and/or the <EOS> id (3) at the end of a list of ids.

  The list is modified in place and the same list object is returned.

  :param splitted: list of ids to modify
  :param bos: whether to insert id 2 at position 0
  :param eos: whether to append id 3
  :return: the modified input list
  """
  if bos:
    splitted[:0] = [2]
  if eos:
    splitted.extend([3])
  return splitted

def decodeString(prediction:list):
  """
  Turn a sequence of ids back into text.

  Ids without an entry in idToChar contribute nothing to the result.

  :param prediction: iterable of integer ids
  :return: the decoded string
  """
  characters = (idToChar.get(token_id, "") for token_id in prediction)
  return "".join(characters)

In [None]:
#@title Positional Encoder
def get_positional_encoding(d_model, max_length=100):
    """
    Builds the sinusoidal positional encoding table.

    Even columns hold the sine and odd columns the cosine of position / 10000^(2k / d_model),
    where 2k is the even index of the column pair.

    :param d_model: size of vectors throughout the transformer model
    :param max_length: maximum sequence length up to which positional encodings must be calculated
    :return: positional encoding, a tensor of size (1, max_length, d_model)
    """
    # One divisor per column; an odd column reuses the divisor of the even column before it
    divisors = [math.pow(10000, (column - column % 2) / d_model) for column in range(d_model)]

    table = []
    for position in range(max_length):
        row = []
        for column, divisor in enumerate(divisors):
            wave = math.sin if column % 2 == 0 else math.cos
            row.append(wave(position / divisor))
        table.append(row)

    # reshape keeps the (max_length, d_model) shape even when the table is empty
    encoding = torch.tensor(table).reshape(max_length, d_model)  # (max_length, d_model)

    return encoding.unsqueeze(0)  # (1, max_length, d_model)


def get_lr(step, d_model, warmup_steps):
    """
    Learning rate for a given training step: grows linearly with the step during warm-up,
    then decays with the inverse square root of the step.

    :param step: current training step
    :param d_model: size of vectors throughout the transformer model
    :param warmup_steps: number of warm-up steps
    :return: the learning rate for this step
    """
    scale = 2. * math.pow(d_model, -0.5)
    decay_term = math.pow(step, -0.5)
    warmup_term = step * math.pow(warmup_steps, -1.5)
    # Whichever term is smaller applies: the warm-up ramp early on, the decay afterwards
    return scale * min(decay_term, warmup_term)


def save_checkpoint(epoch, model, optimizer, prefix='', filename=None):
    """
    Save a training checkpoint with torch.save().

    The checkpoint is a dict with the keys 'epoch', 'model' and 'optimizer'; the model and optimizer
    objects themselves are stored, not their state dicts.

    :param epoch: epoch number to record in the checkpoint
    :param model: the model object to store
    :param optimizer: the optimizer object to store
    :param prefix: accepted for compatibility with existing callers; it is not used
    :param filename: path to write the checkpoint to; when None, the notebook's Google Drive
        checkpoint path is used
    """
    if filename is None:
        # Default location. Other paths previously used here, kept for reference:
        #   /content/drive/MyDrive/MTModel/seq2seq2_finetuned_1.pth.tar
        #   /content/drive/MyDrive/MTModel/seq2seq_last.pth.tar
        filename = "/content/drive/MyDrive/MTModel/seq2seq_last_verr_0.902_new.pth.tar"

    state = {'epoch': epoch,
             'model': model,
             'optimizer': optimizer}
    torch.save(state, filename)


def change_lr(optimizer, new_lr):
    """
    Set the learning rate of every parameter group of an optimizer.

    :param optimizer: object whose param_groups is an iterable of dicts with an 'lr' entry
    :param new_lr: the learning rate to write into each group
    """
    for group in optimizer.param_groups:
        group.update(lr=new_lr)


class AverageMeter(object):
    """
    Tracks the most recent value, the weighted sum, the total weight and the running average
    of a quantity that is updated repeatedly.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set all tracked statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """
        Record a new value.

        :param val: the value to record
        :param n: weight of the value, i.e. how many items it represents
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # Running average over everything recorded since the last reset
        self.avg = self.sum / self.count


In [None]:
!cp /content/drive/MyDrive/train.join .
!cp /content/drive/MyDrive/train.spa .
!cp /content/drive/MyDrive/val.join . && cp /content/drive/MyDrive/val.spa .
# !cp train.join /content/drive/MyDrive/ && cp train.spa /content/drive/MyDrive/
# !cp val.join /content/drive/MyDrive/ && cp val.spa /content/drive/MyDrive/

In [None]:
# !cp /content/drive/MyDrive/train.txt
# !cp train.join /content/drive/MyDrive/train.join && cp train.spa /content/drive/MyDrive/train.spa

Data Loader

In [None]:
#@title Sequence Loader
class SequenceLoader(object):
    """
    Iterator over batches of (source, target) sequence pairs.

    Reads "<data_folder><split>.<source_suffix>" and "<data_folder><split>.<target_suffix>"
    (one sequence per line) and serves them as padded LongTensors.

    For the "train" split, pairs are grouped by encoded target length, cut into batches of roughly
    `tokens_in_batch` target tokens, and the batches are shuffled on every create_batches() call.
    For the other splits every batch holds exactly one pair, in file order.

    Relies on `encodeSentences` and `vocabMap`, which are defined elsewhere in this notebook.
    """

    def __init__(self, data_folder, source_suffix, target_suffix, split, tokens_in_batch):
        """
        :param data_folder: prefix of the data files; it is concatenated directly with the file name
        :param source_suffix: extension of the source-language file
        :param target_suffix: extension of the target-language file
        :param split: one of 'train', 'val', 'test' (case-insensitive for the check; the file name
                      is built from the value exactly as passed)
        :param tokens_in_batch: batch size in target-language tokens
        """
        self.tokens_in_batch = tokens_in_batch
        self.source_suffix = source_suffix
        self.target_suffix = target_suffix
        assert split.lower() in {"train", "val",
                                 "test"}, "'split' must be one of 'train', 'val', 'test'! (case-insensitive)"
        self.split = split.lower()

        # Is this for training?
        self.for_training = self.split == "train"

        # Load data
        print("Path: ", data_folder+ split+"."+source_suffix)
        source_data = self._read_lines(data_folder + split + "." + source_suffix)
        target_data = self._read_lines(data_folder + split + "." + target_suffix)
        print("Source data:", len(source_data), " Target Data: ", len(target_data))
        assert len(source_data) == len(target_data), "There are a different number of source or target sequences!"

        # Encoded lengths drive the batching; target sequences are encoded with <BOS> and <EOS>
        source_lengths = [len(s) for s in encodeSentences(source_data, bos=False, eos=False)]
        target_lengths = [len(t) for t in encodeSentences(target_data, bos=True, eos=True)]
        self.data = list(zip(source_data, target_data, source_lengths, target_lengths))

        # If for training, pre-sort by target lengths - required for itertools.groupby() later
        if self.for_training:
            self.data.sort(key=lambda x: x[3])

        # Create batches
        self.create_batches()

    @staticmethod
    def _read_lines(path):
        """
        Read a UTF-8 text file and return its lines (split on "\\n" only).

        Only the empty string produced by a trailing newline is dropped, so the last sequence
        is kept even when the file does not end with a newline.
        """
        with open(path, "r", encoding="utf-8") as f:
            lines = f.read().split("\n")
        if lines and lines[-1] == "":
            lines.pop()
        return lines

    def create_batches(self):
        """
        Prepares batches for one epoch and rewinds the iterator.
        """
        if self.for_training:
            # Group or chunk based on target sequence lengths (self.data is sorted by that key)
            chunks = [list(g) for _, g in groupby(self.data, key=lambda x: x[3])]

            # Create batches, each with the same target sequence length
            self.all_batches = list()
            for chunk in chunks:
                # Sort inside chunk by source sequence lengths, so that a batch also has similar source lengths
                chunk.sort(key=lambda x: x[2])
                # Sequences per batch = token budget // target length of this chunk.
                # Clamp to at least 1 so a target longer than the budget still forms a one-pair batch
                # (a step of 0 would make range() raise), and guard against a zero target length.
                seqs_per_batch = max(1, self.tokens_in_batch // max(1, chunk[0][3]))
                # Split chunk into batches
                self.all_batches.extend([chunk[i: i + seqs_per_batch] for i in range(0, len(chunk), seqs_per_batch)])

            # Shuffle batches
            shuffle(self.all_batches)
        else:
            # Simply return one pair at a time
            self.all_batches = [[d] for d in self.data]

        self.n_batches = len(self.all_batches)
        self.current_batch = -1

    def __iter__(self):
        """
        Iterators require this method defined.
        """
        return self

    def __next__(self):
        """
        Return the next batch as (source_data, target_data, source_lengths, target_lengths).

        source_data / target_data are LongTensors padded with vocabMap['<PAD>'] (batch first);
        the two length tensors hold the encoded lengths computed in __init__.

        :raises StopIteration: once all batches have been served
        """
        # Update current batch index
        self.current_batch += 1
        try:
            batch = self.all_batches[self.current_batch]
        # Stop iteration once all batches are iterated through
        except IndexError:
            raise StopIteration
        source_data, target_data, source_lengths, target_lengths = zip(*batch)

        # Encode the raw sentences of this batch
        source_data = encodeSentences(source_data, bos=False, eos=False)
        target_data = encodeSentences(target_data, bos=True, eos=True)

        # Convert source and target sequences as padded tensors
        source_data = pad_sequence(sequences=[torch.LongTensor(s) for s in source_data],
                                   batch_first=True,
                                   padding_value=vocabMap['<PAD>'])
        target_data = pad_sequence(sequences=[torch.LongTensor(t) for t in target_data],
                                   batch_first=True,
                                   padding_value=vocabMap['<PAD>'])

        # Convert lengths to tensors
        source_lengths = torch.LongTensor(source_lengths)
        target_lengths = torch.LongTensor(target_lengths)

        return source_data, target_data, source_lengths, target_lengths

Prepare Dataset

Train

In [None]:
 #@title Training
# Data parameters
data_folder = './'  # folder with data files (note: main() passes "./" literally to the loaders)

# Model parameters (all passed to Transformer(...) in main())
d_model = 256
n_heads = 12
d_queries = 32
d_values = 32
d_inner = 2048
n_layers = 8
dropout = 0.1
positional_encoding = get_positional_encoding(d_model=d_model, max_length=375)  # positional encodings up to the maximum possible pad-length

# Training parameters
# checkpoint: None makes main() build a new model; otherwise it is a path handed to torch.load()
checkpoint =  None #"/content/drive/MyDrive/MTModel/seq2seq2.pth.tar" # "/content/drive/MyDrive/MTModel/seq2seq_last_verr_0.902_improved.pth.tar"
tokens_in_batch = 6000  # batch size in target language tokens
batches_per_step = 25000 // tokens_in_batch  # perform a training step, i.e. update parameters, once every so many batches
print_frequency = 20  # print status once every so many steps
n_steps = 1000  # used by main() to derive the nominal number of epochs
warmup_steps = 800  # warm-up length of the learning-rate schedule in get_lr()
step = 1  # step the initial learning rate is computed for
lr = get_lr(step=step, d_model=d_model, warmup_steps=warmup_steps)  # initial learning rate handed to Adam
print("\nLearning Rate: ", lr)
start_epoch = 0  # replaced by main() when a checkpoint is loaded
betas = (0.9, 0.98)  # Adam betas
epsilon = 1e-9  # Adam eps
label_smoothing = 0.1  # passed as eps to LabelSmoothedCE
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: ", device)
cudnn.benchmark = False


def main():
    """
    Training and validation.

    Builds the train/val loaders, creates a new Transformer and Adam optimizer (or restores both from
    the checkpoint file named by the global `checkpoint`), then runs one training pass and one
    validation pass per epoch and saves a checkpoint after each epoch.

    Reads the module-level training configuration and publishes `step`, `start_epoch`, `epoch` and
    `epochs` as globals (train() reads `epochs`).

    Note: the epoch loop runs up to `epochs + 100`, i.e. well past the epoch count derived from `n_steps`.
    """
    global checkpoint, step, start_epoch, epoch, epochs

    # Initialize data-loaders
    train_loader = SequenceLoader(data_folder="./",
                                  source_suffix="join",
                                  target_suffix="spa",
                                  split="train",
                                  tokens_in_batch=tokens_in_batch)
    val_loader = SequenceLoader(data_folder="./",
                                source_suffix="join",
                                target_suffix="spa",
                                split="val",
                                tokens_in_batch=tokens_in_batch)

    # Initialize model or load checkpoint
    if checkpoint is None:
        model = Transformer(vocab_size=274,
                            positional_encoding=positional_encoding,
                            d_model=d_model,
                            n_heads=n_heads,
                            d_queries=d_queries,
                            d_values=d_values,
                            d_inner=d_inner,
                            n_layers=n_layers,
                            dropout=dropout)
        optimizer = torch.optim.Adam(params=[p for p in model.parameters() if p.requires_grad],
                                     lr=lr,
                                     betas=betas,
                                     eps=epsilon)

    else:
        # Load into a local so the global `checkpoint` keeps holding the path and main() can be re-run.
        # map_location lets a checkpoint written on a GPU be restored on a CPU-only runtime.
        # NOTE(review): torch.load unpickles whole objects here - only load checkpoints you trust.
        state = torch.load(checkpoint, map_location=device)
        start_epoch = state['epoch'] + 1
        print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
        model = state['model']
        optimizer = state['optimizer']

    # Loss function
    criterion = LabelSmoothedCE(eps=label_smoothing)

    # Move to default device
    model = model.to(device)
    criterion = criterion.to(device)

    # Find total epochs to train
    epochs = (n_steps // (train_loader.n_batches // batches_per_step)) + 1

    # Epochs
    for epoch in range(start_epoch, epochs+100):
        # Number of optimizer steps completed before this epoch
        step = epoch * train_loader.n_batches // batches_per_step

        # The learning rate lives in optimizer.param_groups and is updated by change_lr() inside
        # train(); setting an `lr` attribute on the optimizer object has no effect, so none is set here.

        # One epoch's training
        train_loader.create_batches()
        train(train_loader=train_loader,
              model=model,
              criterion=criterion,
              optimizer=optimizer,
              epoch=epoch,
              step=step)

        # One epoch's validation
        val_loader.create_batches()
        validate(val_loader=val_loader,
                 model=model,
                 criterion=criterion)

        # Save checkpoint
        save_checkpoint(epoch, model, optimizer)


def train(train_loader, model, criterion, optimizer, epoch, step):
    """
    One epoch's training.

    Gradients are accumulated over `batches_per_step` batches before each optimizer update, and the
    learning rate is re-computed with get_lr() after every update.

    :param train_loader: loader for training data
    :param model: model
    :param criterion: label-smoothed cross-entropy loss
    :param optimizer: optimizer
    :param epoch: epoch number
    :param step: number of optimizer steps completed before this epoch; incremented locally
    :raises Exception: if the forward pass fails or the loss is NaN (the source batch is attached)
    """
    model.train()
    # Track some metrics
    data_time = AverageMeter()  # data loading time
    step_time = AverageMeter()  # forward prop. + back prop. time
    losses = AverageMeter()  # loss

    # Starting time
    start_data_time = time.time()
    start_step_time = time.time()

    # Batches
    for i, (source_sequences, target_sequences, source_sequence_lengths, target_sequence_lengths) in enumerate(
            train_loader):

        # Move to default device
        source_sequences = source_sequences.to(device)  # (N, max_source_sequence_pad_length_this_batch)
        target_sequences = target_sequences.to(device)  # (N, max_target_sequence_pad_length_this_batch)
        source_sequence_lengths = source_sequence_lengths.to(device)  # (N)
        target_sequence_lengths = target_sequence_lengths.to(device)  # (N)

        # Time taken to load data
        data_time.update(time.time() - start_data_time)

        try:
            # Forward prop.
            predicted_sequences = model(source_sequences, target_sequences, source_sequence_lengths,
                                        target_sequence_lengths)  # (N, max_target_sequence_pad_length_this_batch, vocab_size)
        except Exception as error:
            # Report the offending batch, keeping the original failure as the cause.
            # `except Exception` (not a bare except) lets KeyboardInterrupt pass through untouched.
            raise Exception("Source Sequence: ", list(source_sequences)) from error

        # Note: If the target sequence is "<BOS> w1 w2 ... wN <EOS> <PAD> <PAD> <PAD> <PAD> ..."
        # we should consider only "w1 w2 ... wN <EOS>" as <BOS> is not predicted
        # Therefore, pads start after (length - 1) positions
        loss = criterion(inputs=predicted_sequences,
                         targets=target_sequences[:, 1:],
                         lengths=target_sequence_lengths - 1)  # scalar

        # NaN never compares equal to anything (including itself), so it has to be tested with isnan()
        if torch.isnan(loss).any():
            print("Source Sequence: ", list(source_sequences))
            print("Target Sequence: ", list(target_sequences))
            raise Exception("Source Sequence: ", list(source_sequences))

        # Backward prop. (scaled so the accumulated gradient is an average over the batches of one step)
        (loss / batches_per_step).backward()

        # Keep track of losses
        losses.update(loss.item(), (target_sequence_lengths - 1).sum().item())

        # Update model (i.e. perform a training step) only after gradients are accumulated from batches_per_step batches
        if (i + 1) % batches_per_step == 0:
            optimizer.step()
            optimizer.zero_grad()

            # This step is now complete
            step += 1

            # Update learning rate after each step
            change_lr(optimizer, new_lr=get_lr(step=step, d_model=d_model, warmup_steps=warmup_steps))

            # Time taken for this training step
            step_time.update(time.time() - start_step_time)

            # Print status
            if step % print_frequency == 0:
                print('Epoch {0}/{1}-----'
                      'Batch {2}/{3}-----'
                      'Step {4}/{5}-----'
                      'Data Time {data_time.val:.3f} ({data_time.avg:.3f})-----'
                      'Step Time {step_time.val:.3f} ({step_time.avg:.3f})-----'
                      'Loss {losses.val:.4f} ({losses.avg:.4f})'.format(epoch + 1, epochs,
                                                                        i + 1, train_loader.n_batches,
                                                                        step, n_steps,
                                                                        step_time=step_time,
                                                                        data_time=data_time,
                                                                        losses=losses))

            # Reset step time
            start_step_time = time.time()

            # If this is the last one or two epochs, save checkpoints at regular intervals for averaging
            if epoch in [epochs - 1, epochs - 2] and step % 1500 == 0:  # 'epoch' is 0-indexed
                save_checkpoint(epoch, model, optimizer, prefix='step' + str(step) + "_")

        # Reset data time
        start_data_time = time.time()


def validate(val_loader, model, criterion):
    """
    One epoch's validation: runs the model over the validation loader without gradients
    and prints the token-weighted average loss.

    :param val_loader: loader for validation data
    :param model: model
    :param criterion: label-smoothed cross-entropy loss
    """
    model.eval()
    with torch.no_grad():
        losses = AverageMeter()
        for batch in tqdm(val_loader, total=val_loader.n_batches):
            # Move the four tensors of the batch to the default device, in loader order
            source_sequence, target_sequence, source_sequence_length, target_sequence_length = (
                tensor.to(device) for tensor in batch)

            # Forward prop.
            predicted_sequence = model(source_sequence, target_sequence, source_sequence_length,
                                       target_sequence_length)

            # <BOS> is never predicted, so targets are shifted by one and lengths shrink by one
            predicted_lengths = target_sequence_length - 1
            loss = criterion(inputs=predicted_sequence,
                             targets=target_sequence[:, 1:],
                             lengths=predicted_lengths)

            # Weight the batch loss by the number of predicted tokens
            losses.update(loss.item(), predicted_lengths.sum().item())

        print("\nValidation loss: %.3f\n\n" % losses.avg)


# if __name__ == '__main__':
# Start training as soon as this cell is executed.
main()


Learning Rate:  5.5242717280199024e-06
Device:  cuda
Path:  ./train.join
Source data: 24978  Target Data:  24978
Path:  ./val.join
Source data: 358  Target Data:  358
Model initialized.
Epoch 1/4-----Batch 80/1203-----Step 20/1000-----Data Time 0.004 (0.005)-----Step Time 15.292 (14.277)-----Loss 4.5889 (5.3500)
Epoch 1/4-----Batch 160/1203-----Step 40/1000-----Data Time 0.007 (0.005)-----Step Time 16.750 (14.192)-----Loss 4.3384 (4.8893)
Epoch 1/4-----Batch 240/1203-----Step 60/1000-----Data Time 0.004 (0.005)-----Step Time 14.979 (14.071)-----Loss 4.0651 (4.6646)
Epoch 1/4-----Batch 320/1203-----Step 80/1000-----Data Time 0.001 (0.005)-----Step Time 9.250 (13.753)-----Loss 3.8784 (4.5195)
Epoch 1/4-----Batch 400/1203-----Step 100/1000-----Data Time 0.004 (0.005)-----Step Time 8.767 (13.685)-----Loss 3.7630 (4.3937)
Epoch 1/4-----Batch 480/1203-----Step 120/1000-----Data Time 0.002 (0.005)-----Step Time 9.718 (13.747)-----Loss 3.6135 (4.2882)
Epoch 1/4-----Batch 560/1203-----Step 140

100%|██████████| 358/358 [00:10<00:00, 32.99it/s]



Validation loss: 3.570


Epoch 2/4-----Batch 80/1203-----Step 320/1000-----Data Time 0.007 (0.005)-----Step Time 15.209 (13.727)-----Loss 3.4939 (3.4702)
Epoch 2/4-----Batch 160/1203-----Step 340/1000-----Data Time 0.003 (0.005)-----Step Time 10.841 (13.373)-----Loss 3.5044 (3.4631)
Epoch 2/4-----Batch 240/1203-----Step 360/1000-----Data Time 0.004 (0.005)-----Step Time 12.780 (13.524)-----Loss 3.3263 (3.4397)
Epoch 2/4-----Batch 320/1203-----Step 380/1000-----Data Time 0.007 (0.005)-----Step Time 11.413 (13.799)-----Loss 3.3770 (3.4157)
Epoch 2/4-----Batch 400/1203-----Step 400/1000-----Data Time 0.002 (0.005)-----Step Time 12.325 (13.769)-----Loss 3.1852 (3.3903)
Epoch 2/4-----Batch 480/1203-----Step 420/1000-----Data Time 0.004 (0.005)-----Step Time 12.340 (13.671)-----Loss 3.2552 (3.3685)
Epoch 2/4-----Batch 560/1203-----Step 440/1000-----Data Time 0.004 (0.005)-----Step Time 14.922 (13.719)-----Loss 3.1457 (3.3431)
Epoch 2/4-----Batch 640/1203-----Step 460/1000-----Data Time 0.00

100%|██████████| 358/358 [00:11<00:00, 31.46it/s]



Validation loss: 2.979


Epoch 3/4-----Batch 76/1203-----Step 620/1000-----Data Time 0.002 (0.005)-----Step Time 11.182 (14.072)-----Loss 2.9864 (2.9101)
Epoch 3/4-----Batch 156/1203-----Step 640/1000-----Data Time 0.002 (0.005)-----Step Time 14.147 (14.341)-----Loss 2.7654 (2.8935)
Epoch 3/4-----Batch 236/1203-----Step 660/1000-----Data Time 0.007 (0.005)-----Step Time 15.724 (14.021)-----Loss 2.7798 (2.8744)
Epoch 3/4-----Batch 316/1203-----Step 680/1000-----Data Time 0.005 (0.005)-----Step Time 14.142 (13.826)-----Loss 2.7872 (2.8574)
Epoch 3/4-----Batch 396/1203-----Step 700/1000-----Data Time 0.004 (0.005)-----Step Time 14.788 (13.896)-----Loss 2.8241 (2.8409)
Epoch 3/4-----Batch 476/1203-----Step 720/1000-----Data Time 0.005 (0.005)-----Step Time 15.175 (13.962)-----Loss 2.7052 (2.8261)
Epoch 3/4-----Batch 556/1203-----Step 740/1000-----Data Time 0.005 (0.005)-----Step Time 14.969 (13.990)-----Loss 2.6965 (2.8098)
Epoch 3/4-----Batch 636/1203-----Step 760/1000-----Data Time 0.00

100%|██████████| 358/358 [00:11<00:00, 32.37it/s]



Validation loss: 1.957


Epoch 4/4-----Batch 72/1203-----Step 920/1000-----Data Time 0.006 (0.005)-----Step Time 14.768 (13.663)-----Loss 2.3719 (2.2987)
Epoch 4/4-----Batch 152/1203-----Step 940/1000-----Data Time 0.003 (0.005)-----Step Time 7.938 (13.414)-----Loss 2.4402 (2.2751)
Epoch 4/4-----Batch 232/1203-----Step 960/1000-----Data Time 0.007 (0.005)-----Step Time 14.292 (13.729)-----Loss 2.1795 (2.2421)
Epoch 4/4-----Batch 312/1203-----Step 980/1000-----Data Time 0.004 (0.005)-----Step Time 15.417 (13.776)-----Loss 2.3009 (2.2146)
Epoch 4/4-----Batch 392/1203-----Step 1000/1000-----Data Time 0.005 (0.005)-----Step Time 14.163 (13.612)-----Loss 2.0301 (2.1903)
Epoch 4/4-----Batch 472/1203-----Step 1020/1000-----Data Time 0.005 (0.005)-----Step Time 13.568 (13.639)-----Loss 2.0572 (2.1707)
Epoch 4/4-----Batch 552/1203-----Step 1040/1000-----Data Time 0.002 (0.005)-----Step Time 9.843 (13.631)-----Loss 1.8832 (2.1443)
Epoch 4/4-----Batch 632/1203-----Step 1060/1000-----Data Time 0.

100%|██████████| 358/358 [00:11<00:00, 31.89it/s]



Validation loss: 1.600


Epoch 5/4-----Batch 68/1203-----Step 1220/1000-----Data Time 0.005 (0.005)-----Step Time 14.605 (14.160)-----Loss 1.7561 (1.7572)
Epoch 5/4-----Batch 148/1203-----Step 1240/1000-----Data Time 0.007 (0.005)-----Step Time 15.883 (14.474)-----Loss 1.8688 (1.7571)
Epoch 5/4-----Batch 228/1203-----Step 1260/1000-----Data Time 0.007 (0.005)-----Step Time 15.362 (13.989)-----Loss 1.9511 (1.7625)
Epoch 5/4-----Batch 308/1203-----Step 1280/1000-----Data Time 0.003 (0.005)-----Step Time 13.392 (14.033)-----Loss 1.6268 (1.7574)
Epoch 5/4-----Batch 388/1203-----Step 1300/1000-----Data Time 0.002 (0.005)-----Step Time 11.182 (14.186)-----Loss 1.4042 (1.7419)
Epoch 5/4-----Batch 468/1203-----Step 1320/1000-----Data Time 0.004 (0.005)-----Step Time 16.192 (13.967)-----Loss 1.7107 (1.7336)
Epoch 5/4-----Batch 548/1203-----Step 1340/1000-----Data Time 0.008 (0.005)-----Step Time 12.102 (13.856)-----Loss 1.5548 (1.7266)
Epoch 5/4-----Batch 628/1203-----Step 1360/1000-----Data T

100%|██████████| 358/358 [00:10<00:00, 33.38it/s]



Validation loss: 1.408


Epoch 6/4-----Batch 68/1203-----Step 1520/1000-----Data Time 0.008 (0.005)-----Step Time 11.781 (13.361)-----Loss 1.4814 (1.5781)
Epoch 6/4-----Batch 148/1203-----Step 1540/1000-----Data Time 0.008 (0.005)-----Step Time 16.102 (14.024)-----Loss 1.4474 (1.5592)
Epoch 6/4-----Batch 228/1203-----Step 1560/1000-----Data Time 0.004 (0.005)-----Step Time 13.203 (13.993)-----Loss 1.4036 (1.5316)
Epoch 6/4-----Batch 308/1203-----Step 1580/1000-----Data Time 0.004 (0.005)-----Step Time 11.231 (14.056)-----Loss 1.7081 (1.5055)
Epoch 6/4-----Batch 388/1203-----Step 1600/1000-----Data Time 0.003 (0.005)-----Step Time 12.020 (13.908)-----Loss 1.3213 (1.5092)
Epoch 6/4-----Batch 468/1203-----Step 1620/1000-----Data Time 0.002 (0.005)-----Step Time 9.014 (13.827)-----Loss 1.3481 (1.5153)
Epoch 6/4-----Batch 548/1203-----Step 1640/1000-----Data Time 0.004 (0.005)-----Step Time 15.964 (13.877)-----Loss 1.6919 (1.5132)
Epoch 6/4-----Batch 628/1203-----Step 1660/1000-----Data Ti

100%|██████████| 358/358 [00:11<00:00, 30.98it/s]



Validation loss: 1.349




In [None]:
 #@title Training
 
# NOTE: this cell repeats the previous training cell with a checkpoint to resume from and different
# batch settings; it redefines main(), train() and validate(). It must run after the cell that defines
# get_positional_encoding (the recorded output is a NameError from running it without that cell).

# Data parameters
data_folder = './'  # folder with data files (note: main() passes "./" literally to the loaders)

# Model parameters (all passed to Transformer(...) in main() when no checkpoint is given)
d_model = 256
n_heads = 12
d_queries = 32
d_values = 32
d_inner = 2048
n_layers = 8
dropout = 0.1
positional_encoding = get_positional_encoding(d_model=d_model, max_length=233)  # positional encodings up to the maximum possible pad-length

# Training parameters
# checkpoint: path handed to torch.load() in main(); the trailing comments list paths used earlier
checkpoint = "/content/drive/MyDrive/MTModel/seq2seq_final.pth.tar" #"/content/drive/MyDrive/MTModel/seq2seq_last_verr_0.902.pth.tar" # "/content/drive/MyDrive/MTModel/seq2seq_last_verr_0.902_improved_wer_14.3.pth.tar" # "/content/drive/MyDrive/MTModel/seq2seq2_finetuned_1.pth.tar" # "/content/drive/MyDrive/MTModel/seq2seq2_finetuned.pth.tar" #"/content/drive/MyDrive/MTModel/seq2seq_final.pth.tar" # "/content/drive/MyDrive/MTModel/seq2seq_last_verr_0.902_improved.pth.tar"
tokens_in_batch = 8000  # batch size in target language tokens
batches_per_step = 35000 // tokens_in_batch  # perform a training step, i.e. update parameters, once every so many batches
print_frequency = 20  # print status once every so many steps
n_steps = 20000  # used by main() to derive the nominal number of epochs
warmup_steps = 800  # warm-up length of the learning-rate schedule in get_lr()
step = 1  # step the initial learning rate is computed for
lr = get_lr(step=step, d_model=d_model, warmup_steps=warmup_steps)  # only used when a new optimizer is created
print("\nLearning Rate: ", lr)
start_epoch = 0  # replaced by main() when a checkpoint is loaded
betas = (0.9, 0.98)  # Adam betas
epsilon = 1e-9  # Adam eps
label_smoothing = 0.1  # passed as eps to LabelSmoothedCE
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: ", device)
cudnn.benchmark = False


def main():
    """
    Training and validation.

    Builds the train/val loaders, creates a new Transformer and Adam optimizer (or restores both from
    the checkpoint file named by the global `checkpoint`), then runs one training pass and one
    validation pass per epoch and saves a checkpoint after each epoch.

    Reads the module-level training configuration and publishes `step`, `start_epoch`, `epoch` and
    `epochs` as globals (train() reads `epochs`).

    Note: the epoch loop runs up to `epochs + 100`, i.e. well past the epoch count derived from `n_steps`.
    """
    global checkpoint, step, start_epoch, epoch, epochs

    # Initialize data-loaders
    train_loader = SequenceLoader(data_folder="./",
                                  source_suffix="join",
                                  target_suffix="spa",
                                  split="train",
                                  tokens_in_batch=tokens_in_batch)
    val_loader = SequenceLoader(data_folder="./",
                                source_suffix="join",
                                target_suffix="spa",
                                split="val",
                                tokens_in_batch=tokens_in_batch)

    # Initialize model or load checkpoint
    if checkpoint is None:
        model = Transformer(vocab_size=274,
                            positional_encoding=positional_encoding,
                            d_model=d_model,
                            n_heads=n_heads,
                            d_queries=d_queries,
                            d_values=d_values,
                            d_inner=d_inner,
                            n_layers=n_layers,
                            dropout=dropout)
        optimizer = torch.optim.Adam(params=[p for p in model.parameters() if p.requires_grad],
                                     lr=lr,
                                     betas=betas,
                                     eps=epsilon)

    else:
        # Load into a local so the global `checkpoint` keeps holding the path and main() can be re-run.
        # map_location lets a checkpoint written on a GPU be restored on a CPU-only runtime.
        # NOTE(review): torch.load unpickles whole objects here - only load checkpoints you trust.
        state = torch.load(checkpoint, map_location=device)
        start_epoch = state['epoch'] + 1
        print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
        model = state['model']
        optimizer = state['optimizer']

    # Loss function
    criterion = LabelSmoothedCE(eps=label_smoothing)

    # Move to default device
    model = model.to(device)
    criterion = criterion.to(device)

    # Find total epochs to train
    epochs = (n_steps // (train_loader.n_batches // batches_per_step)) + 1

    # Epochs
    for epoch in range(start_epoch, epochs+100):
        # Number of optimizer steps completed before this epoch
        step = epoch * train_loader.n_batches // batches_per_step

        # The learning rate lives in optimizer.param_groups and is updated by change_lr() inside
        # train(); setting an `lr` attribute on the optimizer object has no effect, so none is set here.

        # One epoch's training
        train_loader.create_batches()
        train(train_loader=train_loader,
              model=model,
              criterion=criterion,
              optimizer=optimizer,
              epoch=epoch,
              step=step)

        # One epoch's validation
        val_loader.create_batches()
        validate(val_loader=val_loader,
                 model=model,
                 criterion=criterion)

        # Save checkpoint
        save_checkpoint(epoch, model, optimizer)


def train(train_loader, model, criterion, optimizer, epoch, step):
    """
    One epoch's training.

    Gradients are accumulated over `batches_per_step` batches before each optimizer update, and the
    learning rate is re-computed with get_lr() after every update.

    :param train_loader: loader for training data
    :param model: model
    :param criterion: label-smoothed cross-entropy loss
    :param optimizer: optimizer
    :param epoch: epoch number
    :param step: number of optimizer steps completed before this epoch; incremented locally
    :raises Exception: if the forward pass fails or the loss is NaN (the source batch is attached)
    """
    model.train()
    # Track some metrics
    data_time = AverageMeter()  # data loading time
    step_time = AverageMeter()  # forward prop. + back prop. time
    losses = AverageMeter()  # loss

    # Starting time
    start_data_time = time.time()
    start_step_time = time.time()

    # Batches
    for i, (source_sequences, target_sequences, source_sequence_lengths, target_sequence_lengths) in enumerate(
            train_loader):

        # Move to default device
        source_sequences = source_sequences.to(device)  # (N, max_source_sequence_pad_length_this_batch)
        target_sequences = target_sequences.to(device)  # (N, max_target_sequence_pad_length_this_batch)
        source_sequence_lengths = source_sequence_lengths.to(device)  # (N)
        target_sequence_lengths = target_sequence_lengths.to(device)  # (N)

        # Time taken to load data
        data_time.update(time.time() - start_data_time)

        try:
            # Forward prop.
            predicted_sequences = model(source_sequences, target_sequences, source_sequence_lengths,
                                        target_sequence_lengths)  # (N, max_target_sequence_pad_length_this_batch, vocab_size)
        except Exception as error:
            # Report the offending batch, keeping the original failure as the cause.
            # `except Exception` (not a bare except) lets KeyboardInterrupt pass through untouched.
            raise Exception("Source Sequence: ", list(source_sequences)) from error

        # Note: If the target sequence is "<BOS> w1 w2 ... wN <EOS> <PAD> <PAD> <PAD> <PAD> ..."
        # we should consider only "w1 w2 ... wN <EOS>" as <BOS> is not predicted
        # Therefore, pads start after (length - 1) positions
        loss = criterion(inputs=predicted_sequences,
                         targets=target_sequences[:, 1:],
                         lengths=target_sequence_lengths - 1)  # scalar

        # NaN never compares equal to anything (including itself), so it has to be tested with isnan()
        if torch.isnan(loss).any():
            print("Source Sequence: ", list(source_sequences))
            print("Target Sequence: ", list(target_sequences))
            raise Exception("Source Sequence: ", list(source_sequences))

        # Backward prop. (scaled so the accumulated gradient is an average over the batches of one step)
        (loss / batches_per_step).backward()

        # Keep track of losses
        losses.update(loss.item(), (target_sequence_lengths - 1).sum().item())

        # Update model (i.e. perform a training step) only after gradients are accumulated from batches_per_step batches
        if (i + 1) % batches_per_step == 0:
            optimizer.step()
            optimizer.zero_grad()

            # This step is now complete
            step += 1

            # Update learning rate after each step
            change_lr(optimizer, new_lr=get_lr(step=step, d_model=d_model, warmup_steps=warmup_steps))

            # Time taken for this training step
            step_time.update(time.time() - start_step_time)

            # Print status
            if step % print_frequency == 0:
                print('Epoch {0}/{1}-----'
                      'Batch {2}/{3}-----'
                      'Step {4}/{5}-----'
                      'Data Time {data_time.val:.3f} ({data_time.avg:.3f})-----'
                      'Step Time {step_time.val:.3f} ({step_time.avg:.3f})-----'
                      'Loss {losses.val:.4f} ({losses.avg:.4f})'.format(epoch + 1, epochs,
                                                                        i + 1, train_loader.n_batches,
                                                                        step, n_steps,
                                                                        step_time=step_time,
                                                                        data_time=data_time,
                                                                        losses=losses))

            # Reset step time
            start_step_time = time.time()

            # If this is the last one or two epochs, save checkpoints at regular intervals for averaging
            if epoch in [epochs - 1, epochs - 2] and step % 1500 == 0:  # 'epoch' is 0-indexed
                save_checkpoint(epoch, model, optimizer, prefix='step' + str(step) + "_")

        # Reset data time
        start_data_time = time.time()


def validate(val_loader, model, criterion):
    """
    One epoch's validation: runs the model over the validation loader without gradients
    and prints the token-weighted average loss.

    :param val_loader: loader for validation data
    :param model: model
    :param criterion: label-smoothed cross-entropy loss
    """
    model.eval()
    with torch.no_grad():
        losses = AverageMeter()
        for batch in tqdm(val_loader, total=val_loader.n_batches):
            # Move the four tensors of the batch to the default device, in loader order
            source_sequence, target_sequence, source_sequence_length, target_sequence_length = (
                tensor.to(device) for tensor in batch)

            # Forward prop.
            predicted_sequence = model(source_sequence, target_sequence, source_sequence_length,
                                       target_sequence_length)

            # <BOS> is never predicted, so targets are shifted by one and lengths shrink by one
            predicted_lengths = target_sequence_length - 1
            loss = criterion(inputs=predicted_sequence,
                             targets=target_sequence[:, 1:],
                             lengths=predicted_lengths)

            # Weight the batch loss by the number of predicted tokens
            losses.update(loss.item(), predicted_lengths.sum().item())

        print("\nValidation loss: %.3f\n\n" % losses.avg)


# Start (resumed) training as soon as this cell is executed.
main()

NameError: name 'get_positional_encoding' is not defined

In [None]:
# `!export ...` runs in a throw-away subshell, so the variable never reaches this kernel process.
# Set it in the kernel's own environment instead (`os` is imported in the imports cell).
# NOTE(review): presumably this has to run before CUDA is first initialised to take effect - confirm.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# Ask PyTorch to release the unused GPU memory it is holding in its cache.
torch.cuda.empty_cache()

In [None]:
# Unassign (release) the Colab runtime backing this notebook.
from google.colab import runtime

runtime.unassign()

Save Model to Google Drive

In [None]:
# import gc
# del model
# Force a garbage-collection pass; the number shown as the cell output is gc.collect()'s return value.
import gc
gc.collect()

0

Translate


In [None]:
#@title Transform method
# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Transformer model
# map_location=device lets a checkpoint written on a GPU be restored on a CPU-only runtime.
# NOTE(review): the checkpoint holds a pickled model object (see save_checkpoint), so only load files
# you trust; on PyTorch versions where torch.load defaults to weights_only=True this call would also
# need weights_only=False - confirm against the installed version.
checkpoint = torch.load("/content/drive/MyDrive/MTModel/seq2seq2.pth.tar", map_location=device)
model = checkpoint['model'].to(device)
model.eval()


def transform(source_sequence, beam_size=4, length_norm_coefficient=0.6):
    """
    Translate a single source sequence with beam search.

    Relies on the notebook-level `model`, `device`, `vocabMap`, `encodeString` and `decodeString`.

    :param source_sequence: a raw string (encoded here with encodeString) or an already encoded
        tensor; the tensor is used as a batch of one and its size(1) is taken as the source length
    :param beam_size: beam width k, i.e. how many candidate hypotheses are kept at every step
    :param length_norm_coefficient: exponent of the length-normalisation term applied to the
        scores of completed hypotheses
    :return: tuple (best_hypothesis, hypotheses) - the decoded text of the highest-scoring
        hypothesis, and a list of dicts with the keys "hypothesis" (decoded text), "index"
        (position in completion order) and "score" (log-probability score; length-normalised
        for hypotheses that reached <EOS>, raw for the partial-hypothesis fallback)
    """
    with torch.no_grad():
        # Beam size
        k = beam_size
        # Minimum number of hypotheses to complete
        n_completed_hypotheses = min(k, 10)

        if isinstance(source_sequence, str):
            encoder_sequences = encodeString(source_sequence)
            encoder_sequences = torch.LongTensor(encoder_sequences).unsqueeze(0)
        else:
            encoder_sequences = source_sequence
        encoder_sequences = encoder_sequences.to(device)  # (1, source_sequence_length)
        encoder_sequence_lengths = torch.LongTensor([encoder_sequences.size(1)]).to(device)  # (1)

        # Encode
        encoder_sequences = model.encoder(encoder_sequences=encoder_sequences,
                                          encoder_sequence_lengths=encoder_sequence_lengths)  # (1, source_sequence_length, d_model)

        # Our hypothesis to begin with is just <BOS>
        hypotheses = torch.LongTensor([[vocabMap['<BOS>']]]).to(device)  # (1, 1)
        hypotheses_lengths = torch.LongTensor([hypotheses.size(1)]).to(device)  # (1)

        # Tensor to store hypotheses' scores; now it's just 0
        hypotheses_scores = torch.zeros(1).to(device)  # (1)

        # Lists to store completed hypotheses and their (length-normalised) scores
        completed_hypotheses = list()
        completed_hypotheses_scores = list()

        # Start decoding
        step = 1

        # Assume "s" is the number of incomplete hypotheses currently in the bag; a number less than or equal to "k"
        # At this point, s is 1, because we only have 1 hypothesis to work with, i.e. "<BOS>"
        while True:
            s = hypotheses.size(0)
            decoder_sequences = model.decoder(decoder_sequences=hypotheses,
                                              decoder_sequence_lengths=hypotheses_lengths,
                                              encoder_sequences=encoder_sequences.repeat(s, 1, 1),
                                              encoder_sequence_lengths=encoder_sequence_lengths.repeat(
                                                  s))  # (s, step, vocab_size)
            # Scores at this step
            scores = decoder_sequences[:, -1, :]  # (s, vocab_size)
            scores = F.log_softmax(scores, dim=-1)  # (s, vocab_size)

            # Take the vocabulary size from the decoder output instead of hard-coding it,
            # so the index arithmetic below stays correct if the vocabulary changes
            vocab_size = scores.size(-1)

            # Add hypotheses' scores from last step to scores at this step to get scores for all possible new hypotheses
            scores = hypotheses_scores.unsqueeze(1) + scores  # (s, vocab_size)

            # Unroll and find top k scores, and their unrolled indices
            # (never ask for more candidates than there are scores)
            n_candidates = min(k, scores.numel())
            top_k_hypotheses_scores, unrolled_indices = scores.view(-1).topk(n_candidates, 0, True, True)  # (k)

            # Convert unrolled indices to actual indices of the scores tensor which yielded the best scores
            prev_word_indices = unrolled_indices // vocab_size  # (k)
            next_word_indices = unrolled_indices % vocab_size  # (k)

            # Construct the new top k hypotheses from these indices
            top_k_hypotheses = torch.cat([hypotheses[prev_word_indices], next_word_indices.unsqueeze(1)],
                                         dim=1)  # (k, step + 1)

            # Which of these new hypotheses are complete (reached <EOS>)?
            complete = next_word_indices == vocabMap['<EOS>']  # (k), bool

            # Set aside completed hypotheses and their scores normalized by their lengths
            # For the length normalization formula, see
            # "Google’s Neural Machine Translation System: Bridging the Gap between Human and Machine Translation"
            completed_hypotheses.extend(top_k_hypotheses[complete].tolist())
            norm = math.pow(((5 + step) / (5 + 1)), length_norm_coefficient)
            completed_hypotheses_scores.extend((top_k_hypotheses_scores[complete] / norm).tolist())

            # Stop if we have completed enough hypotheses
            if len(completed_hypotheses) >= n_completed_hypotheses:
                break

            # Else, continue with incomplete hypotheses
            hypotheses = top_k_hypotheses[~complete]  # (s, step + 1)
            hypotheses_scores = top_k_hypotheses_scores[~complete]  # (s)
            hypotheses_lengths = torch.LongTensor(hypotheses.size(0) * [hypotheses.size(1)]).to(device)  # (s)

            # Stop if things have been going on for too long
            if step > 100:
                break
            step += 1

        # If there is not a single completed hypothesis, use partial hypotheses
        if len(completed_hypotheses) == 0:
            completed_hypotheses = hypotheses.tolist()
            completed_hypotheses_scores = hypotheses_scores.tolist()

        # Decode the hypotheses, keeping every score next to its text instead of discarding it
        decoded_hypotheses = [
            {"hypothesis": decodeString(hypothesis), "index": index, "score": completed_hypotheses_scores[index]}
            for index, hypothesis in enumerate(completed_hypotheses)
        ]

        # Find the best scoring completed hypothesis (the first one wins on ties)
        best_hypothesis = max(decoded_hypotheses, key=lambda entry: entry["score"])["hypothesis"]
        return best_hypothesis, decoded_hypotheses

# selected, results = transform(x)

In [None]:
!cp -r /content/drive/MyDrive/NewModel/ASROutputs .

In [None]:
# `!export ...` runs in a throw-away subshell, so the variable never reaches the
# kernel process. Set it in this process instead.
# NOTE: it only has an effect if it is set before CUDA is initialised, so run
# this cell right after a kernel restart, before any CUDA work.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# !ls -la /content/drive/MyDrive/MTModel/seq2seq_32_epoch_Val_err_0_915.pth.tar

In [None]:
# Read the ALFFA test file: one entry per line, surrounding whitespace removed.
with open("/content/ASROutputs/alffa_amharic_test.txt", "r", encoding="utf-8")  as trueValue:
  asentences = [sentence.strip() for sentence in trueValue.readlines()]

In [None]:
# Read val.spa: one entry per line, surrounding whitespace removed.
with open("val.spa", "r", encoding="utf-8")  as trueValue:
  true_value_list = [sentence.strip() for sentence in trueValue.readlines()]

def run_test(filename: str):
  """
  Run `transform` on every line of a text file and save the predictions.

  Each input line is stripped before it is passed to `transform`, so the trailing
  newline is not fed to the model. The best hypothesis of every line is written,
  one per line, to "<filename without extension>_predictions.txt".

  :param filename: path of a UTF-8 text file with one input sentence per line
  :return: list with the best hypothesis for every input line, in file order
  """
  with open(filename, "r", encoding="utf-8") as file:
    lines = [line.strip() for line in file]

  # Only a warning: a length mismatch with the references does not stop the run
  if len(lines)  != len(true_value_list):
    print(f"Length difference between test prediction: {len(lines)} and True Values: {len(true_value_list)}")

  # transform returns (best_hypothesis, all_hypotheses); only the best one is kept
  results = [transform(line)[0] for line in lines]

  # splitext removes only the final extension, so dots elsewhere in the path are kept
  prediction_path = f"{os.path.splitext(filename)[0]}_predictions.txt"
  with open(prediction_path, "w", encoding="utf-8") as filePrediction:
    # writelines adds no separators itself, so terminate every prediction explicitly
    filePrediction.writelines(result + "\n" for result in results)
  return results

In [None]:
!cp /content/drive/MyDrive/character_generated_results.txt .
# !cp /content/character_generated_results.txt /content/drive/MyDrive/
# !cp /content/character_generated_results.txt /content/drive/MyDrive/

In [None]:
# speechbrain_output = run_test("/content/ASROutputs/speechbrain_output.txt")
# NOTE(review): the variable names look swapped relative to the file names below
# ("character_233..." feeds phonemebased_output, "c_37_phoneme..." feeds
# characterbased_output) - confirm which ASR system each file comes from.
# NOTE(review): speechbrain_output is used by a later metrics cell but is only
# produced by the commented-out call above.
phonemebased_output = run_test("/content/ASROutputs/character_233_amharic_predictions (1).txt")
characterbased_output = run_test("/content/ASROutputs/c_37_phoneme_amharic_predictions (2).txt")

In [None]:
# print(speechbrain_output[0])
print(characterbased_output[0])
# print(phonemebased_output[0])

የኢንተርኔት አገልግሎትንም በተመለከተ በክልሎች በዞኖችና በአዲስ አበባ በተለያዩ ቦታዎች የአገልግሎት ማእከሎችን ለማቋቋም መታቀዱን አብራርተዋል


In [None]:
!pip install jiwer
!pip install datasets

Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.3 rapidfuzz-3.8.0
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━

In [None]:
from datasets import load_dataset, load_metric

In [None]:
# The WER/CER metrics are loaded from remote builder scripts. Passing
# trust_remote_code=True states that choice explicitly; the warning recorded for this
# cell says it silences the message and will become mandatory in the next major release.
# NOTE(review): `load_metric` is deprecated in `datasets`; plan a migration to the
# separate `evaluate` package.
wer_metric = load_metric("wer", trust_remote_code=True)
cer_metric = load_metric("cer", trust_remote_code=True)

  wer_metric = load_metric("wer")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
!cd /content/gdrive/MyDrive/NewModel/ALFFAAmharic233/train/wav/ && ls -1 | wc -l

In [None]:
!cd /content/gdrive/MyDrive/NewModel/ALFFAAmharic233/train/wav_old/ && ls -1 | wc -l

/bin/bash: line 1: cd: /content/gdrive/MyDrive/NewModel/ALFFAAmharic233/train/wav_old/: No such file or directory


In [None]:
#@title sample result
# Write every entry of `results` to now.txt, one stripped entry per line.
# NOTE(review): in this section `results` is only assigned in a later cell - confirm the run order.
with open("now.txt", "w", encoding="utf-8") as file:
  file.writelines(a.strip()+ "\n" for a in results)

In [None]:
len(results)

359

In [None]:
#@title Speechbrain results

# Score the same predictions against both reference lists; the second pass is announced by a banner.
for banner, reference_set in ((None, true_value_list), ("Compared with ALFFA test dataset", asentences)):
  if banner is not None:
    print(banner)
  print("Test CER: {:.3f}".format(cer_metric.compute(predictions=speechbrain_output, references=reference_set)))
  print("Test WER: {:.3f}".format(wer_metric.compute(predictions=speechbrain_output, references=reference_set)))

Test CER: 0.074
Test WER: 0.294
Compared with ALFFA test dataset
Test CER: 0.150
Test WER: 0.704


In [None]:
#@title Phoneme Based results

# Score the same predictions against both reference lists; the second pass is announced by a banner.
for banner, reference_set in ((None, true_value_list), ("Compared with ALFFA test dataset", asentences)):
  if banner is not None:
    print(banner)
  print("Test CER: {:.3f}".format(cer_metric.compute(predictions=phonemebased_output, references=reference_set)))
  print("Test WER: {:.3f}".format(wer_metric.compute(predictions=phonemebased_output, references=reference_set)))

Test CER: 0.070
Test WER: 0.273
Compared with ALFFA test dataset
Test CER: 0.147
Test WER: 0.701


In [None]:
#@title Character Based results

# Score the same predictions against both reference lists; the second pass is announced by a banner.
for banner, reference_set in ((None, true_value_list), ("Compared with ALFFA test dataset", asentences)):
  if banner is not None:
    print(banner)
  print("Test CER: {:.3f}".format(cer_metric.compute(predictions=characterbased_output, references=reference_set)))
  print("Test WER: {:.3f}".format(wer_metric.compute(predictions=characterbased_output, references=reference_set)))

Test CER: 0.071
Test WER: 0.278
Compared with ALFFA test dataset
Test CER: 0.148
Test WER: 0.700


In [None]:
# print(len(phonemebased_output))

359


In [None]:
# #@title Phoneme based results
# print("Test CER: {:.3f}".format(cer_metric.compute(predictions=phonemebased_output, references=true_value_list)))
# print("Test WER: {:.3f}".format(wer_metric.compute(predictions=phonemebased_output, references=true_value_list)))

Test CER: 0.071
Test WER: 0.277


In [None]:
characterbased_output[-10:]

['ሀጂማ አመድ ወሊመልእክት አስተላለፉ',
 'ለዚህም ይመስላል ባነሱት ነጥብ ላይ ብዙ ሙረፋ አልበዛባቸው',
 'በቅዳሜ የተደረጉት ግጥሚያዎች ለተመርካቹ ቁጥር ዝክተኛመሆን ምክንያት ሆኗል',
 'ከዚያም የክልሉ ኮሚሽን ጃር ሜዳ ቢሮ ስለሰጠን ወደዚያው ሄድንኳ',
 'ነገር ግን ተጫዋቹን የሚፈልገው ክለቡ እንጂ አሰልጣኝ ጉዴት አልነበርም',
 'በትግራይ ህዝብ ስም የተቋቋመው ኤፈርት አዲስ የቴሌቪዥን ጤቢያ ሊከፈት መሆኑን ለጋዜጣዋ ዜና ዴስክ በኢንተርኔት የደረሰው ዘገባ አመለከተች',
 'ከዚህም ሌላ የቴሌቪዥኗ ጋዜጠኛ እንግዳዘር ነጋ እቅሯ ኖኦፕራሲዮን የሆኔ ሲሆን አሁን ከኦፕራሲያኑ በኋላ በደና ሁኔታ ትገኛለች ሲል ጠቁሟል',
 'ዘታኝም ዘጋቢ እንደ ዘገበው ትልቁ የምግብ ማመላለስ ተግባር ወደ ዲናን የተደረገው ከሀምሳ ቀናት በፊት ነበርኳ',
 'ወደከተማ በሚመጡ እንግዶች ላይ ድንጋጤን ይፈጥራል በሚል ችግሩ ባለበት ታፍኖ እንዲያስተደርጓል',
 'ዛሬ ሚጽሁፍ እስከ ተጻፈበት ድረስ ከአንድ ሺ ዘጠኝ መቶ አርባ አመተ ምህረት ጀምሮ በዚሁ ከተማ በመኖር ላይ ይገኛሉ']

In [None]:
# CER/WER of the first 201 entries of `results` against true_value_list.
# NOTE(review): 201 is a hard-coded slice length - presumably len(true_value_list) at the
# time this cell was run; confirm, and consider slicing by len(true_value_list) instead.
print("Test CER: {:.3f}".format(cer_metric.compute(predictions=results[:201], references=true_value_list)))
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results[:201], references=true_value_list)))

Test CER: 0.060
Test WER: 0.247


In [None]:
# Write every entry of `results` to new_one.txt, each followed by a newline.
with open("new_one.txt", "w" , encoding="utf-8") as file:
  file.writelines(a+"\n" for a in results)

In [None]:
# Read val.join as raw lines (line endings kept).
# The encoding is given explicitly, like the other reads in this notebook, so the
# result does not depend on the platform's default encoding.
with open("val.join", "r", encoding="utf-8") as file:
  dlines = file.readlines()

Evaluate

In [None]:
#@title BLEU score example
# Use sacreBLEU in Python or in the command-line?
# Using in Python will use the test data downloaded in prepare_data.py
# Using in the command-line will use test data automatically downloaded by sacreBLEU...
# ...and will print a standard signature which represents the exact BLEU method used! (Important for others to be able to reproduce or compare!)
# NOTE(review): this cell calls `translate(...)` and `sacrebleu`, neither of which is defined or
# imported in the visible part of the notebook; the beam-search function defined in this section
# is named `transform`. Confirm before running.
sacrebleu_in_python = True

# Make sure the right model checkpoint is selected in translate.py

# Data loader
test_loader = SequenceLoader(data_folder="./",
                             source_suffix="en",
                             target_suffix="am",
                             split="test", tokens_in_batch=None)
test_loader.create_batches()

# Evaluate
with torch.no_grad():
    hypotheses = list()
    references = list()
    # Translate every test batch and collect the decoded reference next to it
    for i, (source_sequence, target_sequence, source_sequence_length, target_sequence_length) in enumerate(
            tqdm(test_loader, total=test_loader.n_batches)):
        # [0] keeps only the first element of the returned tuple
        hypotheses.append(translate(source_sequence=source_sequence,
                                    beam_size=4,
                                    length_norm_coefficient=0.6)[0])
        # NOTE(review): ids 0, 2, 3 are presumably special tokens (<PAD>/<BOS>/<EOS>) - confirm against the BPE model
        references.extend(test_loader.bpe_model.decode(target_sequence.tolist(), ignore_ids=[0, 2, 3]))
    if sacrebleu_in_python:
        print("\n13a tokenization, cased:\n")
        print(sacrebleu.corpus_bleu(hypotheses, [references]))
        print("\n13a tokenization, caseless:\n")
        print(sacrebleu.corpus_bleu(hypotheses, [references], lowercase=True))
        print("\nInternational tokenization, cased:\n")
        print(sacrebleu.corpus_bleu(hypotheses, [references], tokenize='intl'))
        print("\nInternational tokenization, caseless:\n")
        print(sacrebleu.corpus_bleu(hypotheses, [references], tokenize='intl', lowercase=True))
        print("\n")
    else:
        # NOTE(review): the command-line branch scores against sacreBLEU's wmt14/full en-de test set,
        # while the loader above uses the "en"/"am" suffixes - the references look mismatched; confirm.
        with codecs.open("translated_test.am", "w", encoding="utf-8") as f:
          f.write("\n".join(hypotheses))
        print("\n13a tokenization, cased:\n")
        os.system("cat translated_test.am | sacrebleu -t wmt14/full -l en-de")
        print("\n13a tokenization, caseless:\n")
        os.system("cat translated_test.am | sacrebleu -t wmt14/full -l en-de -lc")
        print("\nInternational tokenization, cased:\n")
        os.system("cat translated_test.am | sacrebleu -t wmt14/full -l en-de -tok intl")
        print("\nInternational tokenization, caseless:\n")
        os.system("cat translated_test.am | sacrebleu -t wmt14/full -l en-de -tok intl -lc")
        print("\n")

In [None]:
# Reverse map the string
phonemeToCharMap = {"ፕ": "ፕ", "ህ": "ህ", "ል": "ል", "ም": "ም", "ር": "ር", "ስ": "ስ", "ሽ": "ሽ", "ቅ": "ቅ", "ብ": "ብ", "ቭ": "ቭ", "ት": "ት", "ች": "ች", "ን": "ን", "ኝ": "ኝ", "ክ": "ክ", "ው": "ው", "ዝ": "ዝ", "ዥ": "ዥ", "ፍ": "ፍ", "ጽ": "ጽ", "ጵ": "ጵ", "ጭ": "ጭ", "ጥ": "ጥ", "ግ": "ግ", "ጅ": "ጅ", "ድ": "ድ", "ይ": "ይ", "ኣ": "አ", "ኡ": "ኡ", "ኢ": "ኢ", "ኤ": "ኤ", "እ": "እ", "ኦ": "ኦ", "ህኡ": "ሁ", "ህኢ": "ሂ", "ህኣ": "ሀ", "ህኤ": "ሄ", "ህኦ": "ሆ", "ልኧ": "ለ", "ልኡ": "ሉ", "ልኢ": "ሊ", "ልኣ": "ላ", "ልኤ": "ሌ", "ልኦ": "ሎ", "ምኧ": "መ", "ምኡ": "ሙ", "ምኢ": "ሚ", "ምኣ": "ማ", "ምኤ": "ሜ", "ምኦ": "ሞ", "ርኧ": "ረ", "ርኡ": "ሩ", "ርኢ": "ሪ", "ርኣ": "ራ", "ርኤ": "ሬ", "ርኦ": "ሮ", "ስኧ": "ሰ", "ስኡ": "ሱ", "ስኢ": "ሲ", "ስኣ": "ሳ", "ስኤ": "ሴ", "ስኦ": "ሶ", "ሽኧ": "ሸ", "ሽኡ": "ሹ", "ሽኢ": "ሺ", "ሽኣ": "ሻ", "ሽኤ": "ሼ", "ሽኦ": "ሾ", "ቅኧ": "ቀ", "ቅኡ": "ቁ", "ቅኢ": "ቂ", "ቅኣ": "ቃ", "ቅኤ": "ቄ", "ቅኦ": "ቆ", "ብኧ": "በ", "ብኡ": "ቡ", "ብኢ": "ቢ", "ብኣ": "ባ", "ብኤ": "ቤ", "ብኦ": "ቦ", "ቭኧ": "ቨ", "ቭኡ": "ቩ", "ቭኢ": "ቪ", "ቭኣ": "ቫ", "ቭኤ": "ቬ", "ቭኦ": "ቮ", "ትኧ": "ተ", "ትኡ": "ቱ", "ትኢ": "ቲ", "ትኣ": "ታ", "ትኤ": "ቴ", "ትኦ": "ቶ", "ችኧ": "ቸ", "ችኡ": "ቹ", "ችኢ": "ቺ", "ችኣ": "ቻ", "ችኤ": "ቼ", "ችኦ": "ቾ", "ንኧ": "ነ", "ንኡ": "ኑ", "ንኢ": "ኒ", "ንኣ": "ና", "ንኤ": "ኔ", "ንኦ": "ኖ", "ኝኧ": "ኘ", "ኝኡ": "ኙ", "ኝኢ": "ኚ", "ኝኣ": "ኛ", "ኝኤ": "ኜ", "ኝኦ": "ኞ", "ክኧ": "ከ", "ክኡ": "ኩ", "ክኢ": "ኪ", "ክኣ": "ካ", "ክኤ": "ኬ", "ክኦ": "ኮ", "ውኧ": "ወ", "ውኡ": "ዉ", "ውኢ": "ዊ", "ውኣ": "ዋ", "ውኤ": "ዌ", "ውኦ": "ዎ", "ዝኧ": "ዘ", "ዝኡ": "ዙ", "ዝኢ": "ዚ", "ዝኣ": "ዛ", "ዝኤ": "ዜ", "ዝኦ": "ዞ", "ዥኧ": "ዠ", "ዥኡ": "ዡ", "ዥኢ": "ዢ", "ዥኣ": "ዣ", "ዥኤ": "ዤ", "ዥኦ": "ዦ", "ይኧ": "የ", "ይኡ": "ዩ", "ይኢ": "ዪ", "ይኣ": "ያ", "ይኤ": "ዬ", "ይኦ": "ዮ", "ድኧ": "ደ", "ድኡ": "ዱ", "ድኢ": "ዲ", "ድኣ": "ዳ", "ድኤ": "ዴ", "ድኦ": "ዶ", "ጅኧ": "ጀ", "ጅኡ": "ጁ", "ጅኢ": "ጂ", "ጅኣ": "ጃ", "ጅኤ": "ጄ", "ጅኦ": "ጆ", "ግኧ": "ገ", "ግኡ": "ጉ", "ግኢ": "ጊ", "ግኣ": "ጋ", "ግኤ": "ጌ", "ግኦ": "ጐ", "ጥኧ": "ጠ", "ጥኡ": "ጡ", "ጥኢ": "ጢ", "ጥኣ": "ጣ", "ጥኤ": "ጤ", "ጥኦ": "ጦ", "ጭኧ": "ጨ", "ጭኡ": "ጩ", "ጭኢ": "ጪ", "ጭኣ": "ጫ", "ጭኤ": "ጬ", "ጭኦ": "ጮ", "ጵኧ": "ጰ", "ጵኡ": "ጱ", "ጵኢ": "ጲ", "ጵኣ": "ጳ", "ጵኤ": "ጴ", "ጵኦ": "ጶ", "ጽኧ": "ጸ", "ጽኡ": "ጹ", "ጽኢ": "ጺ", "ጽኣ": "ጻ", "ጽኤ": "ጼ", "ጽኦ": "ጾ", "ፍኧ": "ፈ", 
"ፍኡ": "ፉ", "ፍኢ": "ፊ", "ፍኣ": "ፋ", "ፍኤ": "ፌ", "ፍኦ": "ፎ", "ፕኧ": "ፐ", "ፕኡ": "ፑ", "ፕኢ": "ፒ", "ፕኣ": "ፓ", "ፕኤ": "ፔ", "ፕኦ": "ፖ", "ህኡኣ": "ኋ", "ልኡኣ": "ሏ", "ምኡኣ": "ሟ", "ርኡኣ": "ሯ", "ስኡኣ": "ሷ", "ሽኡኣ": "ሿ", "ቅኡኣ": "ቋ", "ብኡኣ": "ቧ", "ቭኡኣ": "ቯ", "ትኡኣ": "ቷ", "ችኡኣ": "ቿ", "ንኡኣ": "ኗ", "ኝኡኣ": "ኟ", "ክኡኣ": "ኳ", "ዝኡኣ": "ዟ", "ዥኡኣ": "ዧ", "ድኡኣ": "ዷ", "ጅኡኣ": "ጇ", "ግኡኣ": "ጓ", "ጥኡኣ": "ጧ", "ጭኡኣ": "ጯ", "ጵኡኣ": "ጷ", "ጽኡኣ": "ጿ", "ፍኡኣ": "ፏ", "ፕኡኣ": "ፗ", "ግኡኤ": "ጔ", "ክኡኤ": "ኴ", "ህኡኤ": "ኌ", "ቅኡኤ": "ቌ", "ግኡኢ": "ጒ", "ክኡኢ": "ኲ", "ህኡኢ": "ኊ", "ቅኡኢ": "ቊ"}

In [None]:
def mapBack(sent:str) -> str:
  """
  Convert a space-separated sequence of phoneme tokens back to characters.

  Every token is looked up in `phonemeToCharMap`; tokens without an entry are kept
  unchanged. The converted tokens are concatenated without separators.

  A trailing line ending (as left by readlines()) is set aside before tokenising and
  re-attached afterwards. Otherwise it would stick to the last token and make its
  lookup fail for every multi-character phoneme key.

  :param sent: phoneme tokens separated by single spaces, optionally ending in a newline
  :return: the converted string, with the original line ending (if any) preserved
  """
  body = sent.rstrip("\r\n")
  line_ending = sent[len(body):]
  return "".join([phonemeToCharMap.get(ph, ph) for ph in body.split(" ")]) + line_ending

In [None]:
sample = "ንኧ ግኧ ር ግ ን ችኤ ል ስኢ ልኧ አ ው ርኦ ፕኣ ስኡ ፕኧ ር ክኣ ፕ ክኧ ችኣ ም ፕኢ ይኧ ን ስ ልኢ ግ አ ሽኧ ንኣ ፍኢ ክኧ ርኢ ይኣ ል ምኣ ድ ርኢ ድ ግኣ ር ብኣ ልኧ ብኧ ት ጭኧ ውኣ ትኣ ም ክ ን ይኣ ት ውኧ ድኧ ልኤ ልኣ ቅኧ ን ትኧ ዝኧ ውኣ ው ርኡኣ ል"
print(mapBack(sample))

NameError: ignored

In [None]:
# Convert every line of selected_prediction.txt with mapBack.
with open("selected_prediction.txt", "r", encoding="utf-8") as file:
  results = [mapBack(sent) for sent in file.readlines()]

# Reference lines are kept exactly as read (line endings included).
with open("final_testing.txt", "r", encoding ="utf-8") as final_testing:
  true_value = final_testing.readlines()

In [None]:
# Run the stripped entries of `results` through transform and keep only the best hypothesis of each.
new_results = [transform(r.strip())[0] for r in results]

In [None]:
# Write every entry of `new_results` to newone.txt, each followed by a newline.
with open("newone.txt", "w", encoding="utf-8") as file:
  file.writelines(a+"\n" for a in new_results)

In [None]:
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=new_results, references=true_value)))

In [None]:
import pandas as pd

# Side-by-side table of the entries in `results` and the corresponding `new_results`.
# pandas builds the columns directly from the lists; the previous np.array(...) wrappers
# needed `np`, which the notebook's import cell does not import.
df = pd.DataFrame({"input": results, "output": new_results})

In [None]:
true_value

['የኢንተርኔት አገልግሎትንም በተመለከተ በክልሎች በዞኖችና በአዲስ አበባ በተለያዩ ቦታዎች የአገልግሎት ማእከሎችን ለማቋቋም መታቀዱን አብራርተዋል\n',
 'በዚህም አነስተኛ መስኖዎችን በማስፋፋት አገሪቱን በተደጋጋሚ ከሚያጠቃት ድርቅና የምግብ እህል እጥረት ለማቃለል እንደሚቻል ጠቁመዋል\n',
 'ይህ እንዳይሆን በህይወታቸው መስዋእትነት ያስገኙትን የኢትዮጵያዊነት ክብር አንግበን ለቅድስት ሀገራችን አንድነትና ሉአላዊነት ሰላምና ብልጽግና በህብረት እንቁም\n',
 'የመጀመሪያው ነጥብ በተቃዋሚ ሀይሎች ሰፈር በጋራ አቋም ላይ በጋራ ለማቆም ያየነው ጽናት አነስተኛ መሆኑ ነው\n',
 'አቅም በሚፈቅደው መሰረት አስቀድሞ መዘጋጀት ስለሚፈልጉ የሚያጠራጥር አይሆንም ስትል የአሜሪካ ድምጽ ዜና ዘጋቢ ገልጻለች\n',
 'ከለቀስተኞቹ መካከል አብዛኛዎቹ ጸሀዩ ዛሬ ተገለጠ ሀይለስላሴ ማሩን አለማችን አባታችን የአፍሪካ አባት የአለም አባት በማለት ነበር ሀዘናቸውን የሚገልጹ\n',
 'ከዚህ ጋር ለሰብአዊ አገልግሎት መሰለፏም ውጤታማ መሆኗም አድናቆትን አትርፎላታል\n',
 'የኮሚሽኑ ውሳኔና የቤተ ክህነቱ ተቃውሞ\n',
 'ይህንንም ባድመንና ሽራሮን በመውረር እውን አርጓል\n',
 'ጋዜጠኞችን መለያየታቸው ብዙዎችን አሳዝኗል\n',
 'ባለፈው ሰኞ እለት ደግሞ በቴሌቭዥን ሌላ ሽልማት ሲሸለም ተመልከትኩ\n',
 'አጠቃላይ ወጪው አስራ ስምንት ሺ ዶላር እንደሆነ ለማወቅ ተችሏል\n',
 'ሌሎቹ በሙሉ ጤነኞች ናቸው\n',
 'መለስና ኢሳያስ እርስ በእርስ ለመገለባበጥ እየሞከሩ ነው\n',
 'የአየር ሀይላችን አውሮፕላኖችም ግዳጃቸውን በተገቢው ፈጽመው በሰላም ተመልሰዋል ሲል የመንግስት ቃል አቀባይ ጽፈት ቤት መግለጫ አስታውቋል\n',
 'ከሟቾቹ ጋር የነበሩ ሁለት ታጣቂዎች በሰላም እጃቸውን

In [None]:
#@title Character Error Rate Metric

# NOTE(review): this loads the "cer" metric a second time (an earlier cell already binds it to
# cer_metric) and scores `results`, whereas the WER cell above scores `new_results` -
# confirm which list is intended here.
cerMetric = load_metric("cer")
print("Test CER: {:.3f}".format(cerMetric.compute(predictions=results, references=true_value)))

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Test CER: 0.089


In [None]:
# Read val.join: one entry per line, surrounding whitespace removed.
# The encoding is given explicitly, like the other reads in this notebook, so the
# result does not depend on the platform's default encoding.
with open("val.join", encoding="utf-8") as file:
  output_data = [line.strip() for line in file]

In [None]:
# Read val.spa: one entry per line, surrounding whitespace removed.
# The encoding is given explicitly, matching the earlier read of the same file.
with open("val.spa", encoding="utf-8") as file:
  correct_data = [line.strip() for line in file]

In [None]:
print("Test CER: {:.3f}".format(cer_metric.compute(predictions=output_data, references=correct_data)))
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=output_data, references=correct_data)))

Test CER: 0.133
Test WER: 0.961
