In [None]:
#!git clone https://github.com/MLRS/malti.git

In [1]:
import requests
import pandas as pd
import os.path
from pathlib import Path
from src.transliterate import transliterate
import src.token_rankers as token_rankers

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Mapping files handed to `transliterate` by the cells below.
token_mappings = ["./src/token_mappings/small_closed_class.map", "./src/token_mappings/additional_closed_class.map"]

# The list below is stored under the name `token_rankers`, which the import cell
# bound to the `src.token_rankers` module. Importing the module again under a
# private alias keeps this cell re-runnable after that name has been rebound
# (otherwise a second run fails with AttributeError on the list).
import src.token_rankers as _token_rankers_module

token_rankers = [
    _token_rankers_module.WordModelScoreRanker("./language_models/aggregated_country/lm/word/tn-maghreb.arpa"),
    _token_rankers_module.CharacterModelScoreRanker("./language_models/aggregated_country/lm/char/tn-maghreb.arpa"),
]

Loading the LM will be faster if you build a binary file.
Reading /Users/katebelcher/Documents/LCT-MALTA/iwslt/malti/language_models/aggregated_country/lm/word/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Loading the LM will be faster if you build a binary file.
Reading /Users/katebelcher/Documents/LCT-MALTA/iwslt/malti/language_models/aggregated_country/lm/char/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


In [3]:
# Cache of parsed mapping files, keyed by the path they were read from.
TOKEN_MAPPINGS = {}
def get_token_mappings_aeb(path: str) -> dict[str, str]:
    """Load a tab-separated mapping file and return it as a token -> mapping dict.

    Each non-blank line must hold exactly three tab-separated fields. The first
    field is used as the key and the third as the value; the middle field is
    ignored. Parsed files are cached in ``TOKEN_MAPPINGS`` so each path is read
    from disk only once.

    Args:
        path: Location of the UTF-8 encoded mapping file.

    Returns:
        The dict of first-field -> third-field pairs (the cached object itself).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line does not have exactly three fields.
    """
    if path not in TOKEN_MAPPINGS:
        mappings = {}
        with open(path, "r", encoding="utf-8") as file:
            for line_number, line in enumerate(file, start=1):
                record = line.strip()
                # Blank lines (e.g. a trailing empty line) carry no mapping.
                if not record:
                    continue
                fields = record.split("\t")
                if len(fields) != 3:
                    raise ValueError(
                        f"{path}:{line_number}: expected 3 tab-separated fields, got {len(fields)}"
                    )
                token, _, mapping = fields
                mappings[token] = mapping
        # Only cache once the whole file has parsed successfully.
        TOKEN_MAPPINGS[path] = mappings
    return TOKEN_MAPPINGS[path]

In [11]:
# Translation table consulted first; tokens missing from it are transliterated.
mt_en_aeb_translations = "./src/translations/1504_mt_en_aeb.txt"
def translate_token_aeb(token: str) -> str:
    """Return the table entry for ``token``, or its transliteration if absent.

    The table is loaded (and cached) through ``get_token_mappings_aeb``. When
    ``token`` is not a key in it, the result of ``transliterate`` called with
    ``return_token_merge=False`` is returned instead.
    """
    known_translations = get_token_mappings_aeb(mt_en_aeb_translations)
    if token in known_translations:
        return known_translations[token]
    return transliterate(token, token_mappings, token_rankers, return_token_merge=False)

In [7]:
%load_ext autoreload
%autoreload 2
import pickle
import sys
sys.path.append('./src')
from src.etymology_classification import featurise
with open("./src/etymology_data/model.pickle", "rb") as file:
    model = pickle.load(file)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
text = "Il-karozza Porsche tal-2022 għandha speed fenomenali!"
# The timeout keeps the cell from hanging indefinitely on an unresponsive
# server, and raise_for_status reports an HTTP error directly instead of a
# confusing failure while decoding or indexing the response body.
response = requests.get(
    "https://mlrs.research.um.edu.mt/tools/mlrsapi/tokenise",
    params={"text": text},
    timeout=30,
)
response.raise_for_status()
tokens = response.json()["result"]
# One prediction per token; the model takes a batch, so wrap and unwrap the single sentence.
labels = model.predict([featurise(tokens)])[0]
labels.tolist()

Loading the LM will be faster if you build a binary file.
Reading /Users/katebelcher/Documents/LCT-MALTA/iwslt/malti/language_models/aggregated_country/lm/word/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Loading the LM will be faster if you build a binary file.
Reading /Users/katebelcher/Documents/LCT-MALTA/iwslt/malti/language_models/aggregated_country/lm/char/tn-maghreb.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************


['Arabic',
 'Non-Arabic',
 'Name',
 'Arabic',
 'Symbol',
 'Arabic',
 'Code-Switching',
 'Non-Arabic',
 'Symbol']

In [17]:
# Tokens carrying one of these labels are transliterated directly; every other
# label goes through the translation table (with transliteration as fallback).
directly_transliterated = ("Arabic", "Symbol")

transliteration_translation_ar = [
    transliterate(tokens[position], token_mappings, token_rankers, return_token_merge=False)
    if tag in directly_transliterated
    else translate_token_aeb(tokens[position])
    for position, tag in enumerate(labels)
]

# Second pass over the raw tokens: only the merge flag of each result is kept.
boolean_list = [
    merge_flag
    for _, merge_flag in (
        transliterate(raw_token, token_mappings, token_rankers, return_token_merge=True)
        for raw_token in tokens
    )
]

# A token whose flag is true is written without a following space; every other
# token is followed by one (so the result may end with a trailing space).
combined_str = ''.join(
    piece if glued else piece + ' '
    for piece, glued in zip(transliteration_translation_ar, boolean_list)
)
print(combined_str)

الالسيارات برسكهي تاع ال٢٠٢٢ عندها صباد الظواهر ! 
