In [1]:
# Download the pre-trained model
# !wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

In [19]:
# Imports
import fasttext
import os
import random
import re

In [3]:
# Load the pre-trained fastText language identification model
model_path = 'lid.176.bin'  # Path to the fastText model
fasttext_model = fasttext.load_model(model_path)

# Function to predict the language of a given text
def predict_language(text, model):
    predictions = model.predict(text)
    language = predictions[0][0].replace('__label__', '')  # Clean up the output
    confidence = predictions[1][0]  # Confidence score
    return language, confidence

In [4]:
# Example text for testing
text = "Ceci est un texte en français."
predicted_language, confidence = predict_language(text, fasttext_model)

print(f"Predicted Language: {predicted_language}")
print(f"Confidence Score: {confidence}")

Predicted Language: fr
Confidence Score: 0.9996371865272522


In [6]:
# Gathering data for ALL languages

# FLORES dev set should be fine for this?
datapath = '/Users/Suzenator/Documents/Uni/M4/MThesis/Data/flores_subset/dev/'
output_file = 'fine_tune_data.txt'

# Mapping languages to fastText codes
langs = {
    'Czech': 'cs',
    'Welsh': 'cy',
    'German': 'de',
    'French': 'fr',
    'Irish': 'ga',
    'Igbo': 'ig',
    'Japanese': 'ja',
    'Limburgish': 'li',
    'Luxembourgish': 'lb',
    'Dutch': 'nl',
    'Nepali': 'ne',
    'Punjabi': 'pa',
    'Russian': 'ru',
    'Sango': 'sg',
    'Tagalog': 'tl',
    'Chinese': 'zh'
}

# Mapping file names to fastText language codes
files = [
    'ces_Latn.dev', 'cym_Latn.dev', 'deu_Latn.dev', 'fra_Latn.dev', 
    'gle_Latn.dev', 'ibo_Latn.dev', 'jpn_Jpan.dev', 'lim_Latn.dev',
    'ltz_Latn.dev', 'nld_Latn.dev', 'npi_Deva.dev', 'pan_Guru.dev',
    'rus_Cyrl.dev', 'sag_Latn.dev', 'tgl_Latn.dev', 'zho_Hans.dev'
]

file_to_lang_code = {
    'ces_Latn.dev': 'cs',
    'cym_Latn.dev': 'cy',
    'deu_Latn.dev': 'de',
    'fra_Latn.dev': 'fr',
    'gle_Latn.dev': 'ga',
    'ibo_Latn.dev': 'ig',
    'jpn_Jpan.dev': 'ja',
    'lim_Latn.dev': 'li',
    'ltz_Latn.dev': 'lb',
    'nld_Latn.dev': 'nl',
    'npi_Deva.dev': 'ne',
    'pan_Guru.dev': 'pa',
    'rus_Cyrl.dev': 'ru',
    'sag_Latn.dev': 'sg',
    'tgl_Latn.dev': 'tl',
    'zho_Hans.dev': 'zh'
}

In [8]:
# # Preprocessing that data to be in the format that the FastText model wants it to be:
# # __label__<language_code> <text>

# # Initialize list to store processed lines
# processed_data = []

# # Process each file
# for file in files:
#     lang_code = file_to_lang_code[file]  # Get the language code
#     file_path = os.path.join(datapath, file)

#     # Read the file and process lines
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             line = line.strip()  # Remove any leading/trailing whitespace
#             if line:  # Skip empty lines
#                 # Add fastText label to each line
#                 processed_data.append(f"__label__{lang_code} {line}")

# # Shuffle the dataset (optional but recommended)
# random.shuffle(processed_data)

# # Write processed data to the output file
# with open(output_file, 'w', encoding='utf-8') as f:
#     for line in processed_data:
#         f.write(line + '\n')

# print(f"Processed data saved to {output_file}")

Processed data saved to fine_tune_data.txt


In [9]:
# Finetuning FastText
# finetuned_model = fasttext.train_supervised(input="fine_tune_data.txt",
#                  epoch=1500,   
#                  lr=0.004,      
#                  wordNgrams=4)  

# finetuned_model.save_model('fine_tuned_model.bin')

finetuned_model = fasttext.load_model('fine_tuned_model.bin')

Read 0M words
Number of words:  94873
Number of labels: 16
Progress: 100.0% words/sec/thread:  603640 lr:  0.000000 avg.loss:  0.138896 ETA:   0h 0m 0s


In [12]:
test_datapath = '/Users/Suzenator/Documents/Uni/M4/MThesis/Data/flores_subset/devtest_labeled/'
files = ['ces_Latn.devtest', 'cym_Latn.devtest', 'deu_Latn.devtest', 'fra_Latn.devtest', 
         'gle_Latn.devtest', 'ibo_Latn.devtest', 'jpn_Jpan.devtest', 'lim_Latn.devtest', 
         'ltz_Latn.devtest', 'nld_Latn.devtest', 'npi_Deva.devtest', 'pan_Guru.devtest', 
         'rus_Cyrl.devtest', 'sag_Latn.devtest', 'tgl_Latn.devtest', 'zho_Hans.devtest']
file_to_lang_code = {
    'ces_Latn.devtest': 'cs', 'cym_Latn.devtest': 'cy', 'deu_Latn.devtest': 'de',
    'fra_Latn.devtest': 'fr', 'gle_Latn.devtest': 'ga', 'ibo_Latn.devtest': 'ig',
    'jpn_Jpan.devtest': 'ja', 'lim_Latn.devtest': 'li', 'ltz_Latn.devtest': 'lb',
    'nld_Latn.devtest': 'nl', 'npi_Deva.devtest': 'ne', 'pan_Guru.devtest': 'pa',
    'rus_Cyrl.devtest': 'ru', 'sag_Latn.devtest': 'sg', 'tgl_Latn.devtest': 'tl',
    'zho_Hans.devtest': 'zh'
}

# Adding Fasttext labels to the test data, ONLY DO THIS ONCE
# for file in files:
#     lang_code = file_to_lang_code[file]
#     file_path = os.path.join(test_datapath, file)
#     labeled_lines = []
#     with open(file_path, 'r') as f:
#         for line in f:
#             line = line.strip()
#             if line:
#                 labeled_lines.append(f"__label__{lang_code} {line}\n")
#     with open(file_path, 'w') as f:
#         f.writelines(labeled_lines)

In [40]:
# Function to calculate off-target ratio and return off-target sentences
def get_off_target_sentences(model, file_path, correct_language_label):
    off_target_count = 0
    total_count = 0
    off_target_sentences = []
    total_confidence = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # Skip empty lines
                total_count += 1
                
                # Predict language for the sentence
                predicted_label, confidence = model.predict(line)
                
                # Compare the predicted label with the correct language label
                if predicted_label[0] != correct_language_label:
                    off_target_count += 1
                    off_target_sentences.append(line)  # Store the off-target sentence
                
                total_confidence.append(confidence)
    
    # Calculate and return the ratio of off-target sentences and the sentences themselves
    if total_count == 0:
        return 0.0, off_target_sentences  # To avoid division by zero
    
    off_target_ratio = off_target_count / total_count
    avg_confidence = sum(total_confidence)/len(total_confidence)
    
    return off_target_ratio, off_target_sentences, avg_confidence

In [63]:
def print_off_target_ratio(model, alpha, scenario):
    
    total_ratio = []
    total_confidence = []
    
    xconst_name = alpha + "_gpt-mt"
    
    # Define language pairs for evaluation
    if scenario == "high_high":
        language_pairs = [("nl", "de"), ("nl", "zh"), ("fr", "cs"), ("fr", "de"),
                          ("jp", "zh"), ("jp", "ru"), ("tl", "ru"), ("tl", "cs")
                         ]
    elif scenario == "high_low":
        language_pairs = [("nl", "ig"), ("fr", "li"), ("jp", "li"), ("tl", "li"),
                          ("nl", "ne"), ("fr", "ne"), ("jp", "ig"), ("tl", "ig"),
                          ("nl", "cy"), ("fr", "cy"), ("jp", "cy"), ("tl", "ne")
                         ]
    elif scenario == "low_low":
        language_pairs = [("li", "lu"), ("li", "pa"), ("ig", "lb"), ("ig", "sg"),
                          ("ne", "pa"), ("ne", "ga"), ("cy", "ga"), ("cy", "sg")
                         ]
    else: 
        print("SCENARIO NOT RECOGNIZED")
    
    path = '/Users/Suzenator/Documents/Uni/M4/MThesis/output/backup/total/'+scenario+'/'+xconst_name+'/zeroshot/'
    files = os.listdir(path)
    lang_code_to_short = {key.split('.')[0].split('_')[0]: value for key, value in file_to_lang_code.items()}
    
    # Reverse mapping to get the key for a given short code
    short_to_key = {value: key.split('.')[0].split('_')[0] for key, value in file_to_lang_code.items()}
    
    # Generate filter list using the keys from file_to_lang_code
    filter_list = []
    for src, tgt in language_pairs:
        src_key = short_to_key.get(src, None)
        tgt_key = short_to_key.get(tgt, None)
        if src_key and tgt_key:  # Ensure both language codes exist in the mapping
            filter_list.append(f"{src_key}2{tgt_key}_translation")
            filter_list.append(f"{tgt_key}2{src_key}_translation")

    # Process the files
    for file in files:
    
        # Check if the file is in the filter list
        if any(f in file for f in filter_list):
            
            # Extract language pair from the file name
            result = re.search(r'2(.*?)_translation', file).group(1)
            target = lang_code_to_short.get(result, None)
    
            if target:  # Ensure the target language is found
                correct_language_label = '__label__' + target
    
                # Calculate off-target ratio and append to total
                off_target_ratio, off_target_sentences, confidence = get_off_target_sentences(model, path + file, correct_language_label)
                total_ratio.append(off_target_ratio)
                total_confidence.append(confidence)
    
    # Compute average off-target ratio
    avg_ratio = sum(total_ratio) / len(total_ratio) * 100
    avg_confidence = sum(total_confidence)/len(total_confidence) * 100
    print(f"Average Off-Target Ratio for {xconst_name}, {scenario}: {avg_ratio:.2f}, confidence: {avg_confidence[0]:.2f}")
#     print(f"{avg_ratio:.2f} ({avg_confidence[0]:.2f}%) ")    
         

In [64]:
alphas = ["0", "01"]
alphas_full = ["0", "001", "0025", "005", "01", "025", "05", "1"]
scenarios = ["high_high", "high_low", "low_low"]

for s in scenarios:
    
    if s == "high_high": 
        # Use the standard model for HH
        model = fasttext_model
    else:
        # HL and LL use the finetuned model since they contain languages the standard model does not recognize
        model = finetuned_model 
        
    for a in alphas_full:
        print_off_target_ratio(model, a, s)
    print()

Average Off-Target Ratio for 0_gpt-mt, high_high: 0.21, confidence: 98.03
Average Off-Target Ratio for 001_gpt-mt, high_high: 0.17, confidence: 98.08
Average Off-Target Ratio for 0025_gpt-mt, high_high: 0.21, confidence: 98.06
Average Off-Target Ratio for 005_gpt-mt, high_high: 0.16, confidence: 98.13
Average Off-Target Ratio for 01_gpt-mt, high_high: 0.17, confidence: 98.09
Average Off-Target Ratio for 025_gpt-mt, high_high: 0.19, confidence: 98.13
Average Off-Target Ratio for 05_gpt-mt, high_high: 0.26, confidence: 98.11
Average Off-Target Ratio for 1_gpt-mt, high_high: 0.29, confidence: 98.19

Average Off-Target Ratio for 0_gpt-mt, high_low: 7.90, confidence: 94.40
Average Off-Target Ratio for 001_gpt-mt, high_low: 6.42, confidence: 94.46
Average Off-Target Ratio for 0025_gpt-mt, high_low: 6.63, confidence: 94.50
Average Off-Target Ratio for 005_gpt-mt, high_low: 6.36, confidence: 94.51
Average Off-Target Ratio for 01_gpt-mt, high_low: 6.35, confidence: 94.38
Average Off-Target Rati