# Reproduction of BLOOM model as proposed in Ryan et al. (2023)


### Credits:
**Original Code**: [https://github.com/XenonMolecule/MultiSim](https://github.com/XenonMolecule/MultiSim)

**Original Paper**: Michael Ryan, Tarek Naous, and Wei Xu. 2023. [Revisiting non-English Text Simplification: A Unified Multilingual Benchmark](https://aclanthology.org/2023.acl-long.269/). In *Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)*, pages 4898–4927, Toronto, Canada. Association for Computational Linguistics.


### Instructions:
To reproduce BLOOM as used in Ryan et al. (2023) for German text simplification, we slightly adapted their code. If you want to use this notebook, please follow the installation instructions of the original code. Further, please add the German training and evaluation data to the required directories.

In [None]:
# from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os
import subprocess
from sklearn.neighbors import NearestNeighbors
from numpy.random import default_rng
from easse.sari import corpus_sari
from sacrebleu import corpus_bleu
import json
import requests
from collections import defaultdict
from tqdm import tqdm
import sys

In [None]:
# pip install ipywidgets

In [None]:
from huggingface_hub import notebook_login
from huggingface_hub import HfFolder


# Log in to Hugging Face: enter your API key when prompted
# (you can create one for free on Hugging Face).
notebook_login()

In [None]:
# load_dotenv()
# Fetch the Hugging Face token via HfFolder (alternative: read it from the
# HUGGING_FACE_API_TOKEN environment variable, see the trailing comment).
token = HfFolder.get_token()  # os.environ.get("HUGGING_FACE_API_TOKEN")
# Do not print the secret itself: cell outputs are stored inside the notebook
# file and would leak the token to anyone the notebook is shared with.
print("Hugging Face token found." if token else "No Hugging Face token found.")

In [None]:
# Show the current working directory; the relative paths used in this notebook
# (e.g. "./laser_embeddings/" and "../../data/") are resolved against it.
os.getcwd()

In [None]:
# Set the LASER environment variable to the LASER directory two levels up
# (the same directory whose tasks/embed/embed.sh is called by `laser_embed`).
%env LASER=../../LASER

In [None]:
def laser_embed(df, name, split, laser_version=""):
    """Embed the ``original`` column of ``df`` with LASER's ``embed.sh``.

    Every source sentence is written on its own line to a temporary text file
    ``./laser_embeddings/<name>_<split>.txt``. That file, the target path
    ``./laser_embeddings/<name>_<split>.bin`` and ``laser_version`` are passed
    as arguments to ``../../LASER/tasks/embed/embed.sh``. The temporary text
    file is removed afterwards.

    Args:
        df: DataFrame with an ``original`` column containing strings.
        name: Dataset name, used as file-name prefix.
        split: Split name (e.g. ``"train"`` or ``"test"``), used as suffix.
        laser_version: Third argument forwarded to ``embed.sh``.
    """
    txt_path = "./laser_embeddings/" + name + "_" + split + ".txt"
    bin_path = "./laser_embeddings/" + name + "_" + split + ".bin"
    try:
        # Write UTF-8 explicitly so that non-ASCII characters (e.g. German
        # umlauts) do not depend on the platform's default encoding.
        with open(txt_path, 'w', encoding='utf-8') as f:
            for txt in df['original']:
                # Inner newlines are dropped to keep one sentence per line,
                # i.e. one embedding per DataFrame row.
                f.write(txt.replace('\n', '') + '\n')
        subprocess.run(["bash", "../../LASER/tasks/embed/embed.sh", txt_path, bin_path, laser_version])
    finally:
        # Clean up the temporary file even if writing or embedding failed.
        if os.path.exists(txt_path):
            os.remove(txt_path)

def load_laser_embeddings(name, split):
    """Read ``./laser_embeddings/<name>_<split>.bin`` as a float32 matrix.

    The file is interpreted as a flat sequence of float32 values and folded
    in place into rows of 1024 values each; values beyond the last complete
    row are dropped by the resize.

    Returns:
        A float32 array of shape ``(number_of_values // 1024, 1024)``.
    """
    embedding_dim = 1024
    source_file = "".join(["./laser_embeddings/", name, "_", split, ".bin"])

    matrix = np.fromfile(source_file, dtype=np.float32)
    row_count = matrix.shape[0] // embedding_dim
    matrix.resize(row_count, embedding_dim)
    return matrix

def calc_distances_to_neighbors(train_emb, eval_emb, neighbors):
    """Cosine distance between each evaluation vector and its chosen neighbors.

    Args:
        train_emb: Array of shape ``(n_train, dim)``.
        eval_emb: Array of shape ``(n_eval, dim)``.
        neighbors: Integer array of shape ``(n_eval, k)`` holding row indices
            into ``train_emb``.

    Returns:
        Array of shape ``(n_eval, k)`` where entry ``[i, j]`` is
        ``1 - cos(eval_emb[i], train_emb[neighbors[i, j]])``.
    """
    # Gather the neighbor vectors: shape (n_eval, k, dim).
    A = train_emb[neighbors, :]
    B = eval_emb

    # Dot product of every neighbor with its own evaluation vector only.
    # (The previous np.dot(A, B.T) built a full (n_eval, k, n_eval) tensor and
    # then kept just its diagonal, which is quadratic in n_eval.)
    dot_product = np.einsum('ekd,ed->ek', A, B)

    # Compute the L2 norm of the vectors in A and B
    norm_A = np.linalg.norm(A, axis=2)
    norm_B = np.linalg.norm(B, axis=1)

    # Broadcast the per-row evaluation norm over the k neighbors.
    cosine_distances = 1 - dot_product / (norm_A * norm_B[:, np.newaxis])

    return cosine_distances

def generate_preprocessing_sim(name, train_emb, eval_emb, split="test"):
    """Store the 20 nearest training neighbors of every evaluation embedding.

    Neighbors are searched by brute force with the cosine metric. Two CSV
    files are written to ``./few_shot_preprocessing/``:
    ``<name>_<split>_similarity.csv`` (neighbor indices) and
    ``<name>_<split>_similarity_dist.csv`` (cosine distances to them).
    """
    n_neighbors = 20
    out_prefix = "./few_shot_preprocessing/" + name + "_" + split

    knn = NearestNeighbors(
        n_neighbors=n_neighbors,
        metric='cosine',
        algorithm='brute',
        n_jobs=-1,
    )
    knn.fit(train_emb)

    neighbor_idx = knn.kneighbors(eval_emb, return_distance=False)
    neighbor_dist = calc_distances_to_neighbors(train_emb, eval_emb, neighbor_idx)

    # Indices first, then distances, each with pandas' default index column.
    for suffix, table in (
        ("_similarity.csv", neighbor_idx),
        ("_similarity_dist.csv", neighbor_dist),
    ):
        pd.DataFrame(table).to_csv(out_prefix + suffix)

def generate_preprocessing_rand(name, train_emb, eval_emb, split="test"):
    """Store 20 randomly drawn training indices for every evaluation embedding.

    Indices are drawn from a generator seeded with 3600, so repeated runs give
    the same selection. Two CSV files are written to
    ``./few_shot_preprocessing/``: ``<name>_<split>_random.csv`` (indices) and
    ``<name>_<split>_random_dist.csv`` (cosine distances to those rows).
    """
    n_neighbors = 20
    out_prefix = "./few_shot_preprocessing/" + name + "_" + split

    generator = np.random.default_rng(3600)
    picked = generator.integers(
        low=0,
        high=train_emb.shape[0],
        size=(eval_emb.shape[0], n_neighbors),
    )
    picked_dist = calc_distances_to_neighbors(train_emb, eval_emb, picked)

    for suffix, table in (("_random.csv", picked), ("_random_dist.csv", picked_dist)):
        pd.DataFrame(table).to_csv(out_prefix + suffix)


def preprocess_dataset(train_path, test_path, name, split="test"):
    """Embed both CSV files with LASER and write the neighbor tables.

    Prints the number of rows of the train and test file, embeds their
    ``original`` columns, reloads the embeddings and finally writes the
    similarity-based and the random neighbor CSVs for ``split``.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    # NOTE: rows with missing 'original'/'simple' values are not filtered here.
    # train = train[(train['original'].notna()) & (train['simple'].notna()) ]
    # test = test[(test['original'].notna()) & (test['simple'].notna()) ]
    for frame in (train, test):
        print(len(frame))

    # Only SimplifyUR passes a non-empty LASER version argument.
    laser_version = "urd_Arab" if name == "SimplifyUR" else ""

    for frame, part in ((train, "train"), (test, "test")):
        laser_embed(frame, name, part, laser_version)

    train_embeddings = load_laser_embeddings(name, "train")
    test_embeddings = load_laser_embeddings(name, "test")

    for writer in (generate_preprocessing_sim, generate_preprocessing_rand):
        writer(name, train_embeddings, test_embeddings, split)

In [None]:
def calc_bleu_sari(df_ref, sentences):
    """Compute BLEU and SARI of ``sentences`` against the references in ``df_ref``.

    ``df_ref`` holds the source text in column ``original``; every other
    column is treated as a reference, and cells whose value is a ``float``
    (i.e. missing values) are ignored. Rows are grouped by their number of
    available references, each group is scored separately, and the group
    scores are averaged weighted by group size.

    Returns:
        Tuple ``(bleu, sari)``.
    """
    max_refs = df_ref.shape[1] - 1
    assert df_ref.shape[0] == len(sentences)

    # buckets[n - 1] collects all rows that have exactly n references.
    buckets = [{"original": [], "sentences": [], "references": []} for _ in range(max_refs)]

    for (_, row), hypothesis in zip(df_ref.iterrows(), sentences):
        source = row['original']
        refs = [row[col] for col in row.index
                if col != 'original' and type(row[col]) is not float]
        bucket = buckets[len(refs) - 1]
        bucket['original'].append(source)
        bucket['sentences'].append(hypothesis)
        bucket['references'].append(refs)

    counts = np.array([len(bucket['original']) for bucket in buckets])
    weights = np.divide(counts, sum(counts))

    bleu_scores = np.zeros(max_refs)
    sari_scores = np.zeros(max_refs)
    for slot, bucket in enumerate(buckets):
        if not counts[slot] > 0:
            continue
        # Transpose from "one list of references per sentence" to
        # "one list of sentences per reference position".
        references = np.array(bucket['references']).T.tolist()
        bleu_scores[slot] = corpus_bleu(
            bucket['sentences'],
            references,
            force=True,
            tokenize='13a',
            lowercase=True,
        ).score
        sari_scores[slot] = corpus_sari(
            orig_sents=bucket['original'],
            sys_sents=bucket['sentences'],
            refs_sents=references,
            tokenizer='13a',
        )

    return np.dot(bleu_scores, weights), np.dot(sari_scores, weights)

In [None]:
# Endpoint of the BLOOM model that `query` posts its requests to.
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
# headers = {"Authorization": f"Bearer {token}"}
# NOTE: this placeholder overwrites any `token` assigned in an earlier cell.
token = "hf_YOUR-HF-TOKEN"  # TODO: add your Hugging Face token here or use one of the previous methods to authenticate

# The token is sent as a bearer token in the Authorization header.
headers = {"Authorization": f"Bearer {token}"}

# from huggingface_hub import InferenceApi

# inference = InferenceApi("bigscience/bloom",token=HfFolder.get_token())

def query(payload):
    """POST ``payload`` as the ``inputs`` field to ``API_URL``.

    Returns:
        The JSON-decoded body of the response.
    """
    reply = requests.post(API_URL, headers=headers, json={"inputs": payload})
    return reply.json()


def load_fewshot_examples(train, test, mapping, offset=0):
    """Attach demonstration pairs from ``train`` to every row of ``test``.

    Args:
        train: DataFrame with ``original`` and ``simple`` columns.
        test: DataFrame with ``original`` and ``simple`` columns.
        mapping: DataFrame whose row ``j`` lists positions into ``train`` for
            test row ``j``. The first column is always skipped (presumably the
            index column written by ``to_csv`` - confirm against the CSV).
        offset: Columns at position ``<= offset`` are skipped as well.

    Returns:
        DataFrame with columns ``original``, ``ref`` and, per demonstration
        ``n``, ``ex<n>_orig`` / ``ex<n>_simp``.
    """
    columns = defaultdict(list)
    pairs = zip(test['original'], test['simple'])
    for row_no, (source, reference) in enumerate(pairs):
        columns['original'].append(source)
        columns['ref'].append(reference)
        slot = 0
        for position, train_idx in enumerate(mapping.iloc[row_no]):
            if position == 0 or position <= offset:
                continue
            demo = train.iloc[train_idx]
            columns["ex" + str(slot) + "_orig"].append(demo["original"])
            columns["ex" + str(slot) + "_simp"].append(demo["simple"])
            slot += 1

    return pd.DataFrame(columns)
    
def construct_example(example_row, k=3):
    """Build the few-shot prompt for one row.

    The prompt consists of ``k`` demonstration pairs taken from the
    ``ex<n>_orig`` / ``ex<n>_simp`` fields, followed by the row's ``original``
    text and an opened ``Simple: "`` line for the model to continue.
    """
    pieces = []
    for shot in range(k):
        pieces += [
            "Original: \"", example_row["ex" + str(shot) + "_orig"], "\"\n",
            "Simple: \"", example_row["ex" + str(shot) + "_simp"], "\"\n\n",
        ]
    pieces += ["Original: \"", example_row["original"], "\"\nSimple: \""]
    return "".join(pieces)

# Maximum number of API calls spent on a single input sentence.
REQUERY_LIMIT = 5
def generate_fewshot(example_row, k=3):
    """Generate a simplification for one row with a ``k``-shot prompt.

    The prompt is sent to the model; the text returned after the prompt is
    accumulated. Generation stops as soon as the accumulated text contains a
    closing quote followed by a newline, or the start of a new ``Original:``
    block; the text before that marker is returned. Otherwise the new text is
    appended to the prompt and the model is queried again, at most
    ``REQUERY_LIMIT`` times in total.

    Args:
        example_row: Row providing ``original`` and the ``ex<n>_*`` fields.
        k: Number of demonstrations to put into the prompt.

    Returns:
        The generated simplification (possibly unterminated if the limit of
        requests was reached).

    Raises:
        RuntimeError: If the API answers with an error object.
    """
    ex = construct_example(example_row, k=k)

    new_total = ""
    for _ in range(REQUERY_LIMIT):
        response = query(ex)
        if isinstance(response, dict) and "error" in response:
            # Keep the log line, then fail with a meaningful message instead
            # of the opaque KeyError that indexing the dict with [0] produced.
            print(response)
            raise RuntimeError("BLOOM inference request failed: " + str(response["error"]))

        res = response[0]['generated_text']
        # The reply repeats the prompt; keep only the continuation.
        new = res[len(ex):]
        new_total += new
        if "\"\n" in new_total:
            # print("newline found")
            return new_total.split("\"\n")[0]
        elif "Original:" in new_total:
            # print("original found")
            return new_total.split("Original:")[0]
        else:
            # No end marker yet: let the model continue from where it stopped.
            ex += new
    return new_total

def fewshot_eval(train_path, test_path, preprocessed_path, k=3, output_csv="", checkpoint=""):
    """Generate ``k``-shot simplifications for the test set and score them.

    If ``checkpoint`` names an existing CSV, its ``fewshot output`` column is
    reused: entries that are present are kept, missing ones (read back as
    float NaN, which is also how empty strings come back from a CSV) are
    generated again. Rows not covered by the checkpoint are generated as well.
    A failed generation is stored as an empty string so that outputs stay
    aligned with the inputs.

    Args:
        train_path: CSV with the demonstration pool.
        test_path: CSV with the sentences to simplify and their references.
        preprocessed_path: CSV mapping each test row to rows of the pool.
        k: Number of demonstrations per prompt.
        output_csv: If non-empty, the outputs are written to this path.
        checkpoint: Optional CSV of earlier outputs to resume from.

    Returns:
        Tuple ``(bleu, sari)`` as computed by ``calc_bleu_sari``.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    preprocessed = pd.read_csv(preprocessed_path)
    examples = load_fewshot_examples(train, test, preprocessed)

    previous = []
    if checkpoint != "" and os.path.exists(checkpoint):
        previous = list(pd.read_csv(checkpoint)['fewshot output'])

    sentences = []
    for i_tqdm in tqdm(range(len(examples))):
        # Reuse the stored output when the checkpoint has one for this row.
        if i_tqdm < len(previous) and not isinstance(previous[i_tqdm], float):
            sentences.append(previous[i_tqdm])
            continue
        try:
            generated = generate_fewshot(examples.iloc[i_tqdm], k)
        except Exception:
            # Best effort: leave a blank that a later run can fill in.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts
            # working during the long-running API loop.
            generated = ""
        sentences.append(generated)

    if output_csv != "":
        output_df = pd.DataFrame({"original": list(test['original']), "fewshot output": sentences})
        output_df.to_csv(output_csv, index=False)
    bleu, sari = calc_bleu_sari(test, sentences)
    return bleu, sari

# Try k-shots to fill in blanks, but if the error persists try k-=1
def few_shot_backoff(train_path, test_path, preprocessed_path, k=3, output_csv="", checkpoint=""):
    """Fill in missing outputs, backing off to fewer demonstrations on errors.

    Outputs already present in the ``fewshot output`` column of ``checkpoint``
    are kept. For every missing one (float NaN, or a row the checkpoint does
    not cover, or all rows when there is no checkpoint) generation is tried
    with ``k`` demonstrations, then ``k - 1``, ... down to ``0``. If every
    attempt fails, the input is reported and an empty string is stored so
    that outputs stay aligned with the inputs.

    Args:
        train_path: CSV with the demonstration pool.
        test_path: CSV with the sentences to simplify and their references.
        preprocessed_path: CSV mapping each test row to rows of the pool.
        k: Largest number of demonstrations to try.
        output_csv: If non-empty, the outputs are written to this path.
        checkpoint: Optional CSV of earlier outputs to resume from.

    Returns:
        Tuple ``(bleu, sari)`` as computed by ``calc_bleu_sari``.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    preprocessed = pd.read_csv(preprocessed_path)
    examples = load_fewshot_examples(train, test, preprocessed)

    previous = []
    if checkpoint != "" and os.path.exists(checkpoint):
        previous = list(pd.read_csv(checkpoint)['fewshot output'])

    sentences = []
    for i_sent in tqdm(range(len(examples))):
        if i_sent < len(previous) and not isinstance(previous[i_sent], float):
            sentences.append(previous[i_sent])
            continue

        generated = ""
        for curr_k in range(k, -1, -1):
            try:
                generated = generate_fewshot(examples.iloc[i_sent], curr_k)
                # print(i_sent, generated)
                break
            except Exception:
                # Retry with one demonstration less (shorter prompt).
                continue
        else:
            # Every k down to zero failed for this input.
            print(f"ERROR ON INPUT: {examples.iloc[i_sent]['original']}")
        sentences.append(generated)

    if output_csv != "":
        output_df = pd.DataFrame({"original": list(test['original']), "fewshot output": sentences})
        output_df.to_csv(output_csv, index=False)
    bleu, sari = calc_bleu_sari(test, sentences)
    return bleu, sari


In [None]:
# Quick manual check of `query`: send a short prompt and display the raw reply.
query("The sense of life is ")

In [None]:
# Score table: the first entry is the header row, every later entry is one
# experiment (dataset name, demonstration type, k, scores). The evaluation
# loop in this notebook appends rows with the scores of `fewshot_eval` first
# and those of `few_shot_backoff` second, hence the repeated bleu/sari columns.
# The rows listed here are hard-coded values, presumably from earlier runs.
result_scores = [
    ['name', "type", 'k', 'bleu', 'sari', 'bleu', 'sari'], 
     # ['TextComplexityDE', 'similarity', 0, 10.498930834393574, 35.270106181558134, 10.498930834393574, 35.270106181558134], 
    ['TextComplexityDE', 'similarity', 10, 7.205176465320519e-20, 27.05798251998486, 21.468699660404642, 39.8973797381036], 
     ['TextComplexityDE', 'random', 0, 8.295951045328993, 34.93129793635445, 8.295951045328993, 34.93129793635445],
    ['TextComplexityDE', 'random', 10, 0.0, 26.674574612964037, 15.973901603823865, 35.50831232720633],
    
    ['GEOLino', 'random', 0, 29.117056851318097, 28.713748042875253, 29.117056851318097, 28.713748042875253], 
    ['GEOLino', 'random', 10, 51.56820676420725, 37.18657466545419, 51.56820676420725, 37.18657466545419],
    ['GEOLino', 'similarity', 10, 12.141214691406692, 28.219600294595057, 49.97869326480118, 40.63871137249447], 
    # ['GEOLino', 'random', 0, 29.117056851318097, 28.713748042875253, 29.117056851318097, 28.713748042875253], 
    
     # ['GEOLino-full', 'similarity', 0, 29.501461657873758, 32.11658520872407, 29.501461657873758, 32.11658520872407],
    
    ['GEOLino-full', 'random-zero', 0, 29.501461657873758, 32.11658520872407, 29.501461657873758, 32.11658520872407],
    ['GEOLino-full', 'random', 10, 46.84634239044782, 38.843746957264734, 48.7504291447407, 39.10700124798857],
    ['GEOLino-full', 'similarity', 10, 22.540366238748927, 37.8468078899627, 58.02791540402962, 50.112829617920376],
    
    ['simple-german-corpus', 'random-zero', 0, 4.078741855153335, 31.311442966902234, 4.078741855153335, 31.311442966902234],
    ['simple-german-corpus', 'random', 10, 4.123620228852482, 33.49020474271444, 5.022534384992534, 32.49837499073984],
    ['simple-german-corpus', 'similarity', 10, 3.179767831264162, 40.01604919508432, 14.107995580454697, 44.685470973766314],
     
    ['DEplain-APA', 'random', 0, 17.23222418819839, 35.191632840161695, 17.23222418819839, 35.191632840161695],
    ['DEplain-APA', 'random', 10, 19.2266957201598, 35.52235825908088, 19.2266957201598, 35.52235825908088],
    ['DEplain-APA', 'similarity', 10, 21.616008179234996, 41.29645080601255, 22.205236460328297, 41.20599406417358],
    
    ['DEplain-web', 'random', 0, 11.444314254398583, 30.76291493800329, 11.444314254398583, 30.76291493800329],
    ['DEplain-web', 'random', 10, 3.20, 33.43, 11.54, 30.97],
    ['DEplain-web', 'similarity', 10, 0.10, 33.97, 12.07, 37.10],
    
    ['BiSECT', 'random', 10, 0.0, 26.25786416446166, 16.309267554152836, 37.27606416280923], 
    ['BiSECT', 'similarity', 10, 0.0, 26.25786416446166, 15.936765840861158, 37.31072088667093],
    
]

In [None]:
# result_scores = [["name", "type", "k", "bleu", "sari", "bleu", "sari"]]
# Preprocess every German corpus, run BLOOM in each prompting setting and
# append one row per setting to `result_scores`.
for data_path in [
    "../../data/German/GEOLino Corpus_",
    "../../data/German/TextComplexityDE Parallel Corpus_",
    "../../data/German/GEOLino-full_",
    "../../data/German/TextComplexityDE-full_",  # only zero-shot
    "../../data/German/simple-german-corpus_",
    "../../data/German/DEplain-APA_",
    "../../data/German/DEplain-web_",
    "../../data/German/BiSECT_",
    "../../data/German/ABGB_",  # only zero-shot
    "../../data/German/APA-LHA-or-a2_",
    "../../data/German/APA-LHA-or-b1_",
]:
    train_set = data_path + "train.csv"
    test_set = data_path + "test.csv"

    # File-name stem without the trailing "_", cut at the first space,
    # e.g. "GEOLino Corpus_" -> "GEOLino".
    name = data_path.split("/")[-1][:-1].split(" ")[0]
    print(name)
    zero_shot_only = "TextComplexityDE-full_" in data_path or "ABGB" in data_path
    if zero_shot_only:
        # These corpora use the test file as demonstration pool as well
        # (presumably because they have no train split).
        train_set = data_path + "test.csv"

    os.makedirs("../../fewshot-outputs/" + name, exist_ok=True)

    preprocess_dataset(train_set, test_set, name)

    if zero_shot_only:
        # Demonstrations would be drawn from the test data itself, so only the
        # zero-shot setting is run; this matches the export cell, which
        # expects nothing but "0.rand" for these corpora.
        settings = [("random", 0)]
    else:
        settings = [("random", 0), ("random", 10), ("similarity", 10)]

    for demonstration, k in settings:
        split = "test"

        # for k in [0]:
        print(name, demonstration, k, split)
        result_row = [name, demonstration, k]
        print("TESTING " + str(k) + "-SHOT:")
        mapping = "./few_shot_preprocessing/" + name + "_" + split + "_" + demonstration + ".csv"
        dem = "sim" if (demonstration == "similarity") else ("rand" if (demonstration == "random") else "unk")
        output = "../../fewshot-outputs/" + name + "/" + str(k) + "." + dem + ".csv"

        # First pass: plain k-shot generation, resuming from earlier outputs.
        bleu, sari = fewshot_eval(train_set, test_set, mapping, k=k, output_csv=output, checkpoint=output)
        print("BLEU", bleu)
        print("SARI", sari)
        result_row.extend([bleu, sari])

        # Second pass: retry the blanks of the first pass with fewer shots.
        bleu, sari = few_shot_backoff(train_set, test_set, mapping, k=k, output_csv=output, checkpoint=output)
        print("BLEU", bleu)
        print("SARI", sari)
        result_row.extend([bleu, sari])
        result_scores.append(result_row)
        print(result_scores)
        
    


In [None]:
# Dump the accumulated score table (header row first) as a plain list.
print(result_scores)

In [None]:
# Show the scores as a table rounded to two decimals; the first list entry
# supplies the column names.
pd.DataFrame(result_scores[1:], columns=result_scores[0]).round(2)

In [None]:
# Write the same rounded table to "result_scores.csv" (relative path).
pd.DataFrame(result_scores[1:], columns=result_scores[0]).round(2).to_csv("result_scores.csv")

In [None]:
# few_shot_backoff("../data/Urdu/SimplifyUR_train.csv", "../data/Urdu/SimplifyUR_test.csv", "./few_shot_preprocessing/SimplifyUR_test_similarity.csv", k=5, output_csv="../../fewshot-outputs/SimplifyUR/5.sim.csv", checkpoint="../../fewshot-outputs/SimplifyUR/5.sim.csv")

In [None]:
# Export the stored BLOOM outputs as plain text files, one output per line.
# Each entry is (directory name under ../../fewshot-outputs, target directory name).
for input_path in [
    # "../../data/German/GEOLino Corpus_",
    # "../../data/German/TextComplexityDE Parallel Corpus_",
    # "../../data/German/GEOLino-full_",
    # "../../data/German/TextComplexityDE-full_", # only zero-shot
    # "../../data/German/simple-german-corpus_",
    # "../../data/German/DEplain-APA_",
    # "../../data/German/DEplain-web_",
    ("BiSECT", "BiSECT"),
    ("ABGB", "ABGB"),  # only zero-shot
    # "../../data/German/APA-LHA-or-a2_",
    # "../../data/German/APA-LHA-or-b1_",
    ("TextComplexityDE-full", "TextComplexityDE"),
    ("TextComplexityDE", "tcde-small"),
    ("GEOLino-full", "geolino"),
    ("GEOLino", "geolino-small"),
]:
    # _,_, d, lang, name = input_path.split("/")
    name, output_name = input_path
    if "TextComplexityDE-full" in name or "ABGB" in name:
        t_list = [("0.rand", "0-random")]
    else:
        t_list = [("0.rand", "0-random"), ("10.rand", "10-random"), ("10.sim", "10-similarity")]
    for t, t_name in t_list:
        print(name)
        data = pd.read_csv("../../fewshot-outputs/"+name+"/"+t+".csv")
        # print(data.head())
        out_path = "/home/SSD1TB/easse-de-clean/easse-de/easse/resources/data/system_outputs/sentence_level/"+output_name+"/test/BLOOM-"+t_name+".txt"
        with open(out_path, "w", encoding="utf-8") as f:
            for text in data["fewshot output"]:
                # Empty outputs come back from the CSV as NaN; write an empty
                # line for them so the line numbers stay aligned with the inputs.
                text = "" if pd.isna(text) else str(text)
                f.write(text.replace("\n", " ") + "\n")