## Reproduction based on Checkpoints

We regenerate the outputs of TS systems for which a public checkpoint exists on Hugging Face, i.e., 
- https://huggingface.co/DEplain/trimmed_mbart_sents_apa,
- https://huggingface.co/DEplain/trimmed_mbart_sents_apa_web, and
- https://huggingface.co/josh-oo/custom-decoder-ats

For the first two models, we use the Hugging Face pipeline "text2text-generation". 
For the last model, we followed the instructions in the model repo. We added truncation to a max_length of 1024, as some test set records are too long.

In [None]:
!pip install datasets
!pip install transformers
!pip install pandas
!pip install torch

In [None]:

from datasets import load_dataset, Dataset
# Use a pipeline as a high-level helper
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import pandas as pd
import os

In [None]:
# Request synchronous CUDA kernel launches so that device-side errors are
# reported at the call that triggered them. A plain Python variable is not
# visible to the CUDA runtime, so the flag is placed in the process
# environment instead (it should be set before CUDA is first used).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
def generate_predictions(pipe, test_name_path, model_name, test_set_path="", sys_out_path=""):
    """Run ``pipe`` over one test set and write one prediction per line.

    Args:
        pipe: Callable pipeline. It is called on a ``KeyDataset`` over the
            ``"original"`` column, and every result is read as
            ``out[0]["generated_text"]``.
        test_name_path: Path of the test file relative to ``test_set_path``.
            Its first ``/``-separated component names the output directory.
        model_name: Stem of the output file (``<model_name>.txt``).
        test_set_path: Prefix prepended verbatim to ``test_name_path``.
        sys_out_path: Prefix prepended verbatim to the output directory.

    Returns:
        Always ``1``.
    """
    source_file = test_set_path + test_name_path
    print(source_file)
    test_dataset = load_dataset("csv", sep="\t", column_names=["original"], data_files={"test": source_file})

    # NOTE(review): only the first path component is used, so
    # "DEplain-web/manual-public/..." is written to "DEplain-web/test/".
    # Other cells in this notebook use "DEplain-web_manual-public" instead;
    # confirm which directory name is intended.
    test_name = test_name_path.split("/")[0]

    predictions = [out[0]["generated_text"] for out in pipe(KeyDataset(test_dataset["test"], "original"))]

    out_dir = sys_out_path + test_name + "/test/"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(out_dir, exist_ok=True)
    out_file = out_dir + model_name + ".txt"
    print(test_name, model_name, out_file)

    # The predictions are German text; write UTF-8 explicitly instead of
    # relying on the platform's default encoding.
    with open(out_file, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in predictions)
    return 1

### Sentence Level

In [None]:
# Root of the EASSE resources. Point this at your own checkout, e.g.
# "/home/SSD1TB/easse-de/easse/resources/data/".
base_name_easse = "../../resources/data/"
# Sentence-level inputs and the tree the system outputs are written to.
test_set_path = f"{base_name_easse}test_sets/sentence_level/"
system_out_path = f"{base_name_easse}system_outputs/sentence_level/"

In [None]:
# Sentence-level test files, each given relative to `test_set_path`.
test_sets_sentence_level = [
    "TextComplexityDE/TextComplexityDE_test.org",
    "ZEST/geolino.test.org",
    "BiSECT/BiSECT_test.org",
    "DEplain-web/manual-public/DEplain-web-manual-public.test.org",
    # available upon request
    "DEplain-APA/DEplain-APA.test.org",
    # preprocessing required
    "simple-german-corpus/simple-german-corpus_test.org",
    # available upon request
    "APA_LHAor-a2/APA_LHAor-a2_test.org",
    # available upon request
    "APA_LHAor-b1/APA_LHAor-b1_test.org",
    "ABGB/ABGB_test.org",
    # Currently disabled:
    #   "DEplain-APA-ref/DEplain-APA-ref_test.org",
    #   "DEplain-web-ref/DEplain-web-ref_test.org",
    #   "hda_easy_to_read_language/hda_easy_to_read_language_test.org"
]

In [None]:
import torch

# Checkpoints that are run through the generic text2text-generation pipeline.
models = ["DEplain/trimmed_mbart_sents_apa_web", "DEplain/trimmed_mbart_sents_apa"]

# Pipelines take a device index: 0 is the first GPU, -1 is the CPU. Fall back
# to the CPU when CUDA is unavailable, like the other inference cells do.
pipeline_device = 0 if torch.cuda.is_available() else -1

for model_name in models:
    # NOTE(review): trust_remote_code=True allows code shipped with the model
    # repository to be executed; keep it only for checkpoints you trust.
    pipe = pipeline("text2text-generation", model=model_name, device=pipeline_device, trust_remote_code=True)
    # "/" would be read as a directory separator in the output file name.
    model_name_out = model_name.replace("/", "_")
    for test_data in test_sets_sentence_level:
        print(test_data)
        generate_predictions(pipe=pipe, test_name_path=test_data, model_name=model_name_out,
                             test_set_path=test_set_path, sys_out_path=system_out_path)

In [None]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

# Input-side tokenizer and the seq2seq model (pinned to a fixed revision).
tokenizer = AutoTokenizer.from_pretrained("josh-oo/custom-decoder-ats")
model = AutoModelForSeq2SeqLM.from_pretrained("josh-oo/custom-decoder-ats", trust_remote_code=True, revision="4accedbe0b57d342d95ff546b6bbd3321451d504")

# Output-side tokenizer, extended with the three extra marker tokens.
decoder_tokenizer = AutoTokenizer.from_pretrained("josh-oo/german-gpt2-easy")
decoder_tokenizer.add_tokens(['<</s>>','<<s>>','<<pad>>'])
model_name_out = "josh-oo/custom-decoder-ats".replace("/", "_")

# example_text = "In tausenden Schweizer Privathaushalten kümmern sich Haushaltsangestellte um die Wäsche, betreuen die Kinder und sorgen für Sauberkeit. Durchschnittlich bekommen sie für die Arbeit rund 30 Franken pro Stunde Bruttolohn. Der grösste Teil von ihnen erhält aber 28 Franken."

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

for test_data in test_sets_sentence_level:
    test_dataset = load_dataset("csv", sep="\t", column_names=["original"],  data_files={"test": test_set_path+test_data})
    # NOTE(review): only the first path component names the output directory
    # here, while the clean-up cell uses "DEplain-web_manual-public" for the
    # DEplain-web test set; confirm which directory name is intended.
    test_name = test_data.split("/")[0]

    # Create the target directory up front so this cell also works when no
    # earlier cell has produced it.
    out_dir = system_out_path + test_name + "/test/"
    os.makedirs(out_dir, exist_ok=True)

    predictions = []
    for text in test_dataset["test"]["original"]:
        test_input = tokenizer([text], return_tensors="pt", padding=True, pad_to_multiple_of=1024)
        test_input = {key: value.to(device) for key, value in test_input.items()}
        output = model.generate(**test_input, num_beams=3, max_length=1024)
        # batch_decode returns a list; it is stored as-is, so every output
        # line is the repr of a one-element list. The clean-up cell strips
        # that wrapping afterwards.
        predictions.append(decoder_tokenizer.batch_decode(output))

    # German text: write UTF-8 explicitly rather than the platform default.
    with open(out_dir + model_name_out + ".txt", "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in predictions)

In [None]:
model_name_out = "josh-oo/custom-decoder-ats".replace("/", "_")

# The generation cell writes each prediction as the repr of a one-element
# list, e.g. ['<s> ... <</s>>']. Python switches to double quotes when the
# text contains an apostrophe, so both quote styles are handled. The markers
# are removed in this order; the bracketed forms must come before the bare
# "<s>" / "</s>" or their quote-and-bracket residue would be left behind.
decoder_markup = (
    "['<s>",
    '["<s>',
    "<s>",
    "<</s>>']",
    '<</s>>"]',
    "</s>']",
    '</s>"]',
    "</s>",
)

for test_data in test_sets_sentence_level:
    test_name_elements = test_data.split("/")
    # NOTE(review): the sentence-level generation cells write to the first
    # path component only ("DEplain-web/test/"); confirm that the joined
    # directory name used here is the one that holds the raw outputs.
    if "DEplain-web" in test_name_elements:
        test_name = "_".join(test_name_elements[0:2])
    else:
        test_name = test_name_elements[0]
    out_dir = system_out_path + test_name + "/test/"

    # Skip files produced by an earlier run of this cell; otherwise they would
    # be cleaned again into "*_clean.txt_clean.txt".
    result_files = [
        name for name in os.listdir(out_dir)
        if model_name_out in name and not name.endswith("_clean.txt")
    ]
    print(result_files)
    for filename in result_files:
        with open(out_dir + filename, "r", encoding="utf-8") as f:
            content = f.read()
        for marker in decoder_markup:
            content = content.replace(marker, "")
        clean_path = out_dir + filename + "_clean.txt"
        print(clean_path)
        with open(clean_path, "w", encoding="utf-8") as f:
            f.write(content)
            

In [None]:
# easse-de/easse/resources/data/system_outputs/sentence_level/DEplain-web/test/josh-oo_custom-decoder-ats.txt

## Document-Level Corpora

In [None]:
import os

In [None]:
# Root of the EASSE resources, relative to this notebook.
base_name_easse = "../../resources/data/"
# Document-level inputs and the tree the system outputs are written to.
test_set_path = f"{base_name_easse}test_sets/document_level/"
system_out_path = f"{base_name_easse}system_outputs/document_level/"

In [None]:
# Document-level test files, each given relative to `test_set_path`.
test_sets_document_level = [
    "20Minuten/20Minuten_test.org",
    "DEplain-APA/DEplain-APA.test.org",
    # Currently disabled:
    #   "hda_easy_to_read_language/hda_easy_to_read_langauge_test.org",
    "klexikon/klexikon_test.org",
    "DEplain-web/auto-public/DEplain-web-auto-public.test.org",
    "DEplain-web/manual-public/DEplain-web-manual-public.test.org",
]

In [None]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Alternative decoder variant (gerpt), kept for reference:
# model = AutoModelForSeq2SeqLM.from_pretrained("josh-oo/custom-decoder-ats", trust_remote_code=True, revision="35197269f0235992fcc6b8363ca4f48558b624ff")
# decoder_tokenizer = AutoTokenizer.from_pretrained("josh-oo/gerpt2")

# Variant in use (dbmdz): input-side tokenizer and the model at a pinned revision.
tokenizer = AutoTokenizer.from_pretrained("josh-oo/custom-decoder-ats")
model = AutoModelForSeq2SeqLM.from_pretrained("josh-oo/custom-decoder-ats", trust_remote_code=True, revision="4accedbe0b57d342d95ff546b6bbd3321451d504")

# Output-side tokenizer, extended with the three extra marker tokens.
decoder_tokenizer = AutoTokenizer.from_pretrained("josh-oo/german-gpt2-easy")
decoder_tokenizer.add_tokens(['<</s>>','<<s>>','<<pad>>'])
model_name_out = "josh-oo/custom-decoder-ats".replace("/", "_")

# example_text = "In tausenden Schweizer Privathaushalten kümmern sich Haushaltsangestellte um die Wäsche, betreuen die Kinder und sorgen für Sauberkeit. Durchschnittlich bekommen sie für die Arbeit rund 30 Franken pro Stunde Bruttolohn. Der grösste Teil von ihnen erhält aber 28 Franken."

model.to(device)
model.eval()

for test_data in test_sets_document_level:
    test_dataset = load_dataset("csv", sep="\t", column_names=["original"],  data_files={"test": test_set_path+test_data})
    test_name_elements = test_data.split("/")
    # DEplain-web has several sub-corpora listed; include the second path
    # component so their outputs land in separate directories.
    if "DEplain-web" in test_name_elements:
        test_name = "_".join(test_name_elements[0:2])
    else:
        test_name = test_name_elements[0]
    print(test_data, len(test_dataset["test"]))

    # Create the target directory before the long generation loop so that a
    # path problem shows up immediately rather than after inference.
    out_dir = system_out_path + test_name + "/test/"
    os.makedirs(out_dir, exist_ok=True)

    predictions = []
    for text in test_dataset["test"]["original"]:
        # Every document is padded or truncated to exactly 4096 tokens.
        test_input = tokenizer([text], return_tensors="pt", padding="max_length", truncation=True, max_length=4096)
        print(len(text), len(text.split(" ")), test_input["input_ids"].shape)
        test_input = {key: value.to(device) for key, value in test_input.items()}
        output = model.generate(**test_input, num_beams=3, max_length=1024)
        # batch_decode returns a list; it is stored as-is, so every output
        # line is the repr of a one-element list. The clean-up cell strips
        # that wrapping afterwards.
        predictions.append(decoder_tokenizer.batch_decode(output))

    # German text: write UTF-8 explicitly rather than the platform default.
    with open(out_dir + model_name_out + "_trunc_4096.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in predictions)

In [None]:
import os

In [None]:
model_name_out = "josh-oo/custom-decoder-ats".replace("/", "_")

# The generation cell writes each prediction as the repr of a one-element
# list, e.g. ['<s> ... <</s>>']. Python switches to double quotes when the
# text contains an apostrophe, so both quote styles are handled. The markers
# are removed in this order; the bracketed forms must come before the bare
# "<s>" / "</s>" or their quote-and-bracket residue would be left behind.
decoder_markup = (
    "['<s>",
    '["<s>',
    "<s>",
    "<</s>>']",
    '<</s>>"]',
    "</s>']",
    '</s>"]',
    "</s>",
)

for test_data in test_sets_document_level:
    test_name_elements = test_data.split("/")
    # Same directory naming as the document-level generation cell.
    if "DEplain-web" in test_name_elements:
        test_name = "_".join(test_name_elements[0:2])
    else:
        test_name = test_name_elements[0]
    out_dir = system_out_path + test_name + "/test/"

    # Skip files produced by an earlier run of this cell; otherwise they would
    # be cleaned again into "*_clean.txt_clean.txt".
    result_files = [
        name for name in os.listdir(out_dir)
        if model_name_out in name and not name.endswith("_clean.txt")
    ]
    for filename in result_files:
        with open(out_dir + filename, "r", encoding="utf-8") as f:
            content = f.read()
        for marker in decoder_markup:
            content = content.replace(marker, "")
        with open(out_dir + filename + "_clean.txt", "w", encoding="utf-8") as f:
            f.write(content)
            