# Preparation

In [None]:
!pip install datasets accelerate sentencepiece transformers torch scikit-learn numpy tensorboardX evaluate sacrebleu huggingface_hub jiwer

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting sacrebleu
  Downloading sacrebleu-

In [None]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, AutoTokenizer, get_scheduler, BitsAndBytesConfig, GenerationConfig, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, MT5ForConditionalGeneration, MT5Tokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset, Dataset
from huggingface_hub import login, logout
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Load train.csv through the CSV loader and drop the `src` column.
# (The same file is loaded again, and subsampled, in a later cell.)
dataset_train = load_dataset("csv", data_files="train.csv", split='train').remove_columns("src")

In [None]:
# Load valid.csv through the CSV loader and drop the `src` column.
# (The same file is loaded again, and subsampled, in a later cell.)
dataset_valid = load_dataset("csv", data_files="valid.csv", split='train').remove_columns("src")

# t5-small

In [None]:
# --- Run configuration for the t5-small experiment ---

# Checkpoint to fine-tune and the directory used for checkpoints/logs.
MODEL = 't5-small'
OUT_DIR = 'results_t5small'

# Data pipeline: worker processes for `Dataset.map` and the token length that
# inputs and targets are truncated/padded to.
NUM_PROCS = 16
MAX_LENGTH = 256

# Optimisation hyper-parameters.
EPOCHS = 5
BATCH_SIZE = 48
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 2e-2

In [None]:
# Load the tokenizer that belongs to the MODEL checkpoint.
tokenizer = T5Tokenizer.from_pretrained(MODEL)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Function to convert text data into model inputs and targets
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs into encoder inputs and labels.

    Args:
        examples: Batch mapping with the columns `non_incl_sent` (source
            sentences) and `auto_incl_sent` (target sentences).

    Returns:
        The tokenizer output for the prefixed source sentences, with an added
        `labels` entry holding the target token ids. Both sides are truncated
        and padded to MAX_LENGTH; padded label positions are set to -100 so
        that they are ignored by the loss.
    """
    inputs = [f"convert to inclusive sentence: {non_incl_sent}" for non_incl_sent in examples['non_incl_sent']]
    targets = list(examples["auto_incl_sent"])

    # `text_target` tokenizes the targets in the same call and stores their ids
    # under "labels"; it replaces the deprecated `as_target_tokenizer()` context.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Labels are padded to a fixed length, so the padding must be masked out:
    # otherwise the loss is dominated by trivially predictable pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

In [None]:
import evaluate, numpy as np
# Metrics used by compute_metrics below: SacreBLEU and word error rate.
sacrebleu_metric = evaluate.load("sacrebleu")
wer_metric = evaluate.load("wer")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Returns a pair `(preds, labels)` where `preds` is a flat list of stripped
    strings and every stripped label is wrapped in its own single-item list.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU, WER and mean generated length for one evaluation run.

    Args:
        eval_preds: Pair `(preds, labels)` of token-id arrays. `preds` may be a
            tuple, in which case its first element is used.

    Returns:
        Dict with `bleu` (SacreBLEU score), `wer` (word error rate) and
        `gen_len` (mean number of non-pad tokens per prediction), each rounded
        to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded. It can appear in the
    # predictions as well as in the labels, so map it to the pad id in both.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # SacreBLEU expects a list of references per prediction (as produced by
    # postprocess_text); WER expects one flat reference string per prediction.
    bleu_result = sacrebleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_score = round(bleu_result["score"], 4)

    flat_references = [reference for references in decoded_labels for reference in references]
    wer_score = round(wer_metric.compute(predictions=decoded_preds, references=flat_references), 4)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    gen_len = round(np.mean(prediction_lens), 4)

    result = {"bleu": bleu_score, "wer": wer_score, "gen_len": gen_len}
    return result

In [None]:
# Smoke test: run compute_metrics on random token ids to check that the metric
# pipeline executes end to end. A seeded generator keeps the printed numbers
# identical across runs.
rng = np.random.default_rng(0)
sample_preds = rng.integers(0, 100, size=(10, 20))
sample_labels = rng.integers(0, 100, size=(10, 20))

sample_eval_preds = (sample_preds, sample_labels)
result = compute_metrics(sample_eval_preds)

print(result)

{'bleu': 0.4994, 'wer': 1.04, 'gen_len': 19.7}


In [None]:
import os

from huggingface_hub import login, logout

# Never hard-code the access token in the notebook. Read it from the HF_TOKEN
# environment variable; when it is not set, `login` receives `None` and asks
# for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Reload train.csv, drop the `src` column and keep the first 60,000 rows.
dataset_train = load_dataset("csv", data_files="train.csv", split='train').remove_columns("src").select(range(60000))

In [None]:
# Reload valid.csv, drop the `src` column and keep the first 6,000 rows.
dataset_valid = load_dataset("csv", data_files="valid.csv", split='train').remove_columns("src").select(range(6000))

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [None]:
# Tokenize both splits batch-wise with preprocess_function, spreading the work
# over NUM_PROCS worker processes.
tokenized_train = dataset_train.map(preprocess_function, batched=True, num_proc=NUM_PROCS)
tokenized_valid = dataset_valid.map(preprocess_function, batched=True, num_proc=NUM_PROCS)

Map (num_proc=16):   0%|          | 0/60000 [00:00<?, ? examples/s]



Map (num_proc=16):   0%|          | 0/6000 [00:00<?, ? examples/s]



In [None]:
# Load the pretrained checkpoint and move it to the GPU when one is available.
model = T5ForConditionalGeneration.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Report the total and the trainable parameter counts.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

# Generation settings belong on `generation_config`. Setting them on
# `model.config` is deprecated and is what produces the repeated
# "Non-default generation parameters" warning on every checkpoint save.
model.generation_config.max_length = MAX_LENGTH
print("Max length for generation:", model.generation_config.max_length)

60,506,624 total parameters.
60,506,624 training parameters.
Max length for generation: 256


In [None]:
# Training configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Output location and logging (checkpoints and logs share OUT_DIR).
    output_dir=OUT_DIR,
    logging_dir=OUT_DIR,
    logging_steps=10,
    report_to='tensorboard',
    # Optimisation.
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=500,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,
    dataloader_num_workers=4,
    # Evaluation and checkpointing, both every 500 steps.
    do_train=True,
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Evaluate on generated sequences so compute_metrics receives token ids.
    predict_with_generate=True,
)

# Trainer wiring: model, tokenizer, tokenized splits and the metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Run fine-tuning; the value returned by `train()` is kept in `t5_history`.
t5_history = trainer.train()

Step,Training Loss,Validation Loss,Bleu,Wer,Gen Len
500,0.0195,0.015295,93.2753,0.0427,62.7732
1000,0.0128,0.010865,94.7989,0.0312,62.7715
1500,0.0121,0.009365,95.0581,0.0295,62.266
2000,0.0105,0.008743,95.2607,0.0286,62.1432
2500,0.009,0.008251,95.3219,0.0274,62.2098
3000,0.0096,0.007733,95.4354,0.0268,62.2532
3500,0.0071,0.00752,95.5026,0.0261,62.3815
4000,0.0072,0.007318,95.5419,0.0259,62.5688
4500,0.0074,0.007067,95.6607,0.0254,62.7457
5000,0.0076,0.006849,95.6779,0.0251,62.892


Checkpoint destination directory results_t5small/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 256}
Checkpoint destination directory results_t5small/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 256}
Non-default generation parameters: {'max_length': 256}
Non-default generation parameters: {'max_length': 256}
Non-default generation parameters: {'max_length': 256}
Non-default generation parameters: {'max_length': 256}
Non-default generation parameters: {'max_length': 256}
Non-default generation parameters: {'max_length': 256}
Non-default generation parameters: {'max_length': 256}
Non-default generation parameters: {'max_length': 256}
Non-default generation parameters: {'max_length': 256}
Non-default generation parameters: {'max_length': 256}
There were missing keys in the checkpoi

In [None]:
# `Trainer.push_to_hub` takes the commit message as its first positional
# argument; the destination repository comes from `args.hub_model_id` and
# otherwise falls back to the output directory name (`results_t5small`).
# Set the repository explicitly so the weights are uploaded to the same
# repository that the tokenizer upload and the inference cell use.
trainer.args.hub_model_id = "genre-t5-small-60k"
trainer.push_to_hub(commit_message="genre-t5-small-60k")

Non-default generation parameters: {'max_length': 256}


events.out.tfevents.1706347294.6d32e7e75490:   0%|          | 0.00/109k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/spidersouris/results_t5small/commit/1ae50ec5f608e20603e768f697bfae2d669148c6', commit_message='genre-t5-small-60k', commit_description='', oid='1ae50ec5f608e20603e768f697bfae2d669148c6', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Evaluate on the trainer's evaluation split (`tokenized_valid`) and display the metrics.
trainer.evaluate()

{'eval_loss': 0.006737459916621447,
 'eval_bleu': 95.7548,
 'eval_wer': 0.0246,
 'eval_gen_len': 62.3093,
 'eval_runtime': 686.6416,
 'eval_samples_per_second': 8.738,
 'eval_steps_per_second': 0.182,
 'epoch': 5.0}

In [None]:
# Upload the tokenizer files to the `genre-t5-small-60k` Hub repository.
tokenizer.push_to_hub("genre-t5-small-60k")

README.md:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/spidersouris/genre-t5-small-60k/commit/f9e6409b151d30271f042de13989a2b1f674679e', commit_message='Upload tokenizer', commit_description='', oid='f9e6409b151d30271f042de13989a2b1f674679e', pr_url=None, pr_revision=None, pr_num=None)

## Inference

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Reload the fine-tuned model and its tokenizer from the Hub for inference.
# This rebinds the `model` and `tokenizer` names used during training.
model = T5ForConditionalGeneration.from_pretrained("spidersouris/genre-t5-small-60k")
tokenizer = T5Tokenizer.from_pretrained("spidersouris/genre-t5-small-60k")

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/133 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def do_correction(text, model, tokenizer):
    """Rewrite one sentence into its inclusive form with a seq2seq model.

    Args:
        text: Source sentence. The task prefix used during fine-tuning is
            prepended here.
        model: Model exposing `generate`.
        tokenizer: Tokenizer matching `model`.

    Returns:
        The decoded first returned sequence as a string. It is also printed.
    """
    input_text = f"convert to inclusive sentence: {text}"

    # A single sentence needs no padding. Calling the tokenizer (instead of
    # `encode`) also yields the attention mask, so `generate` does not have to
    # infer it from pad tokens.
    encoded = tokenizer(
        input_text,
        return_tensors='pt',
        max_length=256,
        truncation=True
    )

    # Keep the inputs on the same device as the model (e.g. after model.to("cuda")).
    device = getattr(model, "device", None)
    if device is not None:
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Get correct sentence ids.
    corrected_ids = model.generate(
        **encoded,
        max_length=256,
        num_beams=5, # Beam search; `num_beams=1` would be greedy decoding unless sampling is enabled.
        early_stopping=True
    )

    # Decode.
    corrected_sentence = tokenizer.decode(
        corrected_ids[0],
        skip_special_tokens=True
    )
    print(corrected_sentence)
    return corrected_sentence

In [None]:
import csv

def get_sents(inp_file):
  """Read source sentences from a CSV file and generate their rewrites.

  The first row is treated as a header and skipped. For every following row,
  the second column is taken as the source sentence and passed to
  `do_correction` with the notebook-level `model` and `tokenizer`.

  Args:
    inp_file: Path of the comma-separated input file.

  Returns:
    A pair `(non_incl_sents, incl_sents)` of equally long lists.
  """
  non_incl_sents = []
  incl_sents = []
  # newline="" is what the csv module requires so that quoted fields containing
  # line breaks are parsed correctly.
  with open(inp_file, "r", encoding="utf8", newline="") as inp:
    reader = csv.reader(inp, delimiter=",")
    next(reader, None)  # skip the header row; tolerate an empty file

    for i, row in enumerate(reader):
      non_incl_sents.append(row[1])
      print(f"Correcting sentence {i}")
      incl_sents.append(do_correction(row[1], model, tokenizer))

  return non_incl_sents, incl_sents

def write_to_file(inp_file, out_file):
  """Rewrite every sentence of `inp_file` and save the pairs to `out_file`.

  Args:
    inp_file: CSV file read by `get_sents`.
    out_file: Destination CSV; written with the header
      `non_incl_sent,auto_incl_sent` followed by one row per sentence pair.
  """
  non_incl_sents, incl_sents = get_sents(inp_file)
  # newline="" stops the csv writer's own line terminators from being
  # translated again (which yields blank lines between rows on Windows).
  with open(out_file, "w", encoding="utf8", newline="") as out:
    writer = csv.writer(out, delimiter=",")
    writer.writerow(["non_incl_sent", "auto_incl_sent"])
    writer.writerows(zip(non_incl_sents, incl_sents))

In [None]:
# Rewrite every sentence of eupr_gen_eval.csv and save the source/output pairs
# to eupr_gen_t5-small-60k.csv.
write_to_file("eupr_gen_eval.csv", "eupr_gen_t5-small-60k.csv")

Correcting sentence 0
Le Conseil de la ministraille a également reconnu que le train de mesures convenu lors du Conseil d'octobre avait pour but de traiter les conséquences à court terme découlant en particulier de la crise de l'ESB, et pense que des changements supplémentaires en ce qui concerne les primes pour le boeuf sont nécessaires pour aligner les fournitures de boeuf au sein de l'Union sur le niveau de demande interne et externe probable à long terme.
Correcting sentence 1
Par écrit.-(PT) Tout en étant fondamentalement d'accord avec les positions et les préoccupations de la Commission en ce qui concerne la politique de communication, en particulier le besoin de transparence et d'engagement vis-à-vis de la citoyenneté, il demeure selon moi nécessaire de dire clairement qu'une politique de communication ne constitue pas une fin en soi, mais seulement un moyen.
Correcting sentence 2
Le 28 avril, lors de la conférence de la ministraille de l'intérieur de l'UE, le commissaire a soul

In [None]:
# Single-sentence check of the fine-tuned model.
sentence = "Les citoyens ont compris l'importance des voisins."

# do_correction prints the rewrite itself; the line below shows it next to the source.
corrected_sentence = do_correction(sentence, model, tokenizer)
print(f"ORIGINAL SENTENCE: {sentence}\nINCLUSIVE SENTENCE: {corrected_sentence}")

ORIGINAL SENTENCE: Les citoyens ont compris l'importance des voisins.
INCLUSIVE SENTENCE: La citoyenneté a compris l'importance du voisinage.


# m2m100_418M

In [None]:
# --- Run configuration for the m2m100_418M experiment ---
# These assignments rebind the constants used by the t5-small section.

# Checkpoint to fine-tune and the directory used for checkpoints/logs.
MODEL = 'facebook/m2m100_418M'
OUT_DIR = 'genre-m2m100_418M'

# Data pipeline: worker processes for `Dataset.map` and the token length that
# inputs and targets are truncated/padded to.
NUM_PROCS = 16
MAX_LENGTH = 256

# Optimisation hyper-parameters.
EPOCHS = 5
BATCH_SIZE = 8
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 2e-2

In [None]:
# Load the tokenizer that belongs to the MODEL checkpoint.
tokenizer = M2M100Tokenizer.from_pretrained(MODEL)

tokenizer_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

In [None]:
# The task rewrites French into French, so source and target language are both "fr".
tokenizer.src_lang = "fr"
tokenizer.tgt_lang = "fr"

In [None]:
# Function to convert text data into model inputs and targets
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs into encoder inputs and labels.

    Args:
        examples: Batch mapping with the columns `non_incl_sent` (source
            sentences) and `auto_incl_sent` (target sentences).

    Returns:
        The tokenizer output for the source sentences, with an added `labels`
        entry holding the target token ids. Both sides are truncated and
        padded to MAX_LENGTH; padded label positions are set to -100 so that
        they are ignored by the loss.
    """
    inputs = list(examples['non_incl_sent'])
    targets = list(examples["auto_incl_sent"])

    # `text_target` tokenizes the targets in target mode within the same call
    # and stores their ids under "labels"; it replaces the deprecated
    # `as_target_tokenizer()` context manager.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Labels are padded to a fixed length, so the padding must be masked out:
    # otherwise the loss is dominated by trivially predictable pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

In [None]:
# Load train.csv, drop the `src` column and keep the first 60,000 rows.
dataset_train = load_dataset("csv", data_files="train.csv", split='train').remove_columns("src").select(range(60000))

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [None]:
# Load valid.csv, drop the `src` column and keep the first 6,000 rows.
dataset_valid = load_dataset("csv", data_files="valid.csv", split='train').remove_columns("src").select(range(6000))

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [None]:
# Tokenize both splits batch-wise with preprocess_function, spreading the work
# over NUM_PROCS worker processes.
tokenized_train = dataset_train.map(preprocess_function, batched=True, num_proc=NUM_PROCS)
tokenized_valid = dataset_valid.map(preprocess_function, batched=True, num_proc=NUM_PROCS)

Map (num_proc=16):   0%|          | 0/60000 [00:00<?, ? examples/s]



Map (num_proc=16):   0%|          | 0/6000 [00:00<?, ? examples/s]



In [None]:
# Load the pretrained checkpoint and place it on the GPU when one is present.
model = M2M100ForConditionalGeneration.from_pretrained(MODEL)
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

# Count all parameters, then only those that receive gradients.
total_params = 0
for param in model.parameters():
    total_params += param.numel()
print(f"{total_params:,} total parameters.")

total_trainable_params = 0
for param in model.parameters():
    if param.requires_grad:
        total_trainable_params += param.numel()
print(f"{total_trainable_params:,} training parameters.")

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

483,905,536 total parameters.
483,905,536 training parameters.


In [None]:
import evaluate, numpy as np
# SacreBLEU metric used by compute_metrics below.
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Compute the SacreBLEU score for one evaluation run.

    Args:
        eval_preds: Pair `(preds, labels)` of token-id arrays. `preds` may be a
            tuple, in which case its first element is used.

    Returns:
        Dict with a single `bleu` entry holding the SacreBLEU score.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100s as we can't decode them. They mark ignored positions and
    # can appear in the predictions as well as in the labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: SacreBLEU expects a list of references per
    # prediction, so each label is wrapped in its own list.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
import os

from huggingface_hub import login, logout

# Never hard-code the access token in the notebook. Read it from the HF_TOKEN
# environment variable; when it is not set, `login` receives `None` and asks
# for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Training configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Output location, logging and Hub upload (checkpoints and logs share OUT_DIR).
    output_dir=OUT_DIR,
    logging_dir=OUT_DIR,
    logging_steps=10,
    report_to='tensorboard',
    push_to_hub=True,
    # Optimisation.
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=1500,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,
    dataloader_num_workers=4,
    # Evaluation and checkpointing, both every 1500 steps.
    do_train=True,
    do_eval=True,
    evaluation_strategy='steps',
    eval_steps=1500,
    save_steps=1500,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Evaluate on generated sequences so compute_metrics receives token ids.
    predict_with_generate=True,
)

# Trainer wiring: model, tokenizer, tokenized splits and the metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Run fine-tuning; the value returned by `train()` is kept in `m2m100`.
m2m100 = trainer.train()

Step,Training Loss,Validation Loss
1500,0.1257,0.095989
3000,0.1138,0.086417
4500,0.0745,0.065451
6000,0.0695,0.046442
7500,0.0926,0.044442
9000,0.0348,0.03481
10500,0.0626,0.03421
12000,0.0641,0.030483
13500,0.0249,0.028503
15000,0.0432,0.024506


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5}
'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: fc211071-546f-4e06-aac3-2931fa14e1f5)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/37/bf/37bfea7ae887c67ca790bb62fd3573d0b33bf94b741fccd71e7d6247fe673cb0/15a404f813ece3c638bd25807fa9ab991233e184c9b3c98e488c24c3e51cb90b?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQFN2FTF47%2F20240126%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240126T142217Z&X-Amz-Expires=86400&X-Amz-Signature=0506bf964768e377dba4034d34a4a2f9d200d62a3359b57a6c465b35e9c199a4&X-Amz-SignedHeaders=host&partNumber=1&uploadId=ezMEv7uwUg7SR13US.ZN.EmVQFNv4k657dycSNUGbs2sKxvRv.P5VTMyzb.QRgAAob7T3js0PxjLZMiCexrkLtaISyUzbd56CRaBMMQvYXsmXZVHppogx0RoVxLwJnQx&x-id=UploadPart
Retrying in 1s [Retry 1/5].
Non-default g

In [None]:
# Evaluate on the trainer's evaluation split (`tokenized_valid`) and print the metrics.
# NOTE(review): the recorded output below contains no `eval_bleu` entry even
# though compute_metrics is passed to the trainer — confirm the metric ran.
metrics=trainer.evaluate()
print(metrics)

{'eval_loss': 0.013214160688221455, 'eval_runtime': 39.5534, 'eval_samples_per_second': 151.694, 'eval_steps_per_second': 18.962, 'epoch': 5.0}


In [None]:
# Push the trained model to the Hub.
# NOTE(review): the positional argument is used as the commit message (see
# `commit_message` in the recorded output), not as the repository name; the
# upload lands in `genre-m2m100_418M` only because OUT_DIR has that name.
trainer.push_to_hub("genre-m2m100_418M")

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5}


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1706278590.b895321b9f5e:   0%|          | 0.00/607k [00:00<?, ?B/s]

events.out.tfevents.1706287330.b895321b9f5e:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/spidersouris/genre-m2m100_418M/commit/9d45e269606072fd3b7cec0bdee7b788d37c066c', commit_message='genre-m2m100_418M', commit_description='', oid='9d45e269606072fd3b7cec0bdee7b788d37c066c', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer files to the `genre-m2m100_418M` Hub repository.
tokenizer.push_to_hub("genre-m2m100_418M")

README.md:   0%|          | 0.00/2.60k [00:00<?, ?B/s]

events.out.tfevents.1706287330.b895321b9f5e:   0%|          | 0.00/316 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/spidersouris/genre-m2m100_418M/commit/465277b3ce4c99454d274458becce4911010f1b7', commit_message='Upload tokenizer', commit_description='', oid='465277b3ce4c99454d274458becce4911010f1b7', pr_url=None, pr_revision=None, pr_num=None)

## Inference

In [None]:
# Reload the fine-tuned model and its tokenizer from the Hub for inference.
# This rebinds the `model` and `tokenizer` names used during training.
# NOTE(review): `src_lang`/`tgt_lang` are not set again on the reloaded
# tokenizer — confirm the saved tokenizer configuration still carries "fr".
model = M2M100ForConditionalGeneration.from_pretrained("spidersouris/genre-m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("spidersouris/genre-m2m100_418M")

config.json:   0%|          | 0.00/931 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/19.8k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def do_correction(text, model, tokenizer):
    """Rewrite one sentence into its inclusive form with a seq2seq model.

    Args:
        text: Source sentence; it is passed to the tokenizer without a prefix.
        model: Model exposing `generate`.
        tokenizer: Tokenizer matching `model`.

    Returns:
        The decoded first returned sequence as a string. It is also printed.
    """
    input_text = text

    # A single sentence needs no padding. Calling the tokenizer (instead of
    # `encode`) also yields the attention mask, so `generate` does not have to
    # infer it from pad tokens.
    encoded = tokenizer(
        input_text,
        return_tensors='pt',
        max_length=256,
        truncation=True
    )

    # Keep the inputs on the same device as the model (e.g. after model.to("cuda")).
    device = getattr(model, "device", None)
    if device is not None:
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Get correct sentence ids.
    # NOTE(review): the M2M100 documentation passes
    # `forced_bos_token_id=tokenizer.get_lang_id(<target language>)` to
    # `generate`; it is not set here — confirm the fine-tuned model reliably
    # starts with the French language token on its own.
    corrected_ids = model.generate(
        **encoded,
        max_length=256,
        num_beams=5, # Beam search; `num_beams=1` would be greedy decoding unless sampling is enabled.
        early_stopping=True
    )

    # Decode.
    corrected_sentence = tokenizer.decode(
        corrected_ids[0],
        skip_special_tokens=True
    )
    print("Corrected sentence:", corrected_sentence)
    return corrected_sentence

In [None]:
import csv

def get_sents(inp_file):
  """Read source sentences from a CSV file and generate their rewrites.

  The first row is treated as a header and skipped. For every following row,
  the second column is taken as the source sentence and passed to
  `do_correction` with the notebook-level `model` and `tokenizer`.

  Args:
    inp_file: Path of the comma-separated input file.

  Returns:
    A pair `(non_incl_sents, incl_sents)` of equally long lists.
  """
  non_incl_sents = []
  incl_sents = []
  # newline="" is what the csv module requires so that quoted fields containing
  # line breaks are parsed correctly.
  with open(inp_file, "r", encoding="utf8", newline="") as inp:
    reader = csv.reader(inp, delimiter=",")
    next(reader, None)  # skip the header row; tolerate an empty file

    for i, row in enumerate(reader):
      non_incl_sents.append(row[1])
      print(f"Correcting sentence {i}")
      incl_sents.append(do_correction(row[1], model, tokenizer))

  return non_incl_sents, incl_sents

def write_to_file(inp_file, out_file):
  """Rewrite every sentence of `inp_file` and save the pairs to `out_file`.

  Args:
    inp_file: CSV file read by `get_sents`.
    out_file: Destination CSV; written with the header
      `non_incl_sent,auto_incl_sent` followed by one row per sentence pair.
  """
  non_incl_sents, incl_sents = get_sents(inp_file)
  # newline="" stops the csv writer's own line terminators from being
  # translated again (which yields blank lines between rows on Windows).
  with open(out_file, "w", encoding="utf8", newline="") as out:
    writer = csv.writer(out, delimiter=",")
    writer.writerow(["non_incl_sent", "auto_incl_sent"])
    writer.writerows(zip(non_incl_sents, incl_sents))

In [None]:
# Rewrite every sentence of wiki_gen_eval.csv and save the source/output pairs
# to wiki_gen_m2m100.csv.
write_to_file("wiki_gen_eval.csv", "wiki_gen_m2m100.csv")

Correcting sentence 0
Corrected sentence: L'histoire se déroule dans un monde où l'humanité vit isolée du reste du monde par trois murs impénétrables leur permettant de se défendre contre les Titans, de gigantesques humanoïdes qui dévorent l'humanité sans raison apparente.
Correcting sentence 1
Corrected sentence: La plupart des coups de feu essuyés par la police ont donc été le fait de leurs propres collègues... ».
Correcting sentence 2
Corrected sentence: Yves Le polovre, l'auteur de la Terre de la prêtraille, fut juge à Plouescat entre 1910 et 1917.
Correcting sentence 3
Corrected sentence: Shere Khan retourne affronter Akela et demandant que Mowgli lui soit remis, tuant Akela en le jetant d'une falaise quand il apprend que Mowgli est sur le chemin du village de l'humanité.
Correcting sentence 4
Corrected sentence: Ces référendums ont lieu à l'initiative du président qui en fixe les questions posées ainsi que la date d'organisation, sous réserve d'un vote favorable à la majorité abs