# Installing libraries

In [1]:
!pip install transformers datasets torch

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl (40.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install accelerate -U

Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [3]:
pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Importing libraries and reading data

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Trainer, TrainingArguments,  AdamW

In [2]:
train_df = pd.read_csv('train.csv')
eval_df = pd.read_csv('test.csv')

train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)


In [3]:
len(eval_df)

24423

In [20]:
print("Train Dataset Sample:", train_df.head())
print("Test Dataset Sample:", eval_df.head())

Train Dataset Sample:                                                 data  \
0  In einem vollkommenen Welt würde es sein , das...   
1  Dennoch ist das sowohl unmöglich als auch unre...   
2  Auf dem ersten Blick sollte man glauben , dass...   
3  Die Leute , dessen Arbeit wichtiger für die Ge...   
4  Zum Beispiel spielen Ärzte , Lehrer , und Baue...   

                                          correction  \
0  In einer vollkommenen Welt wäre es so , dass d...   
1  Dennoch ist das sowohl unmöglich als auch unre...   
2  Auf den ersten Blick sollte man glauben , dass...   
3  Die Leute , deren Arbeit wichtiger für die Ges...   
4  Zum Beispiel spielen Ärzte , Lehrer und Bauern...   

                                         annotations  
0  [{'span': '1 2', 'error_type': 'R:DET:FORM', '...  
1  [{'span': '-1 -1', 'error_type': 'noop', 'sugg...  
2  [{'span': '1 2', 'error_type': 'R:DET:FORM', '...  
3  [{'span': '3 4', 'error_type': 'R:DET:FORM', '...  
4  [{'span': '6 7', 'error_ty

# Loading model and tokenizer

In [4]:
# Load the tokenizer and model
model_name = 'google/mt5-base'
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  return self.fget.__get__(instance, owner)()


# Tokenizing the data

In [5]:
# Tokenize the data
def preprocess_data(examples):
    inputs = [ex for ex in examples['data']]
    targets = [ex for ex in examples['correction']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

train_dataset = train_dataset.map(preprocess_data, batched=True)
test_dataset = eval_dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/107259 [00:00<?, ? examples/s]



Map:   0%|          | 0/24423 [00:00<?, ? examples/s]

# Defining training arguments

In [6]:

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results4',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1000,
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch", 
    gradient_accumulation_steps=1,
    save_total_limit=1,
    optim="adamw_torch",
    learning_rate=5e-6,
    max_grad_norm=1.0  # Gradient clipping# Specify the optimizer
)

# Initializing trainer

In [7]:

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(AdamW(model.parameters(), lr=5e-6), None)  # Specify the optimizer
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


# Training the model

In [8]:
from transformers import logging
logging.set_verbosity_debug()  # or set_verbosity_info() for less verbose output


In [None]:
# Train the model
trainer.train()

Currently training with a batch size of: 4
The following columns in the training set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: correction, data, annotations. If correction, data, annotations are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 107,259
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 80,445
  Number of trainable parameters = 582,401,280


Epoch,Training Loss,Validation Loss
1,0.0506,0.017832


The following columns in the evaluation set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: correction, data, annotations. If correction, data, annotations are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 24423
  Batch size = 4
Saving model checkpoint to ./results4/tmp-checkpoint-26815
Configuration saved in ./results4/tmp-checkpoint-26815/config.json
Configuration saved in ./results4/tmp-checkpoint-26815/generation_config.json
Model weights saved in ./results4/tmp-checkpoint-26815/model.safetensors
The following columns in the evaluation set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: correction, data, annotations. If correction, data, annotations are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num example

In [15]:
# Save the model
model.save_pretrained('./mt5-finetuned')
tokenizer.save_pretrained('./mt5-finetuned')

Configuration saved in ./mt5-finetuned/config.json
Configuration saved in ./mt5-finetuned/generation_config.json
Model weights saved in ./mt5-finetuned/model.safetensors
tokenizer config file saved in ./mt5-finetuned/tokenizer_config.json
Special tokens file saved in ./mt5-finetuned/special_tokens_map.json


('./mt5-finetuned/tokenizer_config.json',
 './mt5-finetuned/special_tokens_map.json',
 './mt5-finetuned/spiece.model',
 './mt5-finetuned/added_tokens.json')

In [18]:
import pandas as pd
import ast
from datasets import Dataset, load_metric
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Trainer, TrainingArguments, AdamW
from sklearn.metrics import precision_recall_fscore_support

# Load your data (assuming it's saved as CSV files with columns 'data', 'correction', 'annotation')
train_data_path = 'train.csv'
test_data_path = 'test.csv'

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

train_df = train_df.head(100)
test_df = test_df.head(20)
# Process the data
train_df = train_df[['data', 'correction', 'annotations']]
train_df.columns = ['input', 'target', 'annotations']

test_df = test_df[['data', 'correction', 'annotations']]
test_df.columns = ['input', 'target', 'annotations']

# Convert to Hugging Face dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load the tokenizer and model
model_name = 'google/mt5-small'
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the data
def preprocess_data(examples):
    inputs = [ex for ex in examples['input']]
    targets = [ex for ex in examples['target']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

train_dataset = train_dataset.map(preprocess_data, batched=True)
test_dataset = test_dataset.map(preprocess_data, batched=True)


# Define training arguments
training_args = TrainingArguments(
    output_dir='./results_2',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",  # Evaluate after each epoch
    save_strategy="epoch", 
    save_total_limit=1,
    optim="adamw_torch"  # Specify the optimizer
)

# Define a function to compute metrics
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions.argmax(-1)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    precision_list = []
    recall_list = []
    f0_5_list = []

    for pred_sentence, label_sentence, annotation in zip(pred_str, labels_str, test_df['annotation']):
        annotation = ast.literal_eval(annotation)
        error_spans = [ann['span'] for ann in annotation]

        pred_tokens = pred_sentence.split()
        label_tokens = label_sentence.split()

        tp, fp, fn = 0, 0, 0

        for span in error_spans:
            start, end = map(int, span.split())
            error_in_pred = " ".join(pred_tokens[start:end])
            error_in_label = " ".join(label_tokens[start:end])

            if error_in_pred == error_in_label:
                tp += 1
            else:
                fn += 1

        for i in range(len(pred_tokens)):
            if pred_tokens[i] != label_tokens[i] and i not in [int(span.split()[0]) for span in error_spans]:
                fp += 1

        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f0_5 = (1 + 0.5**2) * precision * recall / (0.5**2 * precision + recall) if precision + recall > 0 else 0

        precision_list.append(precision)
        recall_list.append(recall)
        f0_5_list.append(f0_5)

    return {
        'precision': sum(precision_list) / len(precision_list),
        'recall': sum(recall_list) / len(recall_list),
        'f0_5': sum(f0_5_list) / len(f0_5_list)
    }

# Initialize the Trainer with Adam optimizer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(AdamW(model.parameters(), lr=5e-5), None)  # Specify the optimizer
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('./mt5-finetuned_2')
tokenizer.save_pretrained('./mt5-finetuned_2')


loading file spiece.model from cache at /home/jovyan/.cache/huggingface/hub/models--google--mt5-small/snapshots/73fb5dbe4756edadc8fbe8c769b0a109493acf7a/spiece.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/jovyan/.cache/huggingface/hub/models--google--mt5-small/snapshots/73fb5dbe4756edadc8fbe8c769b0a109493acf7a/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/jovyan/.cache/huggingface/hub/models--google--mt5-small/snapshots/73fb5dbe4756edadc8fbe8c769b0a109493acf7a/tokenizer_config.json
loading file tokenizer.json from cache at None
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--google--mt5-small/snapshots/73fb5dbe4756edadc8fbe8c769b0a109493acf7a/config.json
Model config MT5Config {
  "_name_or_path": "google/mt5-small",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 1024,
  "d_kv": 64,
  "d

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



Map:   0%|          | 0/20 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Currently training with a batch size of: 8
The following columns in the training set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: input, target, annotations. If input, target, annotations are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 39
  Number of trainable parameter

Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: input, target, annotations. If input, target, annotations are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8


AttributeError: 'tuple' object has no attribute 'argmax'