In [1]:
import json
import numpy as np
import pandas as pd
from functools import partial
from transformers import RobertaTokenizer, RobertaForTokenClassification,AutoTokenizer,AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import evaluate
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled corpus into a DataFrame; the relative path is resolved
# against the notebook's working directory.
file_path = 'train.json'
all_df = pd.read_json(file_path)

In [3]:
# Preview the first three rows to check the loaded columns.
all_df.head(3)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."


In [4]:
import transformers

# Record the installed transformers version next to the results below.
print(transformers.__version__)

4.28.1


In [5]:
# Tag inventory: 'O' first, then a B-/I- pair per entity type.
# The order is load-bearing: every B- tag sits at an odd index with its I- tag
# right after it, which align_labels_with_tokens relies on (odd id -> id + 1).
label_list = ['O','B-NAME_STUDENT','I-NAME_STUDENT','B-EMAIL','I-EMAIL','B-USERNAME','I-USERNAME','B-ID_NUM','I-ID_NUM','B-PHONE_NUM',
              'I-PHONE_NUM','B-URL_PERSONAL','I-URL_PERSONAL','B-STREET_ADDRESS','I-STREET_ADDRESS']

In [6]:
# Two-way lookup tables between integer class ids and tag strings.
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

In [13]:
def mapped_labels(labels):
    """Convert one document's tag strings into integer class ids.

    Args:
        labels: sequence of tag strings for a single document
            (e.g. ``['O', 'B-NAME_STUDENT', ...]``).

    Returns:
        list with one entry per input tag, looked up in the notebook-level
        ``label2id`` dict. Following ``pandas.Series.map`` semantics, a tag
        that is missing from ``label2id`` becomes NaN instead of raising.
    """
    return pd.Series(labels).map(label2id).tolist()


# Alias kept so that any cell still using the older name
# `create_mapped_labels` keeps working.
create_mapped_labels = mapped_labels

# The original call referenced `create_mapped_labels`, which was never
# defined and raised NameError on a fresh kernel.
all_df['mapped_labels'] = all_df['labels'].apply(mapped_labels)

In [14]:
# Encode every document's tag sequence as integer ids using the
# `mapped_labels` helper defined in the previous cell (the name
# `create_mapped_labels` used here before was never defined).
all_df['mapped_labels'] = all_df['labels'].apply(mapped_labels)
all_df.head(3)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,mapped_labels,is_labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, ...",True
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...","[0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True


In [15]:
# Hold out 10% of the documents; the fixed random_state keeps the split
# identical between runs.
train_set, test_set = train_test_split(all_df, test_size=0.1, shuffle=True, random_state=1278)

print("train data rows:", train_set.shape[0])
print("test data rows:", test_set.shape[0])

train data rows: 6126
test data rows: 681


In [16]:
def create_dataset(df):
    """Build a ``Dataset`` from the six columns of ``df`` used downstream.

    Args:
        df: table-like object indexable by column name; each selected column
            is iterated and copied into a plain list.

    Returns:
        The object returned by ``Dataset.from_dict`` with the keys
        document, full_text, tokens, trailing_whitespace, labels and
        mapped_labels (in that order).
    """
    wanted_columns = (
        'document',
        'full_text',
        'tokens',
        'trailing_whitespace',
        'labels',
        'mapped_labels',
    )
    return Dataset.from_dict({column: list(df[column]) for column in wanted_columns})

In [17]:
# Wrap the two splits produced by train_test_split as Dataset objects.
# The original cell referenced `train1` and `valid`, which are never defined
# in this notebook; the 6126 / 681 row counts reported by the map step match
# `train_set` and `test_set`.
train_ds = create_dataset(train_set)
valid_ds = create_dataset(test_set)

In [18]:
# Candidate checkpoints compared in this experiment. Exactly one assignment
# must be active, otherwise `model_checkpoint` is undefined in the cells
# below. The active one matches the checkpoint named in the recorded model
# loading log of this run.
# model_checkpoint = "albert/albert-base-v2"
# model_checkpoint = 'FacebookAI/roberta-base'
model_checkpoint = "ArBert/albert-base-v2-finetuned-ner"
# model_checkpoint = "google-bert/bert-base-uncased"
# model_checkpoint = "microsoft/deberta-v3-base"
# model_checkpoint='distilbert/distilbert-base-uncased'

In [19]:
# Load the tokenizer that belongs to the selected checkpoint.
# NOTE(review): the RoBERTa section later in this notebook passes
# add_prefix_space=True; this call does not. Confirm whether that is needed
# before switching model_checkpoint to a RoBERTa model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [20]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per sub-token.

    Args:
        labels: integer label ids, indexed by word position.
        word_ids: for each sub-token, the index of the word it came from,
            or ``None`` for tokens that belong to no word.

    Returns:
        list of the same length as ``word_ids``: ``-100`` where the word id
        is ``None``, the word's label for the first sub-token of a word, and
        for following sub-tokens of the same word the label with odd ids
        bumped by one (B-XXX -> I-XXX in this notebook's label order).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Token without a source word: mark it with -100.
            previous_word = None
            aligned.append(-100)
            continue

        tag = labels[word_id]
        if word_id == previous_word:
            # Continuation piece of the current word: a B- tag becomes I-.
            if tag % 2 == 1:
                tag += 1
        else:
            # First piece of a new word keeps the word's own tag.
            previous_word = word_id
        aligned.append(tag)

    return aligned

In [21]:
def tokenize_and_align_labels(examples):
    """Tokenise a batch of pre-split documents and attach aligned labels.

    Args:
        examples: batch mapping with a ``"tokens"`` entry (one word list per
            document) and a ``"mapped_labels"`` entry (one label-id list per
            document).

    Returns:
        The tokenizer output (truncated to 512 tokens) with an added
        ``"labels"`` entry holding, per document, the result of
        ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, max_length=512
    )
    encoded["labels"] = [
        align_labels_with_tokens(doc_labels, encoded.word_ids(row))
        for row, doc_labels in enumerate(examples["mapped_labels"])
    ]
    return encoded

In [22]:
# Tokenise both splits; batched=True hands tokenize_and_align_labels a batch
# of examples per call instead of a single example.
tokenized_train = train_ds.map(
    tokenize_and_align_labels,
    batched=True
)
tokenized_valid = valid_ds.map(
    tokenize_and_align_labels,
    batched=True
)

Map:   0%|          | 0/6126 [00:00<?, ? examples/s]

Map:   0%|          | 0/681 [00:00<?, ? examples/s]

In [23]:
# Collator for token classification batches; pad_to_multiple_of=16 asks for
# padded lengths that are multiples of 16.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, pad_to_multiple_of=16)

In [24]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

def compute_metrics(p, label_list):
    """Score predictions with seqeval precision/recall and an F-beta value.

    Args:
        p: pair ``(predictions, labels)``; predictions are reduced with
            argmax over axis 2 (assumes per-token class scores shaped
            (batch, sequence, classes) -- TODO confirm against caller).
        label_list: tag strings indexed by class id.

    Returns:
        dict with keys ``'precision'``, ``'recall'`` and ``'f1'``. Despite
        its name, ``'f1'`` holds the F-beta score with beta = 5
        (``26 * R * P / (25 * P + R)``), or 0 when both inputs are 0.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions whose gold label is not the ignore index (-100).
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    beta_squared = 5 * 5
    if precision == 0 and recall == 0:
        # Avoid 0/0 when nothing was predicted correctly.
        f_beta = 0
    else:
        f_beta = (1 + beta_squared) * recall * precision / (beta_squared * precision + recall)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f_beta,
    }

In [25]:
# Load the checkpoint with a 15-way token classification head (15 is the
# length of label_list). ignore_mismatched_sizes=True allows loading a
# checkpoint whose classifier has a different number of classes; the log
# below shows the 9-class head being replaced by a newly initialised one.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=15,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at ArBert/albert-base-v2-finetuned-ner and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([15, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([15]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Training configuration: 3 epochs, batch size 1 for both training and
# evaluation, with evaluation and a checkpoint save after every epoch.
# NOTE(review): no metric_for_best_model is given, so the "best" checkpoint
# reloaded at the end is presumably chosen by eval loss -- confirm in the
# TrainingArguments documentation.
training_args = TrainingArguments(
    output_dir='output',
    learning_rate=2.5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.02,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    report_to='none'
)

In [27]:
def make_trainer(train, valid):
    """Assemble a ``Trainer`` from the notebook-level model and settings.

    Args:
        train: dataset passed as ``train_dataset``.
        valid: dataset passed as ``eval_dataset``.

    Returns:
        The configured ``Trainer``; nothing is trained or evaluated here.
    """
    # Pre-bind label_list so the metric callback only needs the predictions.
    metrics_fn = partial(compute_metrics, label_list=label_list)
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=metrics_fn,
    )

In [28]:
# Baseline: evaluate the freshly loaded model on the held-out split before
# any training in this notebook.
trainer = make_trainer(tokenized_train, tokenized_valid)
untrained_eval_results = trainer.evaluate(tokenized_valid)
print('pretrained model non-finetuned results: ',untrained_eval_results)

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


pretrained model non-finetuned results:  {'eval_loss': 2.8416874408721924, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0, 'eval_runtime': 16.5265, 'eval_samples_per_second': 41.206, 'eval_steps_per_second': 41.206}


In [30]:
# Build a new Trainer around the same notebook-level `model` and start training.
trainer = make_trainer(tokenized_train, tokenized_valid)
print("Starting training...")
trainer.train()

Starting training...


################################ Further enhancement of the best-performing model: RoBERTa ################################

In [2]:
## Load the original training set (the same file used in the fine-tuning section above)
file_path = 'train.json'
all_df = pd.read_json(file_path)
all_df.head(3)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."


In [3]:
# Recreate the same train/test split as in the fine-tuning section above
# (same test_size and random_state).
train_set, test_set = train_test_split(all_df, test_size=0.1, shuffle=True, random_state=1278)
print("train data rows:", train_set.shape[0])
print("test data rows:", test_set.shape[0])

train data rows: 6126
test data rows: 681


In [20]:
### Load the external dataset generated by an LLM ###
from ast import literal_eval
# literal_eval turns the string cells of the three list-valued columns back
# into Python objects while the CSV is read.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# not run elsewhere without editing it.
df_train_external = pd.read_csv('C:\\Users\\Admin\\Downloads\\pii_dataset.csv', converters={
    'tokens': literal_eval,
    'labels': literal_eval,
    'trailing_whitespace': literal_eval
})
df_train_external.rename(columns={'text': 'full_text'}, inplace=True) ## keep the same column name as the original dataset
# Keeps the first five columns by position -- assumes they are document,
# full_text, tokens, trailing_whitespace, labels (as in the displayed output);
# TODO confirm against the CSV header.
df_train_external = df_train_external.iloc[:, :5]  ## only select the necessary columns
df_train_external

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,1073d46f-2241-459b-ab01-851be8d26436,"My name is Aaliyah Popova, and I am a jeweler ...","[My, name, is, Aaliyah, Popova,, and, I, am, a...","[True, True, True, True, True, True, True, Tru...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O..."
1,5ec717a9-17ee-48cd-9d76-30ae256c9354,"My name is Konstantin Becker, and I'm a develo...","[My, name, is, Konstantin, Becker,, and, I'm, ...","[True, True, True, True, True, True, True, Tru...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O..."
2,353da41e-7799-4071-ab20-d959b362612e,"As Mieko Mitsubishi, an account manager at a p...","[As, Mieko, Mitsubishi,, an, account, manager,...","[True, True, True, True, True, True, True, Tru...","[O, B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O..."
3,9324ee01-7bdc-41b1-a7a5-01307f72c20d,"My name is Kazuo Sun, and I'm an air traffic c...","[My, name, is, Kazuo, Sun,, and, I'm, an, air,...","[True, True, True, True, True, True, True, Tru...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O..."
4,971fe266-2739-4f1b-979b-7f64e07d5a4a,"My name is Arina Sun, and I'm a dental hygieni...","[My, name, is, Arina, Sun,, and, I'm, a, denta...","[True, True, True, True, True, True, True, Tru...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O..."
...,...,...,...,...,...
4429,a90b6b71-0e77-4089-a3a4-9a5b5c5aec78,"Hello, I'm Nicholas Moore, a man with a rich t...","[Hello,, I'm, Nicholas, Moore,, a, man, with, ...","[True, True, True, True, True, True, True, Tru...","[O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O, O..."
4430,1492ed0e-f162-424f-9c40-f3edde790ca1,"Hello, my name is Alexey Novikov and I'm a psy...","[Hello,, my, name, is, Alexey, Novikov, and, I...","[True, True, True, True, True, True, True, Tru...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
4431,57ef34c1-48db-4413-9573-774021e57f63,"My name is Ludmila Inoue, and I'm a person wit...","[My, name, is, Ludmila, Inoue,, and, I'm, a, p...","[True, True, True, True, True, True, True, Tru...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O..."
4432,a4486627-1c62-48b0-bf4f-53259ecc0a28,"Dr. Tu Garcia, a renowned dermatologist, embar...","[Dr., Tu, Garcia,, a, renowned, dermatologist,...","[True, True, True, True, True, True, True, Tru...","[O, B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O..."


In [31]:
# Replace the external documents' ids with integers that continue after the
# largest document id found in the training split.
train_set.document = train_set.document.astype(int)
max_doc_id = train_set.document.max()
print("Last doc id used:", max_doc_id)
# NOTE(review): the maximum is taken over train_set only; ids in test_set are
# not considered -- confirm the new ids cannot collide with them.
# Assigning a Series aligns on the index, so this relies on
# df_train_external having a default 0..n-1 index.
df_train_external.document = pd.Series(np.arange(max_doc_id+1, max_doc_id+1+len(df_train_external)))

Last doc id used: 22687


In [32]:
# Display the external frame to check the reassigned document ids.
df_train_external

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,22688,"My name is Aaliyah Popova, and I am a jeweler ...","[My, name, is, Aaliyah, Popova,, and, I, am, a...","[True, True, True, True, True, True, True, Tru...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O..."
1,22689,"My name is Konstantin Becker, and I'm a develo...","[My, name, is, Konstantin, Becker,, and, I'm, ...","[True, True, True, True, True, True, True, Tru...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O..."
2,22690,"As Mieko Mitsubishi, an account manager at a p...","[As, Mieko, Mitsubishi,, an, account, manager,...","[True, True, True, True, True, True, True, Tru...","[O, B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O..."
3,22691,"My name is Kazuo Sun, and I'm an air traffic c...","[My, name, is, Kazuo, Sun,, and, I'm, an, air,...","[True, True, True, True, True, True, True, Tru...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O..."
4,22692,"My name is Arina Sun, and I'm a dental hygieni...","[My, name, is, Arina, Sun,, and, I'm, a, denta...","[True, True, True, True, True, True, True, Tru...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O..."
...,...,...,...,...,...
4429,27117,"Hello, I'm Nicholas Moore, a man with a rich t...","[Hello,, I'm, Nicholas, Moore,, a, man, with, ...","[True, True, True, True, True, True, True, Tru...","[O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O, O..."
4430,27118,"Hello, my name is Alexey Novikov and I'm a psy...","[Hello,, my, name, is, Alexey, Novikov, and, I...","[True, True, True, True, True, True, True, Tru...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
4431,27119,"My name is Ludmila Inoue, and I'm a person wit...","[My, name, is, Ludmila, Inoue,, and, I'm, a, p...","[True, True, True, True, True, True, True, Tru...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O..."
4432,27120,"Dr. Tu Garcia, a renowned dermatologist, embar...","[Dr., Tu, Garcia,, a, renowned, dermatologist,...","[True, True, True, True, True, True, True, Tru...","[O, B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O..."


In [33]:
# Append the external documents to the training split only; the held-out
# test_set is not part of this concatenation.
# The original row indices of both frames are kept (no ignore_index).
print('original train set: ', len(train_set))
train_set_with_external_samples = pd.concat([train_set, df_train_external], axis=0)
print('combined train set: ', len(train_set_with_external_samples))

original train set:  6126
combined train set:  10560


In [34]:
# Tag inventory: 'O' first, then a B-/I- pair per entity type.
# The order is load-bearing: every B- tag sits at an odd index with its I- tag
# right after it, which align_labels_with_tokens relies on (odd id -> id + 1).
label_list = ['O','B-NAME_STUDENT','I-NAME_STUDENT','B-EMAIL','I-EMAIL','B-USERNAME','I-USERNAME','B-ID_NUM','I-ID_NUM','B-PHONE_NUM',
              'I-PHONE_NUM','B-URL_PERSONAL','I-URL_PERSONAL','B-STREET_ADDRESS','I-STREET_ADDRESS']

In [35]:
# Two-way lookup tables between integer class ids and tag strings.
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

In [36]:
def mapped_labels(labels):
    """Convert one document's tag strings into integer class ids.

    Args:
        labels: list of tag strings for a single document.

    Returns:
        list with one entry per input tag, looked up in the notebook-level
        ``label2id`` dict via ``Series.map``.
    """
    frame = pd.DataFrame({'mapped_labels': labels})
    encoded = frame['mapped_labels'].map(label2id)
    return encoded.tolist()

In [37]:
# Encode the tag sequences of the combined training frame as integer ids.
train_set_with_external_samples['mapped_labels'] = train_set_with_external_samples['labels'].apply(mapped_labels)
train_set_with_external_samples.head(3)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,mapped_labels
1138,9924,ADNOC Classification: Internal\n\nReflection –...,"[ADNOC, Classification, :, Internal, \n\n, Ref...","[True, False, True, False, False, True, True, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3499,15034,Final Assignment\n\nChallenge & Selection\n\nW...,"[Final, Assignment, \n\n, Challenge, &, Select...","[True, False, False, True, True, False, False,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6490,22000,Challenge & Selection Part of my role as Pro...,"[Challenge, &, Selection, , Part, of, my, ro...","[True, True, True, False, True, True, True, Tr...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [38]:
# Encode the tag sequences of the held-out frame the same way.
test_set['mapped_labels'] = test_set['labels'].apply(mapped_labels)
test_set.head(3)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,mapped_labels
514,8266,Mohammed Isah 11-08-2020\n\nReflection – St...,"[Mohammed, Isah, , 11, -, 08, -, 2020, \n\n...","[True, True, False, False, False, False, False...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6798,22672,Challenge: I am working as an acting Team L...,"[Challenge, :, , I, am, working, as, an, ac...","[False, True, False, True, True, True, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3753,15685,REFLECTION\n\n1. Challenge\n\nI work at an Ele...,"[REFLECTION, \n\n, 1, ., Challenge, \n\n, I, w...","[False, False, False, True, False, False, True...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [39]:
# Flag documents that contain at least one non-'O' tag and keep only those
# for training; documents with no tagged entity are dropped from the
# training frame (test_set is not filtered here).
train_set_with_external_samples['is_labels'] = train_set_with_external_samples['labels'].apply(lambda labels: any(label != 'O' for label in labels))
train_set_with_external_samples_pos = train_set_with_external_samples[train_set_with_external_samples['is_labels'] == True]
print(len(train_set_with_external_samples_pos), "left after filtering.")

5272 left after filtering.


In [40]:
def create_dataset(df):
    """Build a ``Dataset`` from the six columns of ``df`` used downstream.

    Args:
        df: table-like object indexable by column name; each selected column
            is iterated and copied into a plain list.

    Returns:
        The object returned by ``Dataset.from_dict`` with the keys
        document, full_text, tokens, trailing_whitespace, labels and
        mapped_labels (in that order).
    """
    columns = {}
    for name in ('document', 'full_text', 'tokens',
                 'trailing_whitespace', 'labels', 'mapped_labels'):
        columns[name] = list(df[name])
    return Dataset.from_dict(columns)

In [41]:
# Convert the filtered training frame and the held-out frame to Dataset objects.
# NOTE: this rebinds `train_set` / `test_set` from DataFrames to Datasets;
# the cells below use the Dataset versions.
train_set = create_dataset(train_set_with_external_samples_pos)
test_set = create_dataset(test_set)

In [42]:
### Select the best fine-tuned model from the comparative experiment above.
### RoBERTa performed best.
# NOTE(review): the path is absolute and machine-specific.
model_path = 'C:\\Users\\Admin\\Downloads\\output\\Roberta\\roberta_best'
# add_prefix_space=True is passed because the tokenizer is later called with
# is_split_into_words=True -- presumably required by the RoBERTa tokenizer
# for pre-split input; confirm in the tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained(model_path,add_prefix_space=True)

In [43]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per sub-token.

    Args:
        labels: integer label ids, indexed by word position.
        word_ids: for each sub-token, the index of the word it came from,
            or ``None`` for tokens that belong to no word.

    Returns:
        list of the same length as ``word_ids``: ``-100`` where the word id
        is ``None``, the word's label for the first sub-token of a word, and
        for following sub-tokens of the same word the label with odd ids
        bumped by one (B-XXX -> I-XXX in this notebook's label order).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Token without a source word: mark it with -100.
            previous_word = None
            aligned.append(-100)
            continue

        tag = labels[word_id]
        if word_id == previous_word:
            # Continuation piece of the current word: a B- tag becomes I-.
            if tag % 2 == 1:
                tag += 1
        else:
            # First piece of a new word keeps the word's own tag.
            previous_word = word_id
        aligned.append(tag)

    return aligned

In [44]:
def tokenize_and_align_labels(examples):
    """Tokenise a batch of pre-split documents and attach aligned labels.

    Args:
        examples: batch mapping with a ``"tokens"`` entry (one word list per
            document) and a ``"mapped_labels"`` entry (one label-id list per
            document).

    Returns:
        The tokenizer output (truncated to 512 tokens) with an added
        ``"labels"`` entry holding, per document, the result of
        ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, max_length=512
    )
    encoded["labels"] = [
        align_labels_with_tokens(doc_labels, encoded.word_ids(row))
        for row, doc_labels in enumerate(examples["mapped_labels"])
    ]
    return encoded

In [45]:
# Tokenise the augmented training set and the held-out set; batched=True
# hands tokenize_and_align_labels a batch of examples per call.
tokenized_train = train_set.map(
    tokenize_and_align_labels,
    batched=True
)
tokenized_valid = test_set.map(
    tokenize_and_align_labels,
    batched=True
)


Map:   0%|          | 0/5272 [00:00<?, ? examples/s]

Map:   0%|          | 0/681 [00:00<?, ? examples/s]

In [46]:
# Collator for token classification batches; pad_to_multiple_of=16 asks for
# padded lengths that are multiples of 16.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, pad_to_multiple_of=16)

In [47]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

def compute_metrics(p, label_list):
    """Score predictions with seqeval recall/precision and an F-beta value.

    Args:
        p: pair ``(predictions, labels)``; predictions are reduced with
            argmax over axis 2 (assumes per-token class scores shaped
            (batch, sequence, classes) -- TODO confirm against caller).
        label_list: tag strings indexed by class id.

    Returns:
        dict with keys ``'recall'``, ``'precision'`` and ``'f1'``. Despite
        its name, ``'f1'`` holds the F-beta score with beta = 5
        (``26 * R * P / (25 * P + R)``), or 0 when both inputs are 0.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Keep only positions whose gold label is not the ignore index (-100).
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    beta_squared = 5 * 5
    if precision == 0 and recall == 0:
        # Avoid 0/0 when nothing was predicted correctly.
        f_beta = 0
    else:
        f_beta = (1 + beta_squared) * recall * precision / (beta_squared * precision + recall)

    return {
        'recall': recall,
        'precision': precision,
        'f1': f_beta,
    }

In [48]:
# Reload the previously fine-tuned checkpoint from model_path with a 15-way
# token classification head (15 is the length of label_list).
# ignore_mismatched_sizes=True tolerates a checkpoint head of another size.
model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    num_labels=15,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

In [49]:
# Same training configuration as the fine-tuning section above: 3 epochs,
# batch size 1, evaluation and checkpoint save after every epoch.
# NOTE(review): output_dir='output' is the same directory name as in the
# earlier section -- confirm that overwriting earlier checkpoints is intended.
training_args = TrainingArguments(
    output_dir='output',
    learning_rate=2.5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.02,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    report_to='none'
)

In [50]:
def make_trainer(train, valid):
    """Assemble a ``Trainer`` from the notebook-level model and settings.

    Args:
        train: dataset passed as ``train_dataset``.
        valid: dataset passed as ``eval_dataset``.

    Returns:
        The configured ``Trainer``; nothing is trained or evaluated here.
    """
    # Pre-bind label_list so the metric callback only needs the predictions.
    metrics_fn = partial(compute_metrics, label_list=label_list)
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=valid,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=metrics_fn,
    )

In [51]:
# Baseline before the additional training: evaluate the checkpoint loaded
# from model_path on the held-out split. Despite its name,
# `untrained_eval_results` holds the scores of that already fine-tuned model.
trainer = make_trainer(tokenized_train, tokenized_valid)
untrained_eval_results = trainer.evaluate(tokenized_valid)
print('Fine-tuned model results for epoch 2: ',untrained_eval_results)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Fine-tuned model results for epoch 2:  {'eval_loss': 0.0009061108576133847, 'eval_recall': 0.9432624113475178, 'eval_precision': 0.8109756097560976, 'eval_f1': 0.9373814041745729, 'eval_runtime': 14.0834, 'eval_samples_per_second': 48.355, 'eval_steps_per_second': 48.355}


In [52]:
# Build a new Trainer around the same notebook-level `model` and continue
# training on the augmented, positives-only training set.
trainer = make_trainer(tokenized_train, tokenized_valid)
print("Starting training...")
trainer.train()

Starting training...




Epoch,Training Loss,Validation Loss,Recall,Precision,F1
1,0.0092,0.001954,0.929078,0.752874,0.920789
2,0.0057,0.002457,0.829787,0.769737,0.827305
3,0.0038,0.001774,0.964539,0.809524,0.957487


TrainOutput(global_step=15816, training_loss=0.008624976324685367, metrics={'train_runtime': 1147.9853, 'train_samples_per_second': 13.777, 'train_steps_per_second': 13.777, 'total_flos': 3270260095028640.0, 'train_loss': 0.008624976324685367, 'epoch': 3.0})