In [1]:
import pandas as pd
import numpy as np
import collections

import datasets
from datasets import Dataset, Value, ClassLabel, Features
from datasets import load_metric

import torch
from torch import nn

import transformers
from transformers import RobertaTokenizerFast
from transformers import AutoTokenizer

from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import Trainer

import seaborn as sns

import shutil

  from pandas.core import (


### Final preprocessing of benchmark data

In [2]:
def normalize_tokens(row):
    """Return the lower-cased, cleaned-up form of ``row['tok']``.

    Apostrophes, '=', '_' and '-' are removed, '@@' and '@' become '*',
    and '#' as well as the substring 'dummy' become ','.

    Parameters
    ----------
    row : mapping with a string under the key 'tok' (e.g. a DataFrame row).

    Returns
    -------
    str
        The normalized token.
    """
    # Applied in sequence; '@@' has to be rewritten before the single '@'.
    substitutions = (
        ("'", ''),
        ('=', ''),
        ('_', ''),
        ('-', ''),
        ('@@', '*'),
        ('@', '*'),
        ('#', ","),
        ('dummy', ","),
    )
    cleaned = row['tok'].lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [3]:
# --- Input files and output folders -----------------------------------------
BENCHMARK_REDUC_FR = 'data_benchmark/benchmark_reduc_fr.csv'
BENCHMARK_BC_FR = 'data_benchmark/benchmark_bc_fr.csv'
BENCHMARK_PROM_FR = 'data_benchmark/benchmark_prom_fr.csv'

RESULT_FOLDER = "results/"
FIGS_FOLDER = "figs/"
MODELS_FOLDER = "models/"

# --- Reduction benchmark -----------------------------------------------------
# Missing tokens are replaced by ',' before normalization.
df_red = pd.read_csv(BENCHMARK_REDUC_FR)
df_red['tok'] = df_red['tok'].fillna(',')
df_red['tok'] = df_red.apply(normalize_tokens, axis='columns')
# NOTE(review): no 'dur' column is computed for df_red although token2sent reads
# row['dur']; presumably the CSV already provides it -- confirm.
# A token is labelled positive when its reduction ratio is below 0.7.
df_red['label'] = df_red['reduc_ratio'] < 0.7

# --- Backchannel benchmark ---------------------------------------------------
df_bc = pd.read_csv(BENCHMARK_BC_FR)
df_bc['tok'] = df_bc['tok'].fillna(',')
df_bc['tok'] = df_bc.apply(normalize_tokens, axis='columns')
df_bc['dur'] = df_bc['end'] - df_bc['start']
df_bc['label'] = df_bc['backchanneled']

# --- Prominence benchmark ----------------------------------------------------
df_prom = pd.read_csv(BENCHMARK_PROM_FR)
df_prom['tok'] = df_prom['tok'].fillna(',')
df_prom['tok'] = df_prom.apply(normalize_tokens, axis='columns')
df_prom['dur'] = df_prom['end'] - df_prom['start']
# A token is labelled positive when its prominence score is above 1.5.
df_prom['label'] = df_prom['prom'] > 1.5


In [30]:
# Unused: this label smoothing brought no improvement; kept for reference only.

#df_red['prev_red'] = df_red['label_reduc'].shift(1)
#df_red['next_red'] = df_red['label_reduc'].shift(-1)

#def smooth_reduc(row):
#    if not row['label_reduc'] and row['prev_red'] and row['next_red'] and row['tok'] != '#':
#        return True
#    else:
#        return row['label_reduc']
        
#df_red['label_smooth_reduc'] = df_red.apply(smooth_reduc,axis=1)


### Checking Label Distributions

In [5]:
# Class balance of the reduction labels (True = reduc_ratio below 0.7).
pd.Series(collections.Counter(df_red['label']))

False    49218
True     24108
dtype: int64

In [6]:
# Class balance of the backchannel labels (copied from the 'backchanneled' column).
pd.Series(collections.Counter(df_bc['label']))

False    67659
True      5667
dtype: int64

In [7]:
# Class balance of the prominence labels (True = prom above 1.5).
pd.Series(collections.Counter(df_prom['label']))

False    127248
True      14594
dtype: int64

### Add folds

In [4]:
# Cross-validation folds: fold id -> the two speaker codes assigned to it.
FOLDS = {
    1: ['AB', 'CM'],
    2: ['AC', 'MB'],
    3: ['AG', 'YM'],
    4: ['AP', 'LJ'],
    5: ['BX', 'MG'],
    6: ['EB', 'SR'],
    7: ['IM', 'ML'],
    8: ['NH', 'LL'],
}

def addfold(row):
    """Return the fold id whose speaker list contains ``row['spk']``.

    Folds are scanned in the order they appear in ``FOLDS``; ``None`` is
    returned when the speaker belongs to no fold.
    """
    speaker = row['spk']
    return next(
        (fold_id for fold_id, speakers in FOLDS.items() if speaker in speakers),
        None,
    )

# Tag every token row of each benchmark with the fold of its speaker.
df_bc['fold'] = df_bc.apply(addfold, axis='columns')
df_red['fold'] = df_red.apply(addfold, axis='columns')
df_prom['fold'] = df_prom.apply(addfold, axis='columns')

### Changing structure of data set
#### From 1 line = 1 token to 1 line = 1 "sentence" (actually Inter-Pausal Unit)

In [5]:
def token2sent(df,threshold=0.5):
    """Group a one-token-per-row frame into one row per unit.

    A row closes the current unit when its token is a pause marker
    ('#', 'dummy' or ',') and its 'dur' value is strictly above
    ``threshold``; that row itself is not kept. Every other row (including
    shorter pause markers) is appended to the current unit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'tok', 'dur', 'label' and 'fold'.
    threshold : float, default 0.5
        Minimal 'dur' for a pause marker to act as a unit boundary.

    Returns
    -------
    pandas.DataFrame
        Columns 'tok' (list of tokens), 'label' (list of 0/1 ints) and
        'fold'. A unit closed by a pause takes the fold of that pause row;
        a trailing unit takes the fold of the last row of ``df``.
    """
    pause_marks = ('#', 'dummy', ',')
    res = []

    tmp_toks = []
    tmp_labels = []
    last_fold = None

    for _, row in df.iterrows():
        last_fold = row['fold']
        if row['tok'] in pause_marks and row['dur'] > threshold:
            # Long pause: close the running unit (consecutive pauses add nothing).
            if tmp_toks:
                res.append([tmp_toks, tmp_labels, row['fold']])
                tmp_toks = []
                tmp_labels = []
        else:
            tmp_toks.append(row['tok'])
            tmp_labels.append(int(row['label']))

    # Keep the tokens that follow the last long pause instead of dropping them.
    if tmp_toks:
        res.append([tmp_toks, tmp_labels, last_fold])

    return pd.DataFrame(res,columns=['tok','label','fold'])

# Rebuild each benchmark as one row per unit, using token2sent's default threshold (0.5).
df_red_ready = token2sent(df_red)
df_bc_ready = token2sent(df_bc)
df_prom_ready = token2sent(df_prom)


In [10]:
# Preview rows 10-19 of the regrouped prominence data.
df_prom_ready.iloc[10:20]

Unnamed: 0,tok,label,fold
10,"[moi, je, peux, prendre, ça, comme, quelque, c...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1
11,"[particulier, ,, aussi, ,, particulier]","[1, 0, 0, 0, 1]",1
12,"[qui, ne, coule, pas, de, source, ,, qui]","[0, 0, 0, 0, 0, 1, 0, 1]",1
13,"[qui, peuvent, être, extraordinaires]","[0, 0, 0, 1]",1
14,[mh],[0],1
15,"[tout, t, était, normal, logique]","[0, 0, 0, 0, 1]",1
16,"[bon, je, vois, que, tu, es, tellement, à, cou...","[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...",1
17,[oui],[1],1
18,"[c, est, une, région, perdue, du, canada, oui,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",1
19,[mh],[1],1


### Create BIO Labels from binary
##### There is an option to keep the labels binary (all '1' become 'B-XXX')

In [6]:
def bin2bio(row,label_str,pure_binary=False):
    """Convert the 0/1 list in ``row['label']`` into BIO tag strings.

    Parameters
    ----------
    row : mapping whose 'label' entry is a sequence of 0/1 values.
    label_str : str
        Suffix of the tags, e.g. 'RED' gives 'B-RED' / 'I-RED'.
    pure_binary : bool, default False
        When True every positive item is tagged 'B-<label_str>', so the
        output only contains two distinct tags.

    Returns
    -------
    list of str
        One tag per item: 'O' for non-positive items, 'B-...' for the first
        positive item of a run and 'I-...' (or 'B-...' when ``pure_binary``)
        for the following ones.
    """
    begin_tag = 'B-' + label_str
    inside_tag = begin_tag if pure_binary else 'I-' + label_str

    tags = []
    in_span = False
    for flag in row['label']:
        if flag == 1:
            tags.append(inside_tag if in_span else begin_tag)
            in_span = True
        else:
            tags.append('O')
            in_span = False
    return tags

# Add the BIO version of the labels to every benchmark (full B/I/O scheme, not pure binary).
df_red_ready['label_bio'] = df_red_ready.apply(bin2bio, axis=1, label_str='RED', pure_binary=False)
df_bc_ready['label_bio'] = df_bc_ready.apply(bin2bio, axis=1, label_str='BC', pure_binary=False)
df_prom_ready['label_bio'] = df_prom_ready.apply(bin2bio, axis=1, label_str='PROM', pure_binary=False)


### Create HF datasets from the dataframes

In [7]:
def get_label_list(labels):
    """Return the sorted list of distinct labels found in ``labels``.

    Parameters
    ----------
    labels : iterable of iterables of hashable, sortable labels
        (e.g. one list of tag strings per example).

    Returns
    -------
    list
        Every distinct label, in ascending order; empty when ``labels`` is empty.
    """
    distinct = {tag for sequence in labels for tag in sequence}
    return sorted(distinct)

In [8]:
# Prominence task: build the HF dataset and expose the BIO strings under "tags".
dataset_prom = Dataset.from_pandas(df_prom_ready).map(lambda ex: {"tags": ex["label_bio"]})
all_labels_prom = get_label_list(dataset_prom["tags"])
# Encode the tag strings as ClassLabel ids, following the sorted label list.
dataset_prom = dataset_prom.cast_column(
    "tags",
    datasets.Sequence(datasets.ClassLabel(names=all_labels_prom)),
)
label_list_prom = dataset_prom.features["tags"].feature.names
# unused now
#label_all_tokens = True # change this flag to label only first subtoken of a token

Map:   0%|          | 0/9369 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9369 [00:00<?, ? examples/s]

In [9]:
# Backchannel task: build the HF dataset and expose the BIO strings under "tags".
dataset_bc = Dataset.from_pandas(df_bc_ready).map(lambda ex: {"tags": ex["label_bio"]})
all_labels_bc = get_label_list(dataset_bc["tags"])
# Encode the tag strings as ClassLabel ids, following the sorted label list.
dataset_bc = dataset_bc.cast_column(
    "tags",
    datasets.Sequence(datasets.ClassLabel(names=all_labels_bc)),
)
label_list_bc = dataset_bc.features["tags"].feature.names
# unused now
#label_all_tokens = True # change this flag to label only first subtoken of a token


Map:   0%|          | 0/4854 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4854 [00:00<?, ? examples/s]

In [10]:
# Reduction task: build the HF dataset and expose the BIO strings under 'tags'.
dataset_red = Dataset.from_pandas(df_red_ready).map(lambda ex: {'tags': ex["label_bio"]})
all_labels_red = get_label_list(dataset_red['tags'])
# Encode the tag strings as ClassLabel ids, following the sorted label list.
dataset_red = dataset_red.cast_column(
    'tags',
    datasets.Sequence(datasets.ClassLabel(names=all_labels_red)),
)
label_list_red = dataset_red.features['tags'].feature.names
# unused now
#label_all_tokens = True # change this flag to label only first subtoken of a token


Map:   0%|          | 0/4849 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/4849 [00:00<?, ? examples/s]

# More tools for running the experiments

### Function to handle labels and subwords

In [11]:
# adapted from HF examples
def tokenize_and_align_labels(examples,tokenizer):
    """Tokenize pre-split words and spread the word-level tags over sub-tokens.

    Parameters
    ----------
    examples : mapping with "tok" (list of word lists) and "tags" (list of
        per-word label-id lists), i.e. one batch.
    tokenizer : callable returning an encoding that offers
        ``word_ids(batch_index=...)`` and item assignment.

    Returns
    -------
    The tokenizer output with an extra "labels" entry holding one list of
    label ids per example:
      * -100 for positions without a word id (ignored by the loss),
      * the word's tag for the first sub-token of a word,
      * for the following sub-tokens the word's tag, except that tag 0 is
        turned into 1 to stay within the BIO scheme.
        NOTE(review): this assumes id 0 is the B- tag and id 1 the I- tag,
        which holds for the sorted lists printed in this notebook
        (e.g. ['B-RED', 'I-RED', 'O']) -- confirm before changing the labels.
    """
    encoded = tokenizer(examples["tok"], padding='max_length', truncation=True, is_split_into_words=True)

    def _align(word_ids, word_tags):
        # Map one example's sub-token positions to label ids.
        aligned = []
        last_word = None
        for word in word_ids:
            if word is None:
                aligned.append(-100)
            else:
                tag = word_tags[word]
                # Continuation of the same word: a beginning tag (0) becomes 1.
                if word == last_word and tag == 0:
                    tag = 1
                aligned.append(tag)
            last_word = word
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=position), word_tags)
        for position, word_tags in enumerate(examples["tags"])
    ]
    return encoded


### Custom Trainer applying per-label weights in the training loss


In [12]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over token labels.

    The weights are [1.0, 1.0, 0.2], indexed by label id. With the sorted
    label lists used in this notebook (e.g. ['B-BC', 'I-BC', 'O']) this
    down-weights the third label; the model must therefore have exactly
    three labels.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Parameters
        ----------
        model : the model being trained.
        inputs : mapping of model inputs; must contain "labels".
        return_outputs : bool, when True also return the model outputs.
        num_items_in_batch : accepted (and ignored) so that Trainer versions
            passing this extra argument keep working.

        Returns
        -------
        ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Build the weights on the logits' device/dtype so the loss also works
        # on CPU (the previous hard-coded 'cuda' crashed without a GPU).
        class_weights = torch.tensor([1.0, 1.0, 0.2], dtype=logits.dtype, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

### Compute metrics

In [13]:
metric = load_metric("seqeval")

def prepare_compute_metric_with_labellist(label_list):
    """Build a ``compute_metrics`` callback bound to ``label_list``.

    Parameters
    ----------
    label_list : sequence mapping a label id to its tag string.

    Returns
    -------
    callable
        Takes a ``(predictions, labels)`` pair, takes the argmax of the
        predictions over axis 2, drops every position whose gold label is
        -100, and returns the overall f1 / precision / recall / accuracy
        computed by the module-level ``metric``.
    """
    def compute_metric(p):
        scores, gold_ids = p
        predicted_ids = np.argmax(scores, axis=2)

        true_predictions = []
        true_labels = []
        for predicted_row, gold_row in zip(predicted_ids, gold_ids):
            # Keep only the positions that carry a real label (special tokens use -100).
            kept = [(pred, gold) for (pred, gold) in zip(predicted_row, gold_row) if gold != -100]
            true_predictions.append([label_list[pred] for (pred, _) in kept])
            true_labels.append([label_list[gold] for (_, gold) in kept])

        results = metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "f1": results["overall_f1"],
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "accuracy": results["overall_accuracy"],
        }
    return compute_metric



  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [14]:
def run_one_fold(task,fold,split_dataset,checkpoint,tokenizer,label_list,weighted=False,verbose=False,error_analysis=False,keep_models=False):
    """Fine-tune ``checkpoint`` on one train/valid/test split and score the test part.

    Parameters
    ----------
    task : str, task name used in the output folder and file names.
    fold : identifier of the current fold (printed and used in names).
    split_dataset : DatasetDict with "train", "valid" and "test" splits
        providing the "tok" and "tags" columns.
    checkpoint : str, model name or path given to ``from_pretrained``.
    tokenizer : fast tokenizer matching the checkpoint.
    label_list : sequence mapping a label id to its tag string.
    weighted : bool, use ``CustomTrainer`` (class-weighted loss) instead of ``Trainer``.
    verbose : bool, print the model and the available device.
    error_analysis : bool, dump test predictions and gold tags to a CSV in ``RESULT_FOLDER``.
    keep_models : bool, when False the training output folder is deleted afterwards.

    Returns
    -------
    The result of ``metric.compute`` on the test split (seqeval scores).
    """
    print(fold)

    tokenized_split_dataset = split_dataset.map(lambda d : tokenize_and_align_labels(d,tokenizer), batched=True)

    model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=len(label_list))

    if verbose:
        print(model)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print("Device in use:",device)

    # Last path component of the checkpoint, with or without a trailing '/'
    # ('../models/x/' -> 'x', 'xlm-roberta-base' -> 'xlm-roberta-base').
    model_name = str(fold)+'-'+checkpoint.rstrip("/").split("/")[-1]

    print(model_name)

    output_dir = MODELS_FOLDER+model_name+"-finetuned-"+task

    args = TrainingArguments(
        output_dir,
        evaluation_strategy = "epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    compute_metric = prepare_compute_metric_with_labellist(label_list)

    # Both trainers are built with keyword arguments: Trainer's positional order is
    # (model, args, data_collator, train_dataset, eval_dataset, ...), so passing the
    # datasets positionally (as the weighted branch used to) mixed up the arguments.
    trainer_class = CustomTrainer if weighted else Trainer
    trainer = trainer_class(model,args,
                            train_dataset=tokenized_split_dataset["train"],
                            eval_dataset=tokenized_split_dataset["valid"],
                            data_collator=data_collator,tokenizer=tokenizer,compute_metrics=compute_metric)

    trainer.train()

    predictions, labels, _ = trainer.predict(tokenized_split_dataset["test"])
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
        ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
        ]

    if error_analysis:
        EA_df = pd.DataFrame(tokenized_split_dataset["test"])
        EA_df['predict'] = true_predictions
        EA_df['gold'] = true_labels
        # The file name now really contains the task and the fold/model name; the
        # previous literal had mismatched quotes, so every task and checkpoint
        # overwrote the same per-fold file.
        EA_df.to_csv(RESULT_FOLDER+"error_analysis_"+task+"_"+model_name+".csv")

    if not keep_models:
        # ignore_errors: nothing to clean up if no checkpoint folder was written.
        shutil.rmtree(output_dir, ignore_errors=True)

    return metric.compute(predictions=true_predictions, references=true_labels)

In [15]:
def run_crossvalid(task,base_dataset,checkpoint,label_list,weighted=False,verbose=False,error_analysis=False,keep_models=False):
    """Run the 8-fold cross-validation of one checkpoint on one task.

    For each fold ``i`` in 1..8, fold ``i`` is the test set, the next fold
    (wrapping from 8 back to 1) is the validation set and all remaining
    folds form the training set.

    Parameters
    ----------
    task : str, task name forwarded to ``run_one_fold``.
    base_dataset : dataset with a "fold" column.
    checkpoint : str, model name or path (also used to load the tokenizer).
    label_list : sequence mapping a label id to its tag string.
    weighted, verbose, error_analysis, keep_models : forwarded to ``run_one_fold``.

    Returns
    -------
    dict
        Keys 'fs', 'prec' and 'rec', each a list with one test score per fold.
    """
    tokenizer = RobertaTokenizerFast.from_pretrained(checkpoint, max_len=512,add_prefix_space=True)

    assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

    f1_scores, precision_scores, recall_scores = [], [], []
    for test_fold in range(1,9):
        valid_fold = test_fold%8+1
        held_out = [test_fold, valid_fold]
        print(test_fold)
        split_ds = datasets.DatasetDict({
            'train': base_dataset.filter(lambda example: example["fold"] not in held_out),
            'test': base_dataset.filter(lambda example: example["fold"] == test_fold),
            'valid': base_dataset.filter(lambda example: example["fold"] == valid_fold)
        })
        print(label_list)
        fold_scores = run_one_fold(task,test_fold,split_ds,checkpoint,tokenizer,label_list,weighted,verbose,error_analysis,keep_models)
        print(fold_scores.keys())
        f1_scores.append(fold_scores['overall_f1'])
        precision_scores.append(fold_scores['overall_precision'])
        recall_scores.append(fold_scores['overall_recall'])
    return {'fs': f1_scores, 'prec': precision_scores, 'rec': recall_scores}

# Run experiments

In [None]:
def run_complete_expe(expe_name,ds,label_list,weighted=False):
    """Cross-validate the three checkpoints on one task and save scores and plots.

    Parameters
    ----------
    expe_name : str, task name; used as prefix of every output file.
    ds : dataset holding the task data; its "label" column is dropped first.
    label_list : sequence mapping a label id to its tag string.
    weighted : bool, forwarded to ``run_crossvalid``.

    Side effects
    ------------
    Writes ``<expe_name>10K_cv.csv`` to ``RESULT_FOLDER`` and one box plot per
    score (F-score, precision, recall) to ``FIGS_FOLDER``.

    Returns
    -------
    int
        Always 0.
    """
    ds = ds.remove_columns("label")
    print('====')
    print(expe_name)
    print('====')

    # (progress message, checkpoint, value written in the 'model' column)
    runs = (
        ('running conversation ', '../toy_llm/models/fr_10M_10K_conv/', 'conv10K'),
        ('running wiki', '../toy_llm/models/fr_10M_10K_wiki/', 'wiki10K'),
        ('running roberta', 'xlm-roberta-base', 'roberta'),
    )

    per_model_scores = []
    for message, checkpoint, model_tag in runs:
        print(message)
        cv_scores = run_crossvalid(expe_name,ds,checkpoint,label_list,
                                   weighted=weighted,verbose=False,error_analysis=True,keep_models=False)
        cv_scores_df = pd.DataFrame(cv_scores)
        cv_scores_df['model'] = model_tag
        per_model_scores.append(cv_scores_df)

    res_cv_all = pd.concat(per_model_scores)

    res_cv_all.to_csv(RESULT_FOLDER+expe_name+'10K_cv.csv')

    # One box plot per score column; the figure is cleared after each save.
    for score_column, file_suffix in (('fs', '_fscore.png'), ('prec', 'prec.png'), ('rec', 'rec.png')):
        plot = sns.boxplot(data=res_cv_all,x='model',y=score_column)
        plot.figure.savefig(FIGS_FOLDER+expe_name+file_suffix,dpi=300)
        plot.figure.clf()
    return 0

# Run the three tasks; each one must be paired with its own dataset and label list.
run_complete_expe('red',dataset_red,label_list_red)
run_complete_expe('prom',dataset_prom,label_list_prom)
# The backchannel run uses dataset_bc (it previously received dataset_red by mistake)
# and the class-weighted loss.
run_complete_expe('bc',dataset_bc,label_list_bc,weighted=True)


====
red
====
running conversation 
1




Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
1


Map:   0%|          | 0/3504 [00:00<?, ? examples/s]

Map:   0%|          | 0/594 [00:00<?, ? examples/s]

Map:   0%|          | 0/751 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_conv/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1-fr_10M_10K_conv


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.52137,0.34429,0.335389,0.353677,0.77784
2,No log,0.506201,0.35874,0.34537,0.373187,0.782121
3,0.602700,0.500435,0.368318,0.361231,0.375688,0.787313


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
2


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
2


Map:   0%|          | 0/3479 [00:00<?, ? examples/s]

Map:   0%|          | 0/751 [00:00<?, ? examples/s]

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_conv/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2-fr_10M_10K_conv


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.582103,0.286763,0.304374,0.271078,0.748606
2,No log,0.576493,0.293191,0.310119,0.278015,0.746932
3,0.590500,0.575672,0.299667,0.312139,0.288154,0.748765


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
3


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
3


Map:   0%|          | 0/3480 [00:00<?, ? examples/s]

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_conv/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3-fr_10M_10K_conv


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.621734,0.301543,0.321113,0.284222,0.738462
2,No log,0.610033,0.308543,0.342273,0.280865,0.743392
3,0.574300,0.608287,0.317083,0.344722,0.293547,0.743511


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
4


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
4


Map:   0%|          | 0/3650 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_conv/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4-fr_10M_10K_conv


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.572225,0.248747,0.272852,0.228555,0.732377
2,No log,0.566767,0.271186,0.270642,0.271733,0.738859
3,0.577000,0.562698,0.267494,0.262313,0.272884,0.735942


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
5


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
5


Map:   0%|          | 0/3845 [00:00<?, ? examples/s]

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

Map:   0%|          | 0/555 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_conv/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


5-fr_10M_10K_conv


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.633036,0.27321,0.29683,0.253071,0.738085
2,No log,0.629139,0.277358,0.298233,0.259214,0.741168
3,0.578800,0.623013,0.289987,0.308011,0.273956,0.743811


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
6


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
6


Map:   0%|          | 0/3656 [00:00<?, ? examples/s]

Map:   0%|          | 0/555 [00:00<?, ? examples/s]

Map:   0%|          | 0/638 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_conv/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


6-fr_10M_10K_conv


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.546815,0.318168,0.340397,0.298664,0.770089
2,No log,0.526997,0.326664,0.375,0.289367,0.781069
3,0.590200,0.519999,0.331856,0.364649,0.304474,0.781495


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
7


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
7


Map:   0%|          | 0/3718 [00:00<?, ? examples/s]

Map:   0%|          | 0/638 [00:00<?, ? examples/s]

Map:   0%|          | 0/493 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_conv/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


7-fr_10M_10K_conv


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.535738,0.282912,0.298475,0.268891,0.772293
2,No log,0.522485,0.292147,0.313131,0.273798,0.776141
3,0.597300,0.515929,0.305138,0.323789,0.288518,0.780918


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
8


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
8


Map:   0%|          | 0/3762 [00:00<?, ? examples/s]

Map:   0%|          | 0/493 [00:00<?, ? examples/s]

Map:   0%|          | 0/594 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_conv/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


8-fr_10M_10K_conv


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.546431,0.28591,0.293869,0.278371,0.768338
2,No log,0.533241,0.289492,0.311538,0.27036,0.7744
3,0.594800,0.528834,0.300275,0.3101,0.291055,0.773707


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
running wiki
1




Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
1


Map:   0%|          | 0/3504 [00:00<?, ? examples/s]

Map:   0%|          | 0/594 [00:00<?, ? examples/s]

Map:   0%|          | 0/751 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_wiki/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1-fr_10M_10K_wiki


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.537957,0.310878,0.285177,0.341671,0.76933
2,No log,0.512695,0.324778,0.304329,0.348174,0.77954
3,0.609300,0.501337,0.344266,0.330119,0.35968,0.786283


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
2


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
2


Map:   0%|          | 0/3479 [00:00<?, ? examples/s]

Map:   0%|          | 0/751 [00:00<?, ? examples/s]

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_wiki/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2-fr_10M_10K_wiki


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.589048,0.243459,0.251565,0.235859,0.736683
2,No log,0.579617,0.254885,0.281029,0.233191,0.741512
3,0.598600,0.572311,0.26877,0.27414,0.263607,0.742377


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
3


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
3


Map:   0%|          | 0/3480 [00:00<?, ? examples/s]

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_wiki/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3-fr_10M_10K_wiki


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.638169,0.290102,0.287812,0.292428,0.728621
2,No log,0.63215,0.299859,0.325612,0.277881,0.732951
3,0.578200,0.622133,0.305071,0.325306,0.287206,0.736307


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
4


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
4


Map:   0%|          | 0/3650 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_wiki/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4-fr_10M_10K_wiki


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.55677,0.262019,0.274041,0.251007,0.743127
2,No log,0.554856,0.268335,0.271336,0.2654,0.74426
3,0.582700,0.551257,0.266889,0.258065,0.276339,0.745846


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
5


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
5


Map:   0%|          | 0/3845 [00:00<?, ? examples/s]

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

Map:   0%|          | 0/555 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_wiki/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


5-fr_10M_10K_wiki


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.628926,0.233187,0.238739,0.227887,0.728165
2,No log,0.627412,0.266842,0.286926,0.249386,0.740079
3,0.587400,0.619207,0.278553,0.288058,0.269656,0.744023


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
6


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
6


Map:   0%|          | 0/3656 [00:00<?, ? examples/s]

Map:   0%|          | 0/555 [00:00<?, ? examples/s]

Map:   0%|          | 0/638 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_wiki/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


6-fr_10M_10K_wiki


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.555439,0.289538,0.303765,0.276583,0.766044
2,No log,0.527756,0.317727,0.357558,0.28588,0.784943
3,0.595500,0.524721,0.32454,0.34373,0.307379,0.783015


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
7


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
7


Map:   0%|          | 0/3718 [00:00<?, ? examples/s]

Map:   0%|          | 0/638 [00:00<?, ? examples/s]

Map:   0%|          | 0/493 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_wiki/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


7-fr_10M_10K_wiki


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.545671,0.251245,0.288071,0.222767,0.762962
2,No log,0.533934,0.280374,0.297685,0.264966,0.764882
3,0.603400,0.525995,0.283186,0.283744,0.28263,0.773044


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
8


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
8


Map:   0%|          | 0/3762 [00:00<?, ? examples/s]

Map:   0%|          | 0/493 [00:00<?, ? examples/s]

Map:   0%|          | 0/594 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_wiki/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


8-fr_10M_10K_wiki


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.558407,0.258599,0.247259,0.271028,0.762975
2,No log,0.538596,0.269203,0.291149,0.250334,0.77636
3,0.601400,0.535668,0.288175,0.285996,0.290387,0.771898


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
running roberta


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMRobertaTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


1




Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
1


Map:   0%|          | 0/3504 [00:00<?, ? examples/s]

Map:   0%|          | 0/594 [00:00<?, ? examples/s]

Map:   0%|          | 0/751 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1-xlm-roberta-base


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.50727,0.366003,0.359556,0.372686,0.779536
2,No log,0.509303,0.35999,0.344292,0.377189,0.769624
3,0.593800,0.487361,0.377707,0.367773,0.388194,0.784412


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
2


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
2


Map:   0%|          | 0/3479 [00:00<?, ? examples/s]

Map:   0%|          | 0/751 [00:00<?, ? examples/s]

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2-xlm-roberta-base


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.565658,0.322521,0.378049,0.281217,0.758996
2,No log,0.555332,0.335723,0.381275,0.299893,0.763701
3,0.583100,0.553797,0.342523,0.358183,0.328175,0.761


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
3


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
3


Map:   0%|          | 0/3480 [00:00<?, ? examples/s]

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3-xlm-roberta-base


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.625283,0.33244,0.341598,0.32376,0.733231
2,No log,0.630666,0.350682,0.392874,0.316673,0.747236
3,0.564100,0.627127,0.350686,0.381978,0.324133,0.743483


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
4


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
4


Map:   0%|          | 0/3650 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4-xlm-roberta-base


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.527766,0.292157,0.337868,0.25734,0.765167
2,No log,0.519738,0.334307,0.339466,0.329303,0.77199
3,0.570800,0.522381,0.335257,0.336032,0.334485,0.770514


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
5


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
5


Map:   0%|          | 0/3845 [00:00<?, ? examples/s]

Map:   0%|          | 0/449 [00:00<?, ? examples/s]

Map:   0%|          | 0/555 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


5-xlm-roberta-base


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.60502,0.298332,0.319104,0.280098,0.750945
2,No log,0.59689,0.314508,0.349887,0.285627,0.759005
3,0.575700,0.590672,0.324721,0.337757,0.312654,0.758607


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
6


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
6


Map:   0%|          | 0/3656 [00:00<?, ? examples/s]

Map:   0%|          | 0/555 [00:00<?, ? examples/s]

Map:   0%|          | 0/638 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


6-xlm-roberta-base


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.546012,0.349984,0.392909,0.315514,0.773124
2,No log,0.52995,0.368574,0.415452,0.331203,0.77999
3,0.576800,0.527049,0.37995,0.406353,0.356769,0.7795


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
7


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
7


Map:   0%|          | 0/3718 [00:00<?, ? examples/s]

Map:   0%|          | 0/638 [00:00<?, ? examples/s]

Map:   0%|          | 0/493 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


7-xlm-roberta-base


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.519749,0.281231,0.365204,0.228656,0.776343
2,No log,0.495537,0.324353,0.351259,0.301276,0.786423
3,0.585400,0.493004,0.346786,0.352227,0.341511,0.784691


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
8


Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4849 [00:00<?, ? examples/s]

['B-RED', 'I-RED', 'O']
8


Map:   0%|          | 0/3762 [00:00<?, ? examples/s]

Map:   0%|          | 0/493 [00:00<?, ? examples/s]

Map:   0%|          | 0/594 [00:00<?, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


8-xlm-roberta-base


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.529411,0.321557,0.34953,0.29773,0.777496
2,No log,0.515188,0.322103,0.361596,0.290387,0.784598
3,0.584400,0.513007,0.346141,0.359454,0.333778,0.782772


dict_keys(['RED', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
====
prom
====
running conversation 
1




Filter:   0%|          | 0/9369 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9369 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9369 [00:00<?, ? examples/s]

['B-PROM', 'I-PROM', 'O']
1


Map:   0%|          | 0/6844 [00:00<?, ? examples/s]

Map:   0%|          | 0/1229 [00:00<?, ? examples/s]

Map:   0%|          | 0/1296 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_conv/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1-fr_10M_10K_conv


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.373266,0.102439,0.15245,0.077135,0.830738
2,0.357600,0.371678,0.099073,0.221338,0.06382,0.83521
3,0.317400,0.366486,0.168546,0.265286,0.123508,0.835092


dict_keys(['PROM', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
2


Filter:   0%|          | 0/9369 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9369 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9369 [00:00<?, ? examples/s]

['B-PROM', 'I-PROM', 'O']
2


Map:   0%|          | 0/6813 [00:00<?, ? examples/s]

Map:   0%|          | 0/1296 [00:00<?, ? examples/s]

Map:   0%|          | 0/1260 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_conv/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2-fr_10M_10K_conv


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.312701,0.13905,0.192719,0.108761,0.867447
2,0.362700,0.305617,0.175681,0.22,0.146224,0.864967
3,0.320400,0.30532,0.160568,0.210166,0.129909,0.864438


dict_keys(['PROM', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
3


Filter:   0%|          | 0/9369 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9369 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9369 [00:00<?, ? examples/s]

['B-PROM', 'I-PROM', 'O']
3


Map:   0%|          | 0/6836 [00:00<?, ? examples/s]

Map:   0%|          | 0/1260 [00:00<?, ? examples/s]

Map:   0%|          | 0/1273 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_conv/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3-fr_10M_10K_conv


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.31635,0.113978,0.198925,0.07987,0.8661
2,0.377300,0.311303,0.143575,0.214362,0.107933,0.866662
3,0.327800,0.312648,0.175254,0.223618,0.144091,0.862449


dict_keys(['PROM', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy'])
4


Filter:   0%|          | 0/9369 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9369 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9369 [00:00<?, ? examples/s]

['B-PROM', 'I-PROM', 'O']
4


Map:   0%|          | 0/7279 [00:00<?, ? examples/s]

Map:   0%|          | 0/1273 [00:00<?, ? examples/s]

Map:   0%|          | 0/817 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ../toy_llm/models/fr_10M_10K_conv/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4-fr_10M_10K_conv


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.368544,0.133119,0.253049,0.090316,0.849261
2,0.368100,0.368144,0.167772,0.296552,0.116975,0.854116
