In [None]:
!pip install -qq datasets

In [None]:
import pandas as pd
from sklearn.model_selection import KFold
# Shuffled 10-way splitter with a fixed seed so the partition is repeatable.
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [None]:
!pip -qq install transformers
from transformers import pipeline

import torch
# Device index: 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1

In [None]:
# Base name shared by the two checkpoint identifiers built below.
model_name = "twitter-roberta-base-sentiment-mlm"

# Checkpoint identifiers under the "neoyipeng/" namespace: plain and SKEP variants.
model_checkpoint_lm_class = f"neoyipeng/{model_name}-class"
model_checkpoint_lm_skep_class = f"neoyipeng/{model_name}-skep-class"

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments,EarlyStoppingCallback

# The tokenizer and the 3-label classifier are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint_lm_class,
    use_fast=True,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_lm_class,
    num_labels=3,
)

# Shared by training and evaluation (see per_device_*_batch_size below).
batch_size=64

# Training configuration reused by every Trainer built later in the notebook.
training_args = TrainingArguments(
    "Finbert-EXT-mlm-class",          # output directory for checkpoints
    report_to='none',                 # no external experiment trackers
    warmup_ratio=0.1,
    overwrite_output_dir=True,        # overwrite the content of the output directory to save space
    save_total_limit=1,               # prevents saving many models
    learning_rate=2e-5,
    lr_scheduler_type='cosine_with_restarts', # similar to fastai's one cycle
    # Mixed precision is only supported on CUDA; `torch` is imported in the
    # device-selection cell. Falling back to fp32 keeps the CPU path (device=-1)
    # usable instead of failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=batch_size,   # batch size for evaluation
    logging_strategy  = 'epoch',      # log once per epoch
    evaluation_strategy = "epoch",    # evaluate once per epoch
    save_strategy='epoch',            # must match the evaluation strategy for load_best_model_at_end
    metric_for_best_model='accuracy', # needed for early stopping callback to determine when to stop
    load_best_model_at_end=True,
    num_train_epochs=4,               # upper bound on epochs; early stopping may end training sooner
    remove_unused_columns=True,       # drop dataset columns the model's forward() does not accept
    eval_accumulation_steps=5,        # move accumulated predictions to CPU every 5 eval steps
                                      # instead of holding the whole eval set on the accelerator
    label_smoothing_factor =0.1       # add label smoothing
)

In [None]:
from datasets import load_dataset,concatenate_datasets

# Ten contiguous 10%-wide validation slices of the "train" split and, for each
# one, the complementary remainder used for training. Both lists are in the
# same order, so position i of each list belongs to the same fold.
fold_starts = range(0, 100, 10)
val_splits = [f'train[{start}%:{start+10}%]' for start in fold_starts]
trn_splits = [f'train[:{start}%]+train[{start+10}%:]' for start in fold_starts]

vals_ds = load_dataset('financial_phrasebank', 'sentences_allagree', split=val_splits)
trains_ds = load_dataset('financial_phrasebank', 'sentences_allagree', split=trn_splits)

In [None]:
def tokenize_function(examples):
    """Tokenize the "sentence" field of a batch.

    Every sequence is padded or truncated to exactly 128 tokens, and the
    tokenizer's output is returned unchanged.
    """
    sentences = examples["sentence"]
    return tokenizer(
        sentences,
        padding='max_length',
        max_length=128,
        truncation=True,
    )

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
    """Score a prediction object with accuracy and macro-averaged P/R/F1.

    `pred` must expose `label_ids` (true labels) and `predictions` (per-class
    scores); the predicted class is the argmax over the last axis.

    Returns a dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=macro_f1,
        precision=macro_precision,
        recall=macro_recall,
    )

In [None]:
# 10-fold cross-validation: fine-tune on each training split, score on the
# matching validation split, then report the mean accuracy and macro F1.
from sklearn.metrics import f1_score  # used below; no other cell imports it

acc,f1=[],[]
for trn,val in zip(trains_ds,vals_ds):

    tokenized_val = val.map(tokenize_function, batched=True, remove_columns=['sentence'])
    tokenized_trn = trn.map(tokenize_function, batched=True, remove_columns=['sentence'])

    # Reload the checkpoint for every fold. Reusing one model object would carry
    # the weights trained on earlier folds (which included this fold's
    # validation rows) into the next fold and inflate the scores.
    fold_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint_lm_class,num_labels=3)

    trainer = Trainer(
        model=fold_model,
        args=training_args,
        train_dataset=tokenized_trn,
        eval_dataset=tokenized_val,
        callbacks=[EarlyStoppingCallback(1)],  # stop after 1 evaluation without improvement
        compute_metrics=compute_metrics)

    trainer.train()
    # Trainer.predict returns NumPy arrays, so take the argmax with NumPy
    # (torch.argmax would reject a non-tensor input).
    preds=trainer.predict(tokenized_val).predictions.argmax(-1).tolist()

    acc.append(accuracy_score(val['label'],preds))
    f1.append (f1_score(val['label'],preds,average='macro'))

print('*'*99)
# The bare f-string is the cell's last expression, so the notebook displays it.
f'10-fold accuracy is {sum(acc)/len(acc):0.0%} and f1 is {sum(f1)/len(f1):0.0%}!'