In [None]:
!pip install ekphrasis
!pip install tqdm boto3 requests regex sentencepiece sacremoses pytorch-transformers
!pip install transformers 
!pip install datasets
!pip install optuna

In [None]:
%env WANDB_PROJECT=nlu_sentiment_analysis
!wandb login 2cad8a8279143c69ce071f54bf37c1f5a5f4e5ff
import wandb

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import itertools
import requests, re, string, datetime, copy
from functools import partial

import torch
import torchvision.transforms as T, torch.nn.functional as F, torch.nn as nn

from datasets import Dataset
from transformers import TrainingArguments, EarlyStoppingCallback, TrainerCallback
from transformers import Trainer
from transformers.modeling_outputs import SequenceClassifierOutput

from sklearn.model_selection import train_test_split

PATH = "../input/sentipolc2016/"

In [None]:
# Load the training split with pandas and preview its first rows.
# The column names of this frame are reused later as the header of the hand-parsed test set.
train = pd.read_csv(PATH + "training_set_sentipolc16.csv")
train.head()

In [None]:
# The gold test file is parsed by hand: every line holds 8 quoted integer fields
# followed by the tweet text, separated by the two-character sequence `",`.
# A tweet may itself contain that sequence, which produces extra splits that have
# to be merged back into the text field.
FIELD_DELIMITER = "\","
NUM_INT_FIELDS = 8
NUM_FIELDS = NUM_INT_FIELDS + 1  # integer fields + tweet text

test = []
# `with` guarantees the handle is closed; the encoding is explicit so that accented
# characters do not depend on the platform default.
with open(PATH + "test_set_sentipolc16_gold2000.csv", "r", encoding="utf-8") as test_file:
    for line_number, line in enumerate(test_file, start=1):
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        arr = line.split(FIELD_DELIMITER)
        if len(arr) < NUM_FIELDS:
            raise ValueError(
                "Line {} of the test set has {} fields, expected {}".format(line_number, len(arr), NUM_FIELDS)
            )
        if len(arr) > NUM_FIELDS:
            # Re-assemble the tweet from ALL the surplus pieces, restoring the
            # delimiter characters that the split removed from the text.
            arr[NUM_INT_FIELDS:] = [FIELD_DELIMITER.join(arr[NUM_INT_FIELDS:])]
        for i in range(NUM_INT_FIELDS):
            arr[i] = int(arr[i].strip("\""))
        test.append(arr)

test = pd.DataFrame(test, columns=train.columns)
test.head()

In [None]:
"""Tokenization classes for Italian AlBERTo models."""
import collections
import os

#from transformers import BertTokenizer, WordpieceTokenizer

def separate2united_labels(row):
  """
  Collapse the two binary polarity flags of a tweet into one integer class.

  opos=0, oneg=1 -> 0 (negative)
  opos=0, oneg=0 -> 1 (neutral)
  opos=1, oneg=0 -> 2 (positive)
  anything else  -> 3 (mixed)
  """
  pos_flag, neg_flag = row["opos"], row["oneg"]

  # Guard clauses, checked in the same order as the class table above is resolved.
  if pos_flag == 0 and neg_flag == 0:
    return 1
  if neg_flag == 0 and pos_flag == 1:
    return 2
  if neg_flag == 1 and pos_flag == 0:
    return 0
  return 3

def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file and map each token to its line index.

    Only the trailing newline is removed from each line. If a token appears more
    than once, it keeps its first position in the ordering but ends up mapped to
    the index of its last occurrence.
    """
    with open(vocab_file, "r", encoding="utf-8") as reader:
        lines = reader.readlines()
    return collections.OrderedDict(
        (line.rstrip("\n"), position) for position, line in enumerate(lines)
    )

# Module-level ekphrasis pre-processor shared by AlBERTo_Preprocessing.preprocess,
# which calls its `pre_process_doc` method on every tweet.
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'user', 'percent', 'money', 'phone', 'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag"},
    fix_html=True,  # fix HTML tokens

    unpack_hashtags=True,  # perform word segmentation on hashtags

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # presumably a list of replacement dictionaries applied to tokens (here: emoticons) — confirm in the ekphrasis docs
    dicts=[emoticons]
)

class AlBERTo_Preprocessing(object):
    """Tweet normaliser: optional lower-casing, the shared ekphrasis pass, then regex clean-up."""

    # (pattern, replacement) pairs applied in this exact order after the ekphrasis pass:
    # 1. blank out every character outside the allowed set,
    # 2. squeeze whitespace runs into a single space,
    # 3. shorten any word character repeated 3+ times to exactly two,
    # 4./5. drop one leading / trailing whitespace character.
    _CLEANUP_STEPS = (
        (r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' '),
        (r'\s+', ' '),
        (r'(\w)\1{2,}', r'\1\1'),
        (r'^\s', ''),
        (r'\s$', ''),
    )

    def __init__(self, do_lower_case=True, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        self.do_lower_case = do_lower_case

    def preprocess(self, text):
        """Return the cleaned version of `text` (a string)."""
        lowered = text.lower() if self.do_lower_case else text
        cleaned = str(" ".join(text_processor.pre_process_doc(lowered)))
        for pattern, replacement in self._CLEANUP_STEPS:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned

# Smoke test: clean one sample tweet (hashtags + URL) and print the result.
# The instance `a` is also the preprocessor applied to the train/test text columns later on.
a = AlBERTo_Preprocessing(do_lower_case=True)
s = "#IlGOverno presenta le linee guida sulla scuola #labuonascuola - http://t.co/SYS1T9QmQN"
b = a.preprocess(s)
print(b)

In [None]:
def k_steps_evidence(num_iterations, early_stopping_patience, training_args, net, training_opos, training_oneg, testing_opos, testing_oneg, validating_opos, validating_oneg):
    """Train and test a fresh model several times per subtask and print mean ± std of the test metrics.

    For each of the `num_iterations` repetitions, and for the "opos" and then the
    "oneg" subtask, a new `net(2)` is trained on the corresponding training split
    (reshuffled with the repetition index as seed), validated on the validating
    split with early stopping, and evaluated on the testing split. Nothing is
    returned; the aggregated metrics are printed.
    """
    np.random.seed(0)
    torch.manual_seed(0)

    metric_names = ["eval_loss", "eval_accuracy", "eval_f1", "eval_precision", "eval_recall"]
    # subtask -> (train, validation, test); insertion order fixes the processing order.
    splits_by_subtask = {
        "opos": (training_opos, validating_opos, testing_opos),
        "oneg": (training_oneg, validating_oneg, testing_oneg),
    }
    history = {
        "{}_{}".format(name, subtask): []
        for name in metric_names
        for subtask in splits_by_subtask
    }

    for run in range(num_iterations):
        for subtask, (train_split, val_split, test_split) in splits_by_subtask.items():
            trainer = Trainer(
                model=net(2),
                args=training_args,
                train_dataset=train_split.shuffle(seed=run),
                eval_dataset=val_split,
                compute_metrics=compute_metrics,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)]
            )
            trainer.train()
            test_scores = trainer.evaluate(test_split)
            for name in metric_names:
                history[name + "_" + subtask].append(test_scores[name])

    for name, values in history.items():
        print("{:18s}\t: {:.2} ± {:.2}".format(name, np.mean(values), np.std(values)))

    # The overall score is the plain average of the two per-subtask mean F1 values.
    overall_f1 = np.mean([np.mean(history["eval_f1_opos"]), np.mean(history["eval_f1_oneg"])])
    print("\nOverall F1: {:.2}".format(overall_f1))

In [None]:
from transformers import AutoTokenizer, AutoModel

# Rebuild the preprocessor and the cleaned sample tweet used to demo the tokenizer below.
a = AlBERTo_Preprocessing(do_lower_case=True)
s: str = "#IlGOverno presenta le linee guida sulla scuola #labuonascuola - http://t.co/SYS1T9QmQN"
b = a.preprocess(s)

# `tok` and `pretrained_model` are module-level objects reused by later cells:
# tokenize_function calls `tok`, and the network classes deep-copy `pretrained_model`.
tok = AutoTokenizer.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
pretrained_model = AutoModel.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
# Cap the tokenizer's maximum length at 128 tokens (the commented-out alternative was the model's position limit).
tok.model_max_length = 128 #model.config.max_position_embeddings
# Show how the cleaned sample tweet is split into tokens.
tokens = tok.tokenize(b)
print(tokens)

In [None]:
# NOTE(review): `s` is assigned here but not used anywhere in this cell.
s = "#IlGOverno presenta le linee guida sulla scuola #labuonascuola - http://t.co/SYS1T9QmQN"
# Feed a deliberately over-long input (1000 repetitions of "ciao ") through the
# tokenizer with truncation enabled and run the encoder on the result.
o = pretrained_model(**tok("ciao " * 1000, return_tensors="pt", truncation=True))

# Display the shapes of the first two model outputs
# (presumably the per-token hidden states and the pooled output — confirm).
o[0].shape , o[1].shape

In [None]:
class MyNet(nn.Module):
    """
        Attach a FC layer on top of the BERT head in order to produce a classification output.
        Hyperparameters are taken from Alberto.

        The pooled_output output of BERT is basically a projection of the [CLS] embeddings via another FC layer (768 -> 768 hidden units).
        We stack another FC layer with Dropout on top of that, as reported in https://github.com/google-research/bert/blob/eedf5716ce1268e56f0a50264a88cafad334ac61/run_classifier.py#L574

        Args:
            num_labels: number of output units. With two or more labels the loss is
                cross-entropy over class indices; with a single label it is a
                binary cross-entropy on the lone logit.
    """
    def __init__(self, num_labels):
        super().__init__()

        self.num_labels = num_labels
        # Every instance gets its own copy of the encoder so repeated runs start from the same pretrained weights.
        self.model = copy.deepcopy(pretrained_model)#AutoModel.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
        self.dropout = nn.Dropout(0.1)
        # Read the width from the encoder configuration instead of hard-coding 768.
        self.linear = nn.Linear(self.model.config.hidden_size, num_labels)

        if self.num_labels >= 2:
            self.loss_fct = nn.CrossEntropyLoss()
        else:
            self.loss_fct = nn.BCEWithLogitsLoss()


    def forward(self, labels=None, input_ids=None, attention_mask=None, **args):
        """Run the encoder and the classification head.

        Args:
            labels: optional targets. When omitted (plain inference) no loss is computed.
            input_ids, attention_mask: tokenizer outputs forwarded to the encoder.
            **args: any extra keyword argument is passed to the encoder unchanged.

        Returns:
            SequenceClassifierOutput with `logits` and, when labels are given, `loss`.
        """
        #For the output format -> https://huggingface.co/transformers/_modules/transformers/models/bert/modeling_bert.html#BertForSequenceClassification.forward
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, **args)
        x = self.dropout(outputs[1])  # outputs[1] is the pooled [CLS] representation
        logits = self.linear(x)

        loss = None
        if labels is not None:
            if self.num_labels >= 2:
                loss = self.loss_fct(logits, labels)
            else:
                # BCEWithLogitsLoss needs float targets with the same shape as the logits.
                loss = self.loss_fct(logits.view(-1), labels.float().view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the global tokenizer `tok`,
    padding to the tokenizer's maximum length and truncating longer inputs."""
    return tok(examples["text"], padding="max_length", truncation=True)

def separate2united_labels2(row):
    """Concatenate the string forms of the "opos" and "oneg" values of `row`, in that order (e.g. "10")."""
    return "".join(str(row[flag]) for flag in ("opos", "oneg"))

def process_dataset_task(example, subtask):
    """Replace the combined label of `example` with the integer flag of one subtask.

    The first character of example["labels"] is used when `subtask` is "opos",
    the second one for any other value. `example` is modified in place and returned.
    """
    flag_position = 0 if subtask == "opos" else 1
    example["labels"] = int(example["labels"][flag_position])
    return example

def _encode_frame(frame):
    """Build the (text, idx, labels) frame: cleaned tweet, original index and the "<opos><oneg>" string label."""
    return pd.DataFrame({
        "text": frame.text.apply(a.preprocess),
        "idx": frame.index,
        "labels": frame[["opos", "oneg"]].apply(separate2united_labels2, axis=1),
    })


def _subtask_dataset(split, subtask, shuffle_seed=None):
    """Derive the torch-formatted dataset of one subtask ("opos" or "oneg") from a split.

    The combined label is reduced to the subtask's flag, the text is tokenized and,
    only when `shuffle_seed` is given, the rows are shuffled with that seed.
    """
    prepared = split\
        .map(process_dataset_task, fn_kwargs={"subtask": subtask}, batched=False)\
        .map(tokenize_function, batched=True)
    if shuffle_seed is not None:
        prepared = prepared.shuffle(seed=shuffle_seed)
    return prepared.with_format("torch")


#train set
dataset = _encode_frame(train)
# Stratify on the combined label so both subtasks keep their class balance in train and validation.
X_train, X_val = train_test_split(dataset, test_size=0.2, random_state=42, stratify=dataset["labels"])

X_train = Dataset.from_pandas(X_train)
X_val = Dataset.from_pandas(X_val)


##
# Create a Dataset for each subtask (evaluated separately by SentiPolc16).
# To do that, simply take, respectively, the first or the second char of the label for opos and oneg.
##
training_opos = _subtask_dataset(X_train, "opos", shuffle_seed=42)
training_oneg = _subtask_dataset(X_train, "oneg", shuffle_seed=42)
validating_opos = _subtask_dataset(X_val, "opos")
validating_oneg = _subtask_dataset(X_val, "oneg")


#test set
dataset = Dataset.from_pandas(_encode_frame(test))

# The test splits are not shuffled: row order does not affect the metrics, and an
# unseeded shuffle would make the opos ordering non-reproducible and different from oneg.
testing_opos = _subtask_dataset(dataset, "opos")
testing_oneg = _subtask_dataset(dataset, "oneg")

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for a prediction object.

    `pred.predictions` is reduced to class indices with an argmax over the last
    axis and compared against `pred.label_ids`.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element (support) is not reported.
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': macro_scores[2],
        'precision': macro_scores[0],
        'recall': macro_scores[1]
    }

class MemorySaverCallback(TrainerCallback):
    """Callback that deletes the folder `./<run_name>` (where checkpoints are saved) when training begins, to save space."""

    def __init__(self, run_name):
        super().__init__()
        self.run_name = run_name

    def on_train_begin(self, args, state, control, **kwargs):
        """Remove the run folder if it exists; otherwise just report that it is missing."""
        print("Removing dirs...")
        run_dir = f'./{self.run_name}'
        if not os.path.isdir(run_dir):
            print("\n\nDirectory does not exists")
            return
        import shutil
        shutil.rmtree(run_dir)

# Fine-tuning hyper-parameters shared by the training cells below.
TRAIN_BATCH_SIZE = 64
PREDICT_BATCH_SIZE = 64
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3
MAX_SEQ_LENGTH = 128
WARMUP_PROPORTION = 0.1
num_train_steps = int(len(training_opos) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)+1
# The warm-up length is a proportion of the optimisation STEPS. Deriving it from the
# number of epochs (int(3 * 0.1) == 0) silently disabled the warm-up altogether.
NUM_WARMUP_STEPS = int(num_train_steps * WARMUP_PROPORTION)
RUN_NAME = "test_trainer"

## Baseline with train-val splitting

In [None]:
# Baseline: AlBERTo hyper-parameters, early stopping on the validation loss.
args = TrainingArguments(
    "test_trainer",
    num_train_epochs=15,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PREDICT_BATCH_SIZE,
    save_total_limit=2,
    learning_rate=LEARNING_RATE,
    warmup_steps=NUM_WARMUP_STEPS,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-6,
    evaluation_strategy="epoch",
    logging_strategy="epoch", #before was 'step', check this
    logging_first_step=False,
    overwrite_output_dir=True,
    # load_best_model_at_end (and the best-metric tracking that early stopping
    # relies on) only works when a checkpoint is saved at every evaluation, so the
    # save strategy must match the evaluation strategy. save_total_limit bounds
    # the disk usage.
    save_strategy="epoch",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Repeat training/evaluation 5 times per subtask and report mean ± std on the test set.
k_steps_evidence(num_iterations=5,
                 early_stopping_patience=3,
                 training_args=args,
                 net=MyNet,
                 training_opos=training_opos,
                 training_oneg=training_oneg,
                 validating_opos=validating_opos,
                 validating_oneg=validating_oneg,
                 testing_opos=testing_opos,
                 testing_oneg=testing_oneg)

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4847,0.410385,0.821188,0.768502,0.779933,0.75967
2,0.3472,0.415323,0.809717,0.769663,0.762605,0.778852
3,0.2295,0.430693,0.827935,0.785527,0.784892,0.786174


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.526,0.475239,0.767881,0.748846,0.767177,0.742418
2,0.3679,0.451778,0.78475,0.776958,0.776209,0.777801
3,0.2573,0.465462,0.792173,0.783779,0.784099,0.783471
4,0.1489,0.631098,0.7861,0.778357,0.777603,0.779204
5,0.0759,0.79973,0.785425,0.778931,0.777108,0.781637


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5047,0.394078,0.813765,0.744409,0.780025,0.725921
2,0.3608,0.381903,0.825911,0.778438,0.784121,0.773478
3,0.2483,0.411073,0.823887,0.783046,0.779375,0.787141
4,0.1559,0.478996,0.82996,0.772665,0.79807,0.756696
5,0.0909,0.639659,0.809717,0.772114,0.763243,0.784877


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5218,0.449956,0.788799,0.782865,0.780771,0.78637
2,0.3739,0.47258,0.784076,0.78062,0.779466,0.789501
3,0.2529,0.537459,0.786775,0.778764,0.778332,0.779224
4,0.1449,0.6164,0.789474,0.786178,0.785059,0.795384


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4895,0.387546,0.831309,0.783222,0.792421,0.775703
2,0.3486,0.414078,0.818489,0.754411,0.783927,0.73747
3,0.2362,0.432762,0.831984,0.771817,0.805126,0.752824


## Hyper-parameter tuning

In [None]:
# Arguments and trainer used as the template for the hyper-parameter search on the opos subtask.
training_args = TrainingArguments(
    RUN_NAME,
    num_train_epochs=15,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PREDICT_BATCH_SIZE,
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=1,
    logging_first_step=False,
    overwrite_output_dir=True,
    # A checkpoint per evaluation is required for load_best_model_at_end and for
    # the best-metric tracking used by early stopping; save_total_limit keeps only
    # one of them on disk.
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    run_name="alberto-repr-kaggle-opos-7"
)

np.random.seed(0)
torch.manual_seed(0)
trainer = Trainer(
    # model_init (rather than model) lets the trainer build a fresh network for every trial.
    model_init=partial(MyNet,2),
    args=training_args,
    train_dataset=training_opos,
    eval_dataset=validating_opos,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3), MemorySaverCallback(RUN_NAME)]
)


def my_hp_space_optuna(trial):
    """Search space sampled for every trial of the hyper-parameter search.

    Args:
        trial: object exposing Optuna's `suggest_float` API.

    Returns:
        dict mapping TrainingArguments field names to the sampled values.
    """
    return {
        "learning_rate": trial.suggest_float("learning_rate", 2e-6, 2e-4, log=True),
        # The sampled values are fractions of the training run, so they belong to
        # `warmup_ratio`. `warmup_steps` is an integer step count: the Trainer casts
        # sampled values to the field's type, which turned every fraction into 0.
        "warmup_ratio":  trial.suggest_float("warmup_ratio", 0., 0.9, step=0.3),
        "weight_decay":  trial.suggest_float("weight_decay", 1e-6, 1e-1)
    }
def my_objective(metrics):
    """Objective of the hyper-parameter search: the "eval_f1" entry of the evaluation metrics."""
    objective_value = metrics["eval_f1"]
    return objective_value

# Run 10 trials that maximise my_objective over the space defined by my_hp_space_optuna.
# NOTE(review): in transformers this call returns a BestRun named tuple
# (run_id, objective, hyperparameters) — confirm for the installed version
# before indexing the result.
sa = trainer.hyperparameter_search(
    direction="maximize", 
    n_trials=10,
    hp_space=my_hp_space_optuna, 
    compute_objective=my_objective
)

#trainer.evaluate()
#trainer.train()
#wandb.finish()
# Display the best run found by the search.
sa

In [None]:
# Re-run the repeated evaluation with the best hyper-parameters found by the search.
# `sa` is the BestRun returned by hyperparameter_search: the sampled values live in
# its `hyperparameters` dict (indexing the tuple itself with a string raises TypeError).
best_hp = sa.hyperparameters

args = TrainingArguments(
    "test_trainer",
    num_train_epochs=15,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PREDICT_BATCH_SIZE,
    save_total_limit=2,
    learning_rate=best_hp["learning_rate"],
    # The warm-up may have been searched either as an integer step count or as a
    # ratio of the training run; honour whichever key the search space produced.
    warmup_steps=int(best_hp.get("warmup_steps", 0)),
    warmup_ratio=best_hp.get("warmup_ratio", 0.0),
    weight_decay=best_hp["weight_decay"],
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-6,
    evaluation_strategy="epoch",  # was misspelled `valuation_strategy`, an unknown keyword
    logging_strategy="steps",
    logging_steps=1,
    logging_first_step=False,
    overwrite_output_dir=True,
    # Must match the evaluation strategy so the best checkpoint exists for
    # load_best_model_at_end / early stopping; save_total_limit bounds disk usage.
    save_strategy="epoch",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

k_steps_evidence(num_iterations=5,
                 early_stopping_patience=3,
                 training_args=args,
                 net=MyNet,
                 training_opos=training_opos,
                 training_oneg=training_oneg,
                 validating_opos=validating_opos,
                 validating_oneg=validating_oneg,
                 testing_opos=testing_opos,
                 testing_oneg=testing_oneg)

## Architecture exploration

#### Add BatchNormalization to classification head

In [None]:
class BertPoolerWithBN(nn.Module):
    """Pooler head: dense projection of the first token's hidden state, then LayerNorm, then LeakyReLU.

    Despite the "BN" in the name, the normalisation layer stored in `self.bn` is a LayerNorm.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.activation = nn.LeakyReLU()
        self.bn = nn.LayerNorm(width)

    def forward(self, hidden_states):
        # "Pooling" = keeping only the hidden state at sequence position 0.
        first_token_state = hidden_states[:, 0]
        return self.activation(self.bn(self.dense(first_token_state)))
    
    
class MyNetBN(nn.Module):
    """
        Same classifier as the baseline (Dropout + FC layer on top of the pooled BERT output),
        except that the pretrained BERT pooler is replaced by a freshly initialised
        BertPoolerWithBN (dense -> normalisation -> LeakyReLU on the first token).

        Args:
            num_labels: number of output units. With two or more labels the loss is
                cross-entropy over class indices; with a single label it is a
                binary cross-entropy on the lone logit.
    """
    def __init__(self, num_labels):
        super().__init__()

        self.num_labels = num_labels
        # Every instance gets its own copy of the encoder so repeated runs start from the same pretrained weights.
        self.model = copy.deepcopy(pretrained_model)#AutoModel.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
        self.dropout = nn.Dropout(0.1)
        # Read the width from the encoder configuration instead of hard-coding 768.
        self.linear = nn.Linear(self.model.config.hidden_size, num_labels)

        # Swap the encoder's pooler: outputs[1] in forward() now comes from the new head.
        self.model.pooler = BertPoolerWithBN(self.model.config)

        if self.num_labels >= 2:
            self.loss_fct = nn.CrossEntropyLoss()
        else:
            self.loss_fct = nn.BCEWithLogitsLoss()


    def forward(self, labels=None, input_ids=None, attention_mask=None, **args):
        """Run the encoder and the classification head.

        Args:
            labels: optional targets. When omitted (plain inference) no loss is computed.
            input_ids, attention_mask: tokenizer outputs forwarded to the encoder.
            **args: any extra keyword argument is passed to the encoder unchanged.

        Returns:
            SequenceClassifierOutput with `logits` and, when labels are given, `loss`.
        """
        #For the output format -> https://huggingface.co/transformers/_modules/transformers/models/bert/modeling_bert.html#BertForSequenceClassification.forward
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, **args)
        x = self.dropout(outputs[1])  # outputs[1] is the output of the replaced pooler
        logits = self.linear(x)

        loss = None
        if labels is not None:
            if self.num_labels >= 2:
                loss = self.loss_fct(logits, labels)
            else:
                # BCEWithLogitsLoss needs float targets with the same shape as the logits.
                loss = self.loss_fct(logits.view(-1), labels.float().view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
# Train and evaluate the MyNetBN variant on the opos subtask with the tuned hyper-parameters.
RUN_NAME = "BN_NET"

training_args = TrainingArguments(
    RUN_NAME,
    num_train_epochs=15,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PREDICT_BATCH_SIZE,
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=1,
    logging_first_step=True,
    overwrite_output_dir=True,
    # A checkpoint per evaluation is required for load_best_model_at_end and for
    # the best-metric tracking used by early stopping; save_total_limit keeps only
    # one of them on disk.
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=4.723383529363845e-06,
    # NOTE(review): warmup_steps is a step count, so 0.6 behaves as (almost) no
    # warm-up. If a 60% warm-up was intended, use warmup_ratio=0.6 instead. The
    # value is kept as is to stay comparable with the results recorded below.
    warmup_steps=0.6,
    weight_decay=0.009760393798851559,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-6,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    report_to="none",
    run_name="alberto-repr-kaggle-opos-9-hyper-tuning1_MyNetBN"
)

np.random.seed(0)
torch.manual_seed(0)
trainer = Trainer(
    model_init=partial(MyNetBN, 2),
    args=training_args,
    train_dataset=training_opos,
    eval_dataset=validating_opos,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Validation metrics before fine-tuning, after fine-tuning, then predictions on the test set.
print(trainer.evaluate())
trainer.train()
print(trainer.evaluate())
print(trainer.predict(testing_opos))
#wandb.finish()

In [None]:
AlBERTo baseline with val set
'test_f1': 0.7281719646005023
--------------------------------------
learning_rate=4.723383529363845e-06,
warmup_steps=0.6,
weight_decay=0.009760393798851559
1	0.475600	0.476718	0.782051	0.637935	0.797978	0.624925
2	0.443400	0.406835	0.823212	0.742419	0.816199	0.715883
3	0.507200	0.385035	0.824561	0.775645	0.782895	0.769533
4	0.233800	0.380086	0.836032	0.781769	0.805750	0.766165
5	0.297100	0.386492	0.834683	0.789434	0.795611	0.784060
6	0.172400	0.398375	0.834683	0.785256	0.798699	0.775023
7	0.105000	0.412933	0.823212	0.782684	0.778505	0.787427
'test_f1': 0.7589981278875193
--------------------------------------
MyNetBN
learning_rate=4.723383529363845e-06,
warmup_steps=0.6,
weight_decay=0.009760393798851559
1	0.646700	0.594625	0.696356	0.673514	0.679184	0.722333
2	0.571800	0.464716	0.799595	0.751313	0.749690	0.753028
3	0.598400	0.430946	0.824561	0.779153	0.781366	0.777064
4	0.398500	0.404444	0.833333	0.774075	0.806625	0.755263
5	0.507500	0.423892	0.821188	0.769649	0.779302	0.761929
6	0.359600	0.430221	0.826586	0.772422	0.789431	0.760389
'test_f1': 0.685691933390074
--------------------------------------
MyNetBN with LayerNorm instead
learning_rate=4.723383529363845e-06,
warmup_steps=0.6,
weight_decay=0.009760393798851559
1	0.531000	0.463256	0.778003	0.649657	0.757834	0.634176
2	0.509800	0.398287	0.821188	0.768887	0.779716	0.760423
3	0.287400	0.384489	0.830634	0.765049	0.809589	0.742854
4	0.291200	0.371507	0.838057	0.785421	0.807558	0.770577
5	0.317900	0.378747	0.834008	0.780444	0.801411	0.766272
6	0.336200	0.388687	0.832659	0.788032	0.792307	0.784167
7	0.149800	0.390517	0.836032	0.789463	0.798551	0.781980
8	0.258800	0.434438	0.813090	0.777745	0.767678	0.793234
9	0.254100	0.425094	0.833333	0.790678	0.792266	0.789152
10	0.145100	0.441016	0.835358	0.789790	0.796821	0.783773
11	0.288900	0.450081	0.831309	0.788940	0.789380	0.788506
12	0.186100	0.465647	0.833333	0.790357	0.792417	0.788399
'test_f1': 0.7224176465802333

#### GRU over all hidden states

In [None]:
class BertPoolerWithGRU(nn.Module):
    """Pooler that summarises ALL token hidden states with a bidirectional GRU.

    Pipeline: 2-layer bidirectional GRU over the sequence -> concatenation of the
    final forward and backward states of the top layer -> dense projection back
    to `config.hidden_size` -> LayerNorm -> LeakyReLU.

    This class has its own name so that it no longer shadows the first-token
    pooler `BertPoolerWithBN` defined for MyNetBN.
    """
    def __init__(self, config):
        super().__init__()
        self.gru = nn.GRU(input_size=config.hidden_size,
                          hidden_size=config.hidden_size,
                          bidirectional=True,
                          dropout=0.1,
                          num_layers=2,
                          batch_first=True)  # inputs are indexed (batch, sequence, hidden)
        # The two directions are concatenated, hence the 2x input width.
        self.dense = nn.Linear(2 * config.hidden_size, config.hidden_size)
        self.activation = nn.LeakyReLU()
        self.bn = nn.LayerNorm(config.hidden_size)

    def forward(self, hidden_states):
        # NOTE(review): the pooler only receives the hidden states, not the attention
        # mask, so padded positions are read by the GRU as well — confirm this is acceptable.
        _, final_states = self.gru(hidden_states)
        # final_states stacks (layer, direction) along dim 0; the last two entries
        # are the forward and backward states of the top layer.
        sequence_summary = torch.cat((final_states[-2], final_states[-1]), dim=-1)
        pooled_output = self.dense(sequence_summary)
        pooled_output = self.bn(pooled_output)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class MyNetGRU(nn.Module):
    """
        Same classifier as the baseline (Dropout + FC layer on top of the pooled BERT output),
        except that the pretrained BERT pooler is replaced by a freshly initialised
        BertPoolerWithGRU, which reads every token's hidden state instead of only the first one.

        Args:
            num_labels: number of output units. With two or more labels the loss is
                cross-entropy over class indices; with a single label it is a
                binary cross-entropy on the lone logit.
    """
    def __init__(self, num_labels):
        # Plain super(): the previous super(MyNetBN, self) raised TypeError because
        # MyNetGRU is not a subclass of MyNetBN.
        super().__init__()

        self.num_labels = num_labels
        # Every instance gets its own copy of the encoder so repeated runs start from the same pretrained weights.
        self.model = copy.deepcopy(pretrained_model)#AutoModel.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
        self.dropout = nn.Dropout(0.1)
        # Read the width from the encoder configuration instead of hard-coding 768.
        self.linear = nn.Linear(self.model.config.hidden_size, num_labels)

        # Swap the encoder's pooler: outputs[1] in forward() now comes from the GRU head.
        self.model.pooler = BertPoolerWithGRU(self.model.config)

        if self.num_labels >= 2:
            self.loss_fct = nn.CrossEntropyLoss()
        else:
            self.loss_fct = nn.BCEWithLogitsLoss()


    def forward(self, labels=None, input_ids=None, attention_mask=None, **args):
        """Run the encoder and the classification head.

        Args:
            labels: optional targets. When omitted (plain inference) no loss is computed.
            input_ids, attention_mask: tokenizer outputs forwarded to the encoder.
            **args: any extra keyword argument is passed to the encoder unchanged.

        Returns:
            SequenceClassifierOutput with `logits` and, when labels are given, `loss`.
        """
        #For the output format -> https://huggingface.co/transformers/_modules/transformers/models/bert/modeling_bert.html#BertForSequenceClassification.forward
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, **args)
        x = self.dropout(outputs[1])  # outputs[1] is the output of the replaced pooler
        logits = self.linear(x)

        loss = None
        if labels is not None:
            if self.num_labels >= 2:
                loss = self.loss_fct(logits, labels)
            else:
                # BCEWithLogitsLoss needs float targets with the same shape as the logits.
                loss = self.loss_fct(logits.view(-1), labels.float().view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )