In [1]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import itertools
import requests, re, string, datetime, copy
from functools import partial

import torch
import torchvision.transforms as T, torch.nn.functional as F, torch.nn as nn

from datasets import Dataset
import datasets
import logging
datasets.logging.get_verbosity = lambda: logging.NOTSET

from transformers import TrainingArguments, EarlyStoppingCallback, TrainerCallback
from transformers import Trainer
from transformers.modeling_outputs import SequenceClassifierOutput

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

PATH = "./data/Sentipolc16/"

In [2]:
train = pd.read_csv(PATH + "training_set_sentipolc16.csv")
train.head()

Unnamed: 0,idtwitter,subj,opos,oneg,iro,lpos,lneg,top,text
0,122449983151669248,1,0,1,0,0,1,1,Intanto la partita per Via Nazionale si compli...
1,125485104863780865,1,0,1,0,0,1,1,"False illusioni, sgradevoli realtà Mario Monti..."
2,125513454315507712,1,0,1,0,0,1,1,"False illusioni, sgradevoli realtà #editoriale..."
3,125524238290522113,1,0,1,0,0,1,1,Mario Monti: Berlusconi risparmi all'Italia il...
4,125527933224886272,1,0,1,0,0,1,1,Mario Monti: Berlusconi risparmi all'Italia il...


In [3]:
file1 = open(PATH + "test_set_sentipolc16_gold2000.csv", 'r') 
Lines = file1.readlines()
 
test = []
for line in Lines:
    arr = line.split("\",")
    if len(arr) != 9:
        arr[8] = arr[8] + arr[9]  #to account for tweets containing the delimiter charachter that would create more splits than needed
        del arr[9:]
    for i in range(8):
        arr[i] = int(arr[i].strip("\""))
    test.append(arr)

test = pd.DataFrame(test, columns=train.columns)
test.head()

Unnamed: 0,idtwitter,subj,opos,oneg,iro,lpos,lneg,top,text
0,507074506880712705,0,0,0,0,0,0,2,"""Tra 5 minuti presentazione piano scuola del g..."
1,507075789456961536,1,1,0,0,1,0,2,"""\""@matteorenzi: Alle 10 appuntamento su http:..."
2,507077511902425088,1,0,1,0,0,1,2,"""#labuonascuola gli #evangelisti #digitali non..."
3,507079183315787777,0,0,0,0,0,0,2,"""Riforma scuola Tutto il discorso di Renzi su..."
4,507080190225563648,1,0,0,0,0,0,2,""".@matteorenzi @MiurSocial #labuonascuola bast..."


In [4]:
import collections
import os

def separate2united_labels(row):
  """
  Return a single scalar integer label associated to the polarity of the tweet.

  Negative -> 0
  Neutral  -> 1
  Positive -> 2
  Mixed    -> 3
  """
  if row["opos"] == 0 and row["oneg"] == 0:
    return 1
  elif row["oneg"] == 0 and row["opos"] == 1:
    return 2
  elif row["oneg"] == 1 and row["opos"] == 0:
    return 0
  else:
    return 3

def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'user', 'percent', 'money', 'phone', 'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag"},
    fix_html=True,  # fix HTML tokens

    unpack_hashtags=True,  # perform word segmentation on hashtags

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons]
)

class AlBERTo_Preprocessing(object):
    def __init__(self, do_lower_case=True, **kwargs):
        self.do_lower_case = do_lower_case

    def preprocess(self, text):
        if self.do_lower_case:
            text = text.lower()
        text = str(" ".join(text_processor.pre_process_doc(text)))
        text = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
        text = re.sub(r'^\s', '', text)
        text = re.sub(r'\s$', '', text)
        return text

a = AlBERTo_Preprocessing(do_lower_case=True)
s = "#IlGOverno presenta le linee guida sulla scuola #labuonascuola - http://t.co/SYS1T9QmQN"
b = a.preprocess(s)
print(b)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading english - 1grams ...
<hashtag> il governo </hashtag> presenta le linee guida sulla scuola <hashtag> la buona scuola </hashtag> <url>


In [5]:
def k_steps_evidence_sa(num_iterations, early_stopping_patience, training_args, net, training_opos, training_oneg, testing_opos, testing_oneg, validating_opos, validating_oneg):
    np.random.seed(0)
    torch.manual_seed(0)

    keys = ["eval_loss", "eval_accuracy", "eval_f1", "eval_precision", "eval_recall"]
    metrics = { i + sa: [] for i in keys for sa in ["_opos", "_oneg"]}
    for i in range(num_iterations):
        for subtask , training , testing , validating in zip(["opos", "oneg"], [training_opos, training_oneg], [testing_opos, testing_oneg], [validating_opos, validating_oneg]):
            trainer = Trainer(
                model=net(2),
                args=training_args, 
                train_dataset=training.shuffle(seed=i), 
                eval_dataset=validating, 
                compute_metrics=compute_metrics,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)] if early_stopping_patience is not None else None
            )
            trainer.train()
            res = trainer.evaluate(testing)
            for m in keys:
                metrics[m + "_" + subtask].append(res[m])

    for m in metrics.keys():
        print("{:18s}\t: {:.4} ± {:.4}".format(m, np.mean(metrics[m]), np.std(metrics[m])))
    print("\nOverall F1: {:.4}".format(np.mean(
        [np.mean(metrics["eval_f1_opos"]), np.mean(metrics["eval_f1_oneg"])]
    )))
    
def k_steps_evidence_subj(num_iterations, early_stopping_patience, training_args, net, training, testing, validating):
    np.random.seed(0)
    torch.manual_seed(0)

    keys = ["eval_loss", "eval_accuracy", "eval_f1", "eval_precision", "eval_recall"]
    metrics = { i: [] for i in keys}
    for i in range(num_iterations):
        trainer = Trainer(
            model=net(2),
            args=training_args, 
            train_dataset=training.shuffle(seed=i), 
            eval_dataset=validating, 
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)] if early_stopping_patience is not None else None
        )
        trainer.train()
        res = trainer.evaluate(testing)
        for m in keys:
            metrics[m].append(res[m])

    for m in metrics.keys():
        print("{:18s}\t: {:.4} ± {:.4}".format(m, np.mean(metrics[m]), np.std(metrics[m])))
    print("\nOverall F1: {:.4}".format(np.mean(
        [np.mean(metrics["eval_f1"]), np.mean(metrics["eval_f1"])]
    )))
    return trainer

### loading model

In [6]:
from transformers import AutoTokenizer, AutoModel

a = AlBERTo_Preprocessing(do_lower_case=True)
s: str = "#IlGOverno presenta le linee guida sulla scuola #labuonascuola - http://t.co/SYS1T9QmQN"
b = a.preprocess(s)

tok = AutoTokenizer.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
pretrained_model = AutoModel.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
tok.model_max_length = 128 #model.config.max_position_embeddings
tokens = tok.tokenize(b)
print(tokens)

['<', 'ha', '##shtag', '>', 'il', 'governo', '<', '/', 'ha', '##shtag', '>', 'presenta', 'le', 'linee', 'guida', 'sulla', 'scuola', '<', 'ha', '##shtag', '>', 'la', 'buona', 'scuola', '<', '/', 'ha', '##shtag', '>', '<', 'ur', '##l', '>']


### model

In [7]:
class MyNet(nn.Module):
    """
        Attach a FC layer on top of the BERT head in order to produce a classification output.
        Hyperparameters are taken from Alberto.

        The pooled_output output of BERT is basically a projection of the [CLS] embeddings via another FC layer (768 -> 768 hidden units).
        We stack another FC layer with Dropout on top of that, as reported in https://github.com/google-research/bert/blob/eedf5716ce1268e56f0a50264a88cafad334ac61/run_classifier.py#L574
    """
    def __init__(self, num_labels):
        super(MyNet, self).__init__()

        self.num_labels = num_labels
        self.model = copy.deepcopy(pretrained_model)#AutoModel.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, num_labels)

        if self.num_labels >= 2:
            self.loss_fct = nn.CrossEntropyLoss()
        else:
            self.loss_fct = nn.BCEWithLogitsLoss()


    def forward(self, labels, input_ids, attention_mask, **args):
        #For the output format -> https://huggingface.co/transformers/_modules/transformers/models/bert/modeling_bert.html#BertForSequenceClassification.forward
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, **args)
        x = self.dropout(outputs[1])
        logits = self.linear(x)

        loss = self.loss_fct(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

# SENTIMENT ANALYSIS

### dataset

In [8]:
def tokenize_function(examples):
    sa = tok(examples["text"], padding="max_length", truncation=True)
    return sa

def separate2united_labels2(row):
    return str(row["opos"]) + str(row["oneg"])

def process_dataset_task(example, subtask):
    example["labels"] = int(example["labels"][0]) if subtask == "opos" else int(example["labels"][1])
    return example

#train set
dataset = pd.DataFrame({"text": train.text.apply(a.preprocess), "idx": train.index, "labels": train[["opos", "oneg"]].apply(separate2united_labels2, axis=1)})
X_train, X_val = train_test_split(dataset, test_size=0.2, random_state=42, stratify=dataset["labels"])

X_train = Dataset.from_pandas(X_train)
X_val = Dataset.from_pandas(X_val)


##
# Create a Dataset for each subtask (evaluated separately by SentiPolc16).
# To do that, simply take, respectively, the first or the second char of the label for opos and oneg.
##
training_opos = X_train\
                    .map(process_dataset_task, fn_kwargs={"subtask": "opos"}, batched=False)\
                    .map(tokenize_function, batched=True)\
                    .shuffle(seed=42)\
                    .with_format("torch", columns=["input_ids", "attention_mask", "labels", "token_type_ids"])
training_oneg = X_train\
                   .map(process_dataset_task, fn_kwargs={"subtask": "oneg"}, batched=False)\
                   .map(tokenize_function, batched=True)\
                   .shuffle(seed=42)\
                   .with_format("torch", columns=["input_ids", "attention_mask", "labels", "token_type_ids"])
validating_opos = X_val\
                    .map(process_dataset_task, fn_kwargs={"subtask": "opos"}, batched=False)\
                    .map(tokenize_function, batched=True)\
                    .with_format("torch", columns=["input_ids", "attention_mask", "labels", "token_type_ids"])
validating_oneg = X_val\
                   .map(process_dataset_task, fn_kwargs={"subtask": "oneg"}, batched=False)\
                   .map(tokenize_function, batched=True)\
                   .with_format("torch", columns=["input_ids", "attention_mask", "labels", "token_type_ids"])


#test set
dataset = pd.DataFrame({"text": test.text.apply(a.preprocess), "idx": test.index, "labels": test[["opos", "oneg"]].apply(separate2united_labels2, axis=1)})
dataset = Dataset.from_pandas(dataset)

##
# Create a Dataset for each subtask (evaluated separately by SentiPolc16).
# To do that, simply take, respectively, the first or the second char of the label for opos and oneg.
##
testing_opos = dataset.map(process_dataset_task, fn_kwargs={"subtask": "opos"}, batched=False).map(tokenize_function, batched=True).shuffle().with_format("torch", columns=["input_ids", "attention_mask", "labels", "token_type_ids"])
testing_oneg = dataset\
                    .map(process_dataset_task, fn_kwargs={"subtask": "oneg"}, batched=False)\
                    .map(tokenize_function, batched=True)\
                    .with_format("torch", columns=["input_ids", "attention_mask", "labels", "token_type_ids"])

### training loop

In [9]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

class MemorySaverCallback(TrainerCallback):
    "A callback that deleted the folder in which checkpoints are saved, to save memory"
    def __init__(self, run_name):
        super(MemorySaverCallback, self).__init__()
        self.run_name = run_name

    def on_train_begin(self, args, state, control, **kwargs):
        print("Removing dirs...")
        if os.path.isdir(f'./{self.run_name}'):
            import shutil
            shutil.rmtree(f'./{self.run_name}')
        else:
            print("\n\nDirectory does not exists")

In [10]:
TRAIN_BATCH_SIZE = 64 
PREDICT_BATCH_SIZE = 64
EVAL_BATCH_SIZE = 64 
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3
MAX_SEQ_LENGTH = 128
WARMUP_PROPORTION = 0.1
num_train_steps = int(len(training_opos) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)+1
NUM_WARMUP_STEPS = int(NUM_TRAIN_EPOCHS * WARMUP_PROPORTION)

args = TrainingArguments(
            "test_trainer", 
            num_train_epochs=NUM_TRAIN_EPOCHS,
            per_device_train_batch_size=TRAIN_BATCH_SIZE,
            per_device_eval_batch_size=PREDICT_BATCH_SIZE,
            save_total_limit=2,
            learning_rate=LEARNING_RATE,
            warmup_steps=NUM_WARMUP_STEPS,
            weight_decay=0.01,
            adam_beta1=0.9,
            adam_beta2=0.999,
            adam_epsilon=1e-6,
            report_to="none",
            load_best_model_at_end=True
        )

k_steps_evidence_sa(num_iterations=5, 
                 early_stopping_patience=None,
                 training_args=args, 
                 net=MyNet, 
                 training_opos=training_opos, 
                 training_oneg=training_oneg, 
                 validating_opos=validating_opos,
                 validating_oneg=validating_oneg,
                 testing_opos=testing_opos, 
                 testing_oneg=testing_oneg) 

Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


eval_loss_opos    	: 0.3704 ± 0.01362
eval_loss_oneg    	: 0.6451 ± 0.03445
eval_accuracy_opos	: 0.8437 ± 0.007467
eval_accuracy_oneg	: 0.7474 ± 0.006053
eval_f1_opos      	: 0.7468 ± 0.005378
eval_f1_oneg      	: 0.6903 ± 0.01112
eval_precision_opos	: 0.7348 ± 0.009868
eval_precision_oneg	: 0.7893 ± 0.002023
eval_recall_opos  	: 0.7633 ± 0.003462
eval_recall_oneg  	: 0.685 ± 0.009169

Overall F1: 0.7186


# Subjectivity
## dataset

In [12]:
def tokenize_function(examples):
    sa = tok(examples["text"], padding="max_length", truncation=True)
    return sa

#train set
dataset = pd.DataFrame({"text": train.text.apply(a.preprocess), "idx": train.index, "labels": train["subj"].tolist()})
#X_train, X_val = train_test_split(dataset, test_size=0.2, random_state=42, stratify=dataset["labels"])

X_train = Dataset.from_pandas(dataset)
#X_val = Dataset.from_pandas(X_val)


training = X_train\
                .map(tokenize_function, batched=True)\
                .shuffle(seed=42)\
                .with_format("torch", columns=["input_ids", "attention_mask", "labels", "token_type_ids"])
#validating = X_val\
#                .map(tokenize_function, batched=True)\
#                .with_format("torch", columns=["input_ids", "attention_mask", "labels", "token_type_ids"])


#test set
dataset = pd.DataFrame({"text": test.text.apply(a.preprocess), "idx": test.index, "labels": test["subj"].tolist()})
testing = Dataset.from_pandas(dataset)\
                .map(tokenize_function, batched=True)\
                .with_format("torch", columns=["input_ids", "attention_mask", "labels", "token_type_ids"])

In [12]:
# TRAIN_BATCH_SIZE = 64 
# PREDICT_BATCH_SIZE = 64
# EVAL_BATCH_SIZE = 64 
# LEARNING_RATE = 2e-5
# NUM_TRAIN_EPOCHS = 3
# MAX_SEQ_LENGTH = 128
# WARMUP_PROPORTION = 0.1
# num_train_steps = int(len(training) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)+1
# NUM_WARMUP_STEPS = int(NUM_TRAIN_EPOCHS * WARMUP_PROPORTION)

# args = TrainingArguments(
#             "test_trainer", 
#             num_train_epochs=NUM_TRAIN_EPOCHS,
#             per_device_train_batch_size=TRAIN_BATCH_SIZE,
#             per_device_eval_batch_size=PREDICT_BATCH_SIZE,
#             save_total_limit=2,
#             learning_rate=LEARNING_RATE,
#             warmup_steps=NUM_WARMUP_STEPS,
#             weight_decay=0.01,
#             adam_beta1=0.9,
#             adam_beta2=0.999,
#             adam_epsilon=1e-6,
#             report_to="none",
#             load_best_model_at_end=True
#         )

# k_steps_evidence_subj(num_iterations=5, 
#                  early_stopping_patience=None,
#                  training_args=args, 
#                  net=MyNet, 
#                  training=training, 
#                  validating=None,
#                  testing=testing
#                 ) 

RuntimeError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 10.76 GiB total capacity; 9.54 GiB already allocated; 29.94 MiB free; 9.69 GiB reserved in total by PyTorch)

In [13]:
TRAIN_BATCH_SIZE = 64 
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 2
WARMUP_PROPORTION = 0.1
num_train_steps = int(len(training) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)+1
NUM_WARMUP_STEPS = int(NUM_TRAIN_EPOCHS * WARMUP_PROPORTION)

args = TrainingArguments(
            "test_trainer", 
            num_train_epochs=NUM_TRAIN_EPOCHS,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=16,
            save_total_limit=2,
            learning_rate=LEARNING_RATE,
            warmup_steps=NUM_WARMUP_STEPS,
            weight_decay=0.01,
            adam_beta1=0.9,
            adam_beta2=0.999,
            adam_epsilon=1e-6,
            report_to="none",
)
    
trainer = k_steps_evidence_subj(num_iterations=3, 
                             early_stopping_patience=None,
                             training_args=args, 
                             net=MyNet, 
                             training=training, 
                             validating=None,
                             testing=testing
                            ) 

preds , _ , _ = trainer.predict(testing)
print(classification_report(testing["labels"], np.argmax(preds, axis=1), target_names=["objective", "subjective"]))

Step,Training Loss


Step,Training Loss


Step,Training Loss


eval_loss         	: 0.4953 ± 0.01492
eval_accuracy     	: 0.779 ± 0.004708
eval_f1           	: 0.7719 ± 0.003566
eval_precision    	: 0.7727 ± 0.0009315
eval_recall       	: 0.7997 ± 0.0006808

Overall F1: 0.7719
              precision    recall  f1-score   support

   objective       0.65      0.85      0.73       695
  subjective       0.90      0.75      0.82      1305

    accuracy                           0.79      2000
   macro avg       0.77      0.80      0.78      2000
weighted avg       0.81      0.79      0.79      2000

