In [1]:
%env WANDB_PROJECT=nlu_sentiment_analysis
!wandb login 2cad8a8279143c69ce071f54bf37c1f5a5f4e5ff
import wandb

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import itertools
import requests, re, string, datetime, copy
from functools import partial

import torch
import torchvision.transforms as T, torch.nn.functional as F, torch.nn as nn
from torch.utils.data import DataLoader

from datasets import Dataset
from transformers import TrainingArguments, EarlyStoppingCallback, TrainerCallback
from transformers import Trainer, get_linear_schedule_with_warmup
from transformers.modeling_outputs import SequenceClassifierOutput

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score


PATH = "./data/Sentipolc16/"

env: WANDB_PROJECT=nlu_sentiment_analysis
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/steve/.netrc


In [None]:
# Read the SENTIPOLC16 training split from the data directory and preview it.
train_csv_path = PATH + "training_set_sentipolc16.csv"
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# The gold test file is parsed by hand: fields are separated by `",` and the
# tweet text (last field) may itself contain that separator, which would
# produce more pieces than there are columns.
FIELD_SEP = "\","
N_INT_COLS = 8  # number of leading integer columns; the tweet text follows them

test = []
# `with` guarantees the handle is closed; the encoding is explicit so the
# result does not depend on the platform default.
with open(PATH + "test_set_sentipolc16_gold2000.csv", "r", encoding="utf-8") as test_file:
    for line_no, line in enumerate(test_file, 1):
        if not line.strip():
            continue  # ignore blank lines instead of crashing on them
        arr = line.split(FIELD_SEP)
        if len(arr) <= N_INT_COLS:
            raise ValueError(f"line {line_no}: expected at least {N_INT_COLS + 1} fields, got {len(arr)}")
        row = [int(field.strip("\"")) for field in arr[:N_INT_COLS]]
        # Re-join every remaining piece WITH the separator, so a tweet that
        # contains the delimiter character is restored exactly, however many
        # times the delimiter occurs in it.
        row.append(FIELD_SEP.join(arr[N_INT_COLS:]))
        test.append(row)

test = pd.DataFrame(test, columns=train.columns)
test.head()

In [None]:
"""Tokenization classes for Italian AlBERTo models."""
import collections
import os


def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file.

    Returns an ``OrderedDict`` mapping each token (trailing newline removed)
    to its zero-based line index. If a token appears more than once it keeps
    its first position in the ordering but maps to its last index.
    """
    with open(vocab_file, "r", encoding="utf-8") as handle:
        return collections.OrderedDict(
            (raw_line.rstrip("\n"), line_index)
            for line_index, raw_line in enumerate(handle)
        )

# Shared ekphrasis pipeline used by AlBERTo_Preprocessing.preprocess.
text_processor = TextPreProcessor(
    # tokenizer: any callable taking a string and returning a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # emoticon dictionary handed to the processor
    dicts=[emoticons],
    # repair HTML tokens
    fix_html=True,
    # run word segmentation on hashtags
    unpack_hashtags=True,
    # term types that get annotated
    annotate={"hashtag"},
    # term types that get normalized
    normalize=['url', 'email', 'user', 'percent', 'money', 'phone', 'time', 'date', 'number'],
)

class AlBERTo_Preprocessing(object):
    """Text normalizer applied to tweets before tokenization.

    The text is optionally lower-cased, passed through the module-level
    ``text_processor`` and then cleaned with a fixed sequence of regex
    substitutions.
    """

    # (pattern, replacement) pairs, applied strictly in this order.
    _CLEANUP_STEPS = (
        # replace every character outside the allowed set with a space
        (r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' '),
        # collapse whitespace runs into a single space
        (r'\s+', ' '),
        # shorten a word character repeated 3+ times to exactly two
        (r'(\w)\1{2,}', r'\1\1'),
        # drop one leading / one trailing whitespace character
        (r'^\s', ''),
        (r'\s$', ''),
    )

    def __init__(self, do_lower_case=True, **kwargs):
        # Extra keyword arguments are accepted and ignored.
        self.do_lower_case = do_lower_case

    def preprocess(self, text):
        """Return the cleaned version of ``text``."""
        source = text.lower() if self.do_lower_case else text
        cleaned = " ".join(text_processor.pre_process_doc(source))
        for pattern, replacement in self._CLEANUP_STEPS:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned

# Quick demonstration of the preprocessing on one sample tweet.
# `a` is the preprocessor instance reused by later cells (via `a.preprocess`).
a = AlBERTo_Preprocessing(do_lower_case=True)
s = "#IlGOverno presenta le linee guida sulla scuola #labuonascuola - http://t.co/SYS1T9QmQN"
b = a.preprocess(s)
print(b)

In [None]:
from transformers import AutoTokenizer, AutoModel

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level ``tok``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; whatever it returns is passed back unchanged.
    """
    return tok(examples["text"], padding="max_length", truncation=True)

# Re-create the preprocessor and the sample sentence so this cell does not
# depend on the demo at the end of the previous cell having been run.
a = AlBERTo_Preprocessing(do_lower_case=True)
s: str = "#IlGOverno presenta le linee guida sulla scuola #labuonascuola - http://t.co/SYS1T9QmQN"
b = a.preprocess(s)

# Tokenizer and encoder weights of the AlBERTo checkpoint. `tok` is the global
# read by tokenize_function; `pretrained_model` is deep-copied by the
# classifier classes defined further down.
tok = AutoTokenizer.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
pretrained_model = AutoModel.from_pretrained("m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0")
tok.model_max_length = 128 # fixed cap; the alternative considered was model.config.max_position_embeddings
# Show how the preprocessed sample sentence is split into tokens.
tokens = tok.tokenize(b)
print(tokens)

In [None]:
class MyNetMC(nn.Module):
    """
        Attach a FC layer on top of the BERT head in order to produce a classification output.

        The pooled_output output of BERT is basically a projection of the [CLS] embeddings via another FC layer (hidden -> hidden units).
        We stack another FC layer with Dropout on top of that, as reported in https://github.com/google-research/bert/blob/eedf5716ce1268e56f0a50264a88cafad334ac61/run_classifier.py#L574

        Args:
            num_labels: number of output classes; this is the width of the final FC layer.
    """
    def __init__(self, num_labels):
        super(MyNetMC, self).__init__()

        self.num_labels = num_labels
        # Private copy of the globally loaded encoder, so that training this
        # network never modifies `pretrained_model` itself.
        self.model = copy.deepcopy(pretrained_model)
        self.dropout1 = nn.Dropout(0.1)
        # Size the head from the encoder config and from `num_labels` instead
        # of the previously hard-coded (768, 3), which silently ignored the
        # constructor argument.
        self.linear1 = nn.Linear(self.model.config.hidden_size, num_labels)

        self.loss_fct = nn.CrossEntropyLoss()


    def forward(self, labels, input_ids, attention_mask, **args):
        """
            Run the encoder and the classification head.

            Any extra keyword argument (e.g. token_type_ids) is forwarded to the encoder.

            Returns:
                (logits, loss): the class scores produced by the head and the
                cross-entropy loss of those scores against `labels`.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, **args)
        # outputs[1] is used as the pooled output described in the class docstring.
        x = self.dropout1(outputs[1])
        logits = self.linear1(x)
        loss = self.loss_fct(logits, labels)
        return logits , loss
    
class MyNetMCTuned(nn.Module):
    """
        Attach a FC layer on top of the BERT head in order to produce a classification output.

        The pooled_output output of BERT is basically a projection of the [CLS] embeddings via another FC layer (hidden -> hidden units).
        We stack another FC layer without Dropout on top of that, as reported in https://github.com/google-research/bert/blob/eedf5716ce1268e56f0a50264a88cafad334ac61/run_classifier.py#L574

        Args:
            num_labels: number of output classes; this is the width of the final FC layer.
    """
    def __init__(self, num_labels):
        super(MyNetMCTuned, self).__init__()

        self.num_labels = num_labels
        # Private copy of the globally loaded encoder, so that training this
        # network never modifies `pretrained_model` itself.
        self.model = copy.deepcopy(pretrained_model)
        # Size the head from the encoder config and from `num_labels` instead
        # of the previously hard-coded (768, 3).
        self.linear1 = nn.Linear(self.model.config.hidden_size, num_labels)

        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, labels, input_ids, attention_mask, **args):
        """
            Run the encoder and the classification head (no dropout).

            Any extra keyword argument (e.g. token_type_ids) is forwarded to the encoder.

            Returns:
                (logits, loss): the class scores produced by the head and the
                cross-entropy loss of those scores against `labels`.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, **args)
        # The head is registered as `linear1`; the former `self.linear`
        # reference raised AttributeError on every forward pass.
        logits = self.linear1(outputs[1])
        loss = self.loss_fct(logits, labels)
        return logits , loss
    
class EarlyStopping():
    """
        Stop training when a monitored score (higher is better) stops improving.

        Args:
            min_delta: the score must exceed the best value seen so far by more
                than this amount to count as an improvement.
            patience: number of consecutive non-improving epochs that are
                tolerated; training stops on the epoch after that.
    """
    def __init__(self, min_delta = 0, patience = 0):
        self.min_delta = min_delta
        self.patience = patience
        self.wait = 0            # consecutive epochs without improvement
        self.stopped_epoch = 0   # epoch at which the stop was triggered
        # `np.Inf` was removed in NumPy 2.0; `np.inf` works on every version.
        self.best = -np.inf
        self.stop_training = False

    def on_epoch_end(self, epoch, current_value):
        """
            Record the score of `epoch` and return True once training should stop.
        """
        if np.greater((current_value - self.min_delta), self.best):
            self.best = current_value
            self.wait = 0
        else:
            self.wait += 1
            if self.wait > self.patience:
                self.stopped_epoch = epoch
                self.stop_training = True
        return self.stop_training


# ---- Training configuration ----
TRAIN_BATCH_SIZE = 64
PREDICT_BATCH_SIZE = 64
EVAL_BATCH_SIZE = 64
WEIGHT_DECAY = 0.01
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
MAX_SEQ_LENGTH = 128
WARMUP_PROPORTION = 0.1   # fraction of the training steps used for warm-up
RUN_NAME = "test_trainer"
# Use the first GPU when there is one and fall back to the CPU otherwise,
# instead of failing later on `.to("cuda:0")`.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# NOTE(review): `training` is only created in the SENTIPOLC16 dataset cell
# further down, so on a fresh kernel (Restart & Run All) the next line raises
# NameError. These two assignments belong after that cell. They are kept last
# here so that every constant above is defined even when this line fails.
num_train_steps = int(len(training) / TRAIN_BATCH_SIZE * NUM_EPOCHS)+1
NUM_WARMUP_STEPS =  int(num_train_steps * WARMUP_PROPORTION)

In [None]:
def train_epoch(model, train_loader, optimizer, scheduler, epoch, logging):
    """
        Run one optimisation pass over `train_loader`.

        Args:
            model: called as `model(**batch)`; must return `(logits, loss)`.
            train_loader: iterable of dict batches containing a "labels" tensor.
            optimizer, scheduler: both are stepped once per batch.
            epoch: only used in the progress message.
            logging: when truthy, print a running loss every 25 batches and
                send the epoch loss to wandb under the "train" key.

        Returns:
            (outputs, targets): predicted class ids and the matching labels,
            in iteration order.
    """
    model.train()
    targets = []
    outputs = []
    cumulative_loss = 0.
    num_batches = 0
    for i , data in tqdm(enumerate(train_loader, 0), total=len(train_loader)):
        targets.extend(data["labels"].numpy())

        batch = {k: v.to(device) for k, v in data.items()}
        logits , loss = model(**batch)

        cumulative_loss += loss.detach()
        num_batches = i + 1
        if num_batches % 25 == 0 and logging:
            # Divide by the number of batches seen so far (i + 1); dividing by
            # the zero-based index `i` overstated the running mean.
            print(f'Epoch: {epoch}, Loss:  {cumulative_loss.item()/num_batches}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        outputs.extend(logits.argmax(-1).cpu().detach().numpy().tolist())
    if logging:
        # One loss value is accumulated per batch, so the epoch mean is taken
        # over batches (the same quantity as the running print above), not
        # over samples. float() also accepts the initial 0. when the loader is
        # empty, and max(..., 1) avoids a division by zero in that case.
        wandb.log({"train": {'loss': float(cumulative_loss) / max(num_batches, 1)}})
    return outputs, targets
    
        
def validation_epoch(model, epoch, val_loader, kind, logging):
    """
        Evaluate `model` on `val_loader` without computing gradients.

        Args:
            model: called as `model(**batch)`; must return `(logits, loss)`.
            epoch: only used in the progress message.
            val_loader: iterable of dict batches containing a "labels" tensor.
            kind: key under which the loss is sent to wandb (e.g. "test").
            logging: when truthy, print a running loss every 25 batches and
                send the mean loss to wandb.

        Returns:
            (outputs, targets): predicted class ids and the matching labels,
            in iteration order.
    """
    model.eval()
    targets = []
    outputs = []
    cumulative_loss = 0.
    num_batches = 0
    with torch.no_grad():
        # The batch index must be bound to a name: the loop previously used
        # `_` while the body read `i`, which raised NameError (or silently
        # picked up a stale global `i`).
        for i, data in enumerate(val_loader, 0):
            batch = {k: v.to(device) for k, v in data.items()}
            logits , loss = model(**batch)
            cumulative_loss += loss.detach()
            num_batches = i + 1
            if num_batches % 25 == 0 and logging:
                print(f'Epoch: {epoch}, Loss:  {cumulative_loss.item()/num_batches}')

            targets.extend(batch["labels"].cpu().detach().numpy())
            outputs.extend(logits.argmax(-1).cpu().detach().numpy().tolist())
    if logging:
        # One loss value is accumulated per batch, so the mean is taken over
        # batches, not over samples. float() also accepts the initial 0. when
        # the loader is empty, and max(..., 1) avoids a division by zero.
        wandb.log({kind: {'loss': float(cumulative_loss) / max(num_batches, 1)}})
    return outputs, targets

# SENTIPOLC16

In [None]:
def separate2united_labels(row):
    """
        Collapse the two polarity flags of a tweet into one integer class.

        (opos, oneg) -> label
        (0, 1) -> 0  Negative
        (0, 0) -> 1  Neutral
        (1, 0) -> 2  Positive
        anything else -> 3  Mixed
    """
    polarity_to_label = {
        (0, 0): 1,
        (1, 0): 2,
        (0, 1): 0,
    }
    return polarity_to_label.get((row["opos"], row["oneg"]), 3)

# ---- train / validation splits ----
dataset = pd.DataFrame({
    "text": train.text.apply(a.preprocess),
    "idx": train.index,
    "labels": train[["opos", "oneg"]].apply(separate2united_labels, axis=1),
})
# Stratified 80/20 split on the united label.
X_train, X_val = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
    stratify=dataset["labels"],
)

X_train = Dataset.from_pandas(X_train)
X_val = Dataset.from_pandas(X_val)

# Tokenize, discard the "mixed" class (label 3) and expose torch tensors.
# Only the training split is shuffled.
training = (
    X_train
    .map(tokenize_function, batched=True)
    .filter(lambda example: example['labels'] != 3)
    .shuffle(seed=42)
    .with_format("torch")
)
validating = (
    X_val
    .map(tokenize_function, batched=True)
    .filter(lambda example: example['labels'] != 3)
    .with_format("torch")
)


# ---- test split ----
dataset = pd.DataFrame({
    "text": test.text.apply(a.preprocess),
    "idx": test.index,
    "labels": test[["opos", "oneg"]].apply(separate2united_labels, axis=1),
})
dataset = Dataset.from_pandas(dataset)

testing = (
    dataset
    .map(tokenize_function, batched=True)
    .filter(lambda example: example['labels'] != 3)
    .with_format("torch")
)

### AlBERTo pretrained

In [None]:
# Un-shuffled loader over the test split, restricted to the tensors the model consumes.
test_loader = DataLoader(testing.with_format("torch", columns=["input_ids", "attention_mask", "labels", "token_type_ids"]), batch_size=PREDICT_BATCH_SIZE)

# The `validation_epoch(model, ...)` call that used to sit here was removed:
# no `model` exists at this point on a fresh kernel (NameError on Run All),
# and its `preds` / `trues` were overwritten by the next cell before any use.

# Pandas view of the same test rows (mixed class dropped), used to print the
# text of the misclassified tweets next to the predictions.
dataset_test = pd.DataFrame({"text": test.text.apply(a.preprocess), "idx": test.index, "labels": test[["opos", "oneg"]].apply(separate2united_labels, axis=1)})
dataset_test = dataset_test[dataset_test.labels != 3]

In [None]:
#load best model's params
model = MyNetMCTuned(3).to(device)
# map_location lets a checkpoint saved on GPU be restored on whatever `device` is.
model.load_state_dict(torch.load("data/models/alberto_multiclass.pt", map_location=device))

preds , trues = validation_epoch(model, None, test_loader, "test", logging=False)

# The predictions are matched to the dataframe rows by position, so check the
# alignment for EVERY row (it used to be checked only for misclassified ones).
assert len(dataset_test) == len(preds) == len(trues)
for n , (_ , row) in enumerate(dataset_test.iterrows()):
    assert row['labels'] == trues[n]
    if preds[n] != trues[n]:
        print(f"{row['text']} ------ true={row['labels']} pred={preds[n]}  \n")

print(classification_report(trues, preds, target_names=["negative", "neutral", "positive"]))

# `normalize` takes 'true' / 'pred' / 'all' / None, not a boolean.
# NOTE(review): 'true' (each row sums to 1) is assumed to be the intended
# mode; confirm. Normalized cells are floats, so they cannot be rendered with
# the integer format "d".
cm = confusion_matrix(trues, preds, normalize="true")
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["negative", "neutral", "positive"]).plot(values_format=".2f")
plt.title("Sentipolc16 - AlBERToMC");

### AlBERTo fine-tuned

# FEEL-IT