## Exploratory Analysis



In [1]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd

# Locations of the SENTIPOLC16 CSV files, relative to this notebook.
train_path = "../../data/sentiment/training_set_sentipolc16.csv"
test_path = "../../data/sentiment/test_set_sentipolc16_gold2000.csv"

# Read the SENTIPOLC16 training data.
# Fields are separated by commas and wrapped in double quotes, so both are
# spelled out explicitly.
# `idtwitter` is pinned to str: it is an identifier, not a quantity, the test
# rows parsed with csv.reader hold it as str, and the `features` schema used
# later declares it as Value('string'). Leaving the dtype to pandas' inference
# would make the frame depend on an implicit cast when it is converted.
train_df = pd.read_csv(
    train_path,
    sep=",",
    quotechar='"',
    dtype={"idtwitter": str},
)

In [2]:
# Try with csv reader
import csv


# Number of fields every parsed row must end up with.
N_COLUMNS = 9

# Read the raw lines inside a context manager so the file handle is closed
# deterministically, and decode as UTF-8 explicitly (the default pandas uses
# for the training file) instead of depending on the platform locale.
with open(test_path, "r", encoding="utf-8") as test_file:
    lines = list(test_file)

reader = csv.reader(
    lines,
    quotechar='"',
    delimiter=',',
    quoting=csv.QUOTE_ALL,
    skipinitialspace=True,
)

test_data = []

for row in reader:
    if not row:
        # csv.reader yields an empty list for a blank line; there is nothing
        # to keep, and the merge below cannot repair it.
        continue
    if len(row) < N_COLUMNS:
        # Merging can only shrink a row, so a short row is unrecoverable.
        # Fail with the location instead of popping from an empty list.
        raise ValueError(
            f"{test_path}: record ending at line {reader.line_num} has "
            f"{len(row)} fields, expected at least {N_COLUMNS}"
        )
    if len(row) > N_COLUMNS:
        # Surplus trailing fields are glued back into the last column with
        # commas (same result as repeatedly joining the last two fields).
        # NOTE(review): skipinitialspace=True drops any blank that followed a
        # comma, so the rejoined text may differ from the raw line there.
        row[N_COLUMNS - 1:] = [",".join(row[N_COLUMNS - 1:])]
    test_data.append(row)


In [3]:
from collections import Counter
# Sanity check: tally the parsed rows by their number of fields.
Counter(map(len, test_data))

Counter({9: 2000})

In [4]:
# Column names, in order, for the rows collected in `test_data`.
test_columns = ["idtwitter", "subj", "opos", "oneg", "iro", "lpos", "lneg", "top", "text"]

# csv.reader returns every field as str. The annotation flags and `top` are
# declared as ClassLabel / int64 in the `features` schema, so cast them to
# int64 here; without this, comparisons such as `df_test["opos"] == 1` would
# silently match nothing. `idtwitter` and `text` stay as strings.
numeric_columns = ["subj", "opos", "oneg", "iro", "lpos", "lneg", "top"]

df_test = pd.DataFrame(test_data, columns=test_columns).astype(
    {column: "int64" for column in numeric_columns}
)

In [5]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the training rows as a dev set; the fixed random_state keeps
# the split reproducible.
# NOTE: `train_df` is rebound to the remaining 80%, so re-running this cell
# without re-reading the CSV splits the already reduced frame again.
split_frames = train_test_split(train_df, random_state=2022, test_size=0.2)
train_df, dev_df = split_frames

In [6]:
from datasets import DatasetDict, Dataset, Value, Features, ClassLabel

# Schema shared by the three splits. Every annotation flag is a binary
# ClassLabel whose first name corresponds to value 0 and second to value 1.
# The positive-class names of `opos` / `oneg` previously read "obj,  positive"
# and "obj,  negative" (comma + double space); they now follow the same
# "<prefix>. <polarity>" pattern as every other label name in this schema.
features = Features({
    'idtwitter': Value('string'),
    'text': Value('string'),
    'subj': ClassLabel(num_classes=2, names=["objective", "subjective"]),
    'opos': ClassLabel(num_classes=2, names=["obj. non positive", "obj. positive"]),
    "oneg": ClassLabel(num_classes=2, names=["obj. non negative", "obj. negative"]),
    "iro": ClassLabel(num_classes=2, names=["non ironic", "ironic"]),
    "lpos": ClassLabel(num_classes=2, names=["lit. non positive", "lit. positive"]),
    "lneg": ClassLabel(num_classes=2, names=["lit. non negative", "lit. negative"]),
    "top": Value('int64'),
})

# Convert each pandas frame with the shared schema. preserve_index=False keeps
# the pandas index (shuffled by the train/dev split) out of the datasets.
train = Dataset.from_pandas(train_df, features=features, preserve_index=False)
dev = Dataset.from_pandas(dev_df, features=features, preserve_index=False)
test = Dataset.from_pandas(df_test, features=features, preserve_index=False)

# Bundle the splits under the names "train", "dev" and "test".
ds = DatasetDict(
    train=train,
    dev=dev,
    test=test
)

See the SENTIPOLC 2016 task guidelines for the definition of each annotation field: http://www.di.unito.it/~tutreeb/sentipolc-evalita16/sentipolc-guidelines2016UPDATED130916.pdf

In [7]:
from pysentimiento.sentipolc import load_datasets

# Load the SENTIPOLC16 splits through pysentimiento's own loader.
# NOTE: this rebinds `ds`, so the DatasetDict assembled by hand in the
# previous cell is no longer referenced from here on.
ds = load_datasets()

# Bare expression so the notebook displays the DatasetDict summary.
ds

Using custom data configuration pysentimiento--it_sentipolc16-8c8db12a2ed2bd1e
Found cached dataset parquet (/users/jmperez/.cache/huggingface/datasets/pysentimiento___parquet/pysentimiento--it_sentipolc16-8c8db12a2ed2bd1e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /users/jmperez/.cache/huggingface/datasets/pysentimiento___parquet/pysentimiento--it_sentipolc16-8c8db12a2ed2bd1e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-0a1d8f889d9b04fd.arrow
Loading cached processed dataset at /users/jmperez/.cache/huggingface/datasets/pysentimiento___parquet/pysentimiento--it_sentipolc16-8c8db12a2ed2bd1e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-11aa6dec258a6c5d.arrow
Loading cached processed dataset at /users/jmperez/.cache/huggingface/datasets/pysentimiento___parquet/pysentimiento--it_sentipolc16-8c8db12a2ed2bd1e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a2324e4886af8dca.arrow


DatasetDict({
    dev: Dataset({
        features: ['idtwitter', 'text', 'subj', 'opos', 'oneg', 'iro', 'lpos', 'lneg', 'top'],
        num_rows: 1482
    })
    train: Dataset({
        features: ['idtwitter', 'text', 'subj', 'opos', 'oneg', 'iro', 'lpos', 'lneg', 'top'],
        num_rows: 5928
    })
    test: Dataset({
        features: ['idtwitter', 'text', 'subj', 'opos', 'oneg', 'iro', 'lpos', 'lneg', 'top'],
        num_rows: 2000
    })
})

In [8]:
import torch 

def _polarity_labels(example):
    """Build the `labels` entry for a single example.

    Returns a dict whose "labels" value is a tensor holding the example's
    `opos` and `oneg` flags, in that order.
    """
    flags = [example["opos"], example["oneg"]]
    return {"labels": torch.Tensor(flags)}


# Add the `labels` column to the dataset, one example at a time.
ds = ds.map(_polarity_labels, batched=False)

Loading cached processed dataset at /users/jmperez/.cache/huggingface/datasets/pysentimiento___parquet/pysentimiento--it_sentipolc16-8c8db12a2ed2bd1e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-d008cb808199df88.arrow
Loading cached processed dataset at /users/jmperez/.cache/huggingface/datasets/pysentimiento___parquet/pysentimiento--it_sentipolc16-8c8db12a2ed2bd1e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-0183ffeb7f69a3ef.arrow
Loading cached processed dataset at /users/jmperez/.cache/huggingface/datasets/pysentimiento___parquet/pysentimiento--it_sentipolc16-8c8db12a2ed2bd1e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-49b2b6a32026d0a2.arrow


In [10]:
from pysentimiento.training import train_and_eval
from pysentimiento.tuning import get_training_arguments

# Identifier of the pretrained checkpoint handed to the training helpers.
model_name = "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"

# Training arguments for the Italian sentiment task; macro F1 is the metric
# used to pick the best model.
training_args = get_training_arguments(
    model_name,
    task_name="sentiment",
    lang="it",
    use_defaults_if_not_tuned=True,
    metric_for_best_model="macro_f1",
)

# The label order ("pos", "neg") mirrors the order of the `labels` vector
# built from `opos` and `oneg`.
trainer, test_results = train_and_eval(
    model_name,
    ds,
    id2label=["pos", "neg"],
    lang="it",
    training_args=training_args,
)



In [11]:
# Display the full PredictionOutput returned by train_and_eval
# (raw predictions, label_ids and the metrics dict).
test_results

PredictionOutput(predictions=array([[-4.4218407 , -5.0760736 ],
       [-3.871193  , -5.0263996 ],
       [-4.6962266 , -4.291049  ],
       ...,
       [-1.8943374 ,  3.911604  ],
       [-4.185651  ,  3.0266716 ],
       [ 0.17544602,  0.6613426 ]], dtype=float32), label_ids=array([[0., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32), metrics={'test_loss': 0.5763788223266602, 'test_pos_f1': 0.5150421179302045, 'test_pos_precision': 0.44676409185803756, 'test_pos_recall': 0.6079545454545454, 'test_neg_f1': 0.6456692913385826, 'test_neg_precision': 0.82, 'test_neg_recall': 0.5324675324675324, 'test_emr': 0.624, 'test_macro_f1': 0.580355703830719, 'test_macro_precision': 0.6333820223808289, 'test_macro_recall': 0.5702110528945923, 'test_runtime': 3.7992, 'test_samples_per_second': 526.424, 'test_steps_per_second': 16.582})