## Exploratory Analysis



In [1]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd

train_path = "../../data/sentiment/training_set_sentipolc16.csv"
test_path = "../../data/sentiment/test_set_sentipolc16_gold2000.csv"

# Read the SENTIPOLC16 training split (comma-separated, fields wrapped in
# double quotes).
# Tweet ids are identifiers, not quantities: read them as strings instead of
# letting pandas infer an integer dtype, so the column agrees with the string
# ids produced by the csv-based test loader and with the `Value('string')`
# declared for `idtwitter` in the dataset features.
train_df = pd.read_csv(
    train_path,
    sep=",",
    quotechar='"',
    dtype={"idtwitter": str},
)

In [2]:
# Parse the test split with the stdlib csv reader.
# Some rows come back with more than nine fields (the last column can contain
# extra commas); the overflow is merged back into the last column below.
import csv

N_FIELDS = 9  # number of columns every parsed row must end up with

# The context manager closes the file handle (the bare `open()` leaked it).
# newline="" is what the csv module documents for files it parses.
# NOTE(review): utf-8 is assumed here, matching the default pandas uses for
# the training file -- confirm against the data files.
with open(test_path, "r", encoding="utf-8", newline="") as test_file:
    lines = list(test_file)

reader = csv.reader(lines, quotechar='"', delimiter=',',
                    quoting=csv.QUOTE_ALL, skipinitialspace=True)

test_data = []

for row in reader:
    if not row:
        # csv.reader yields an empty list for a blank line; it is not a record.
        continue
    if len(row) < N_FIELDS:
        # Fail with a clear message instead of popping from an empty list.
        raise ValueError(
            f"line {reader.line_num}: expected {N_FIELDS} fields, got {len(row)}"
        )
    # Re-join every field from the ninth onwards with commas; a row that
    # already has exactly nine fields is left unchanged.
    row[N_FIELDS - 1:] = [",".join(row[N_FIELDS - 1:])]
    test_data.append(row)



In [3]:
from collections import Counter

# Sanity check: tally the parsed rows by their number of fields.
Counter(map(len, test_data))

Counter({9: 2000})

In [4]:
# csv.reader returns plain lists, so the column names are supplied here.
TEST_COLUMNS = ["idtwitter", "subj", "opos", "oneg", "iro", "lpos", "lneg", "top", "text"]

# csv.reader yields every field as str. Convert the annotation columns to
# integers so the frame matches the integer-backed ClassLabel / int64 features
# declared for them, instead of relying on an implicit string-to-int
# conversion later. `idtwitter` and `text` stay as strings.
INT_COLUMNS = ["subj", "opos", "oneg", "iro", "lpos", "lneg", "top"]

df_test = pd.DataFrame(test_data, columns=TEST_COLUMNS).astype(
    {column: "int64" for column in INT_COLUMNS}
)

In [5]:
from sklearn.model_selection import train_test_split


# Carve a dev split out of the training frame.
# NOTE: this rebinds `train_df`, so the cell is not idempotent -- running it a
# second time splits the already-reduced training frame again.
train_df, dev_df = train_test_split(
    train_df,
    random_state=2022,  # fixed seed so the split is reproducible
    test_size=0.2,      # hold out 20% of the rows as dev
)

In [6]:
from datasets import DatasetDict, Dataset, Value, Features, ClassLabel

# Schema shared by the three splits: string id/text, one binary ClassLabel per
# annotation column, and an integer `top` column.
# The positive-class names of `opos`/`oneg` previously read "obj,  positive" /
# "obj,  negative" (comma + double space); they now follow the "obj. ..."
# pattern of their negative-class siblings.
# NOTE(review): the linked guidelines presumably define opos/oneg as *overall*
# polarity -- confirm whether "obj." is the intended abbreviation.
features = Features({
    'idtwitter': Value('string'),
    'text': Value('string'),
    'subj': ClassLabel(names=["objective", "subjective"]),
    'opos': ClassLabel(names=["obj. non positive", "obj. positive"]),
    "oneg": ClassLabel(names=["obj. non negative", "obj. negative"]),
    "iro": ClassLabel(names=["non ironic", "ironic"]),
    "lpos": ClassLabel(names=["lit. non positive", "lit. positive"]),
    "lneg": ClassLabel(names=["lit. non negative", "lit. negative"]),
    "top": Value('int64'),
})

# Convert every frame with the same schema and options in one place instead of
# repeating the call three times.
split_frames = {"train": train_df, "dev": dev_df, "test": df_test}
splits = {
    split_name: Dataset.from_pandas(frame, features=features, preserve_index=False)
    for split_name, frame in split_frames.items()
}

# Keep the per-split names available, as before.
train, dev, test = splits["train"], splits["dev"], splits["test"]

ds = DatasetDict(splits)

See the [SENTIPOLC 2016 task guidelines](http://www.di.unito.it/~tutreeb/sentipolc-evalita16/sentipolc-guidelines2016UPDATED130916.pdf) for details.

In [12]:
from pysentimiento.sentipolc import load_datasets

# Rebind `ds` to the splits returned by the library loader; this replaces the
# DatasetDict assembled by hand earlier in the notebook.
ds = load_datasets()

# Display the splits, their columns and row counts.
ds



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/1482 [00:00<?, ?ex/s]

  0%|          | 0/5928 [00:00<?, ?ex/s]

  0%|          | 0/2000 [00:00<?, ?ex/s]

DatasetDict({
    dev: Dataset({
        features: ['idtwitter', 'text', 'subj', 'opos', 'oneg', 'iro', 'lpos', 'lneg', 'top', 'labels'],
        num_rows: 1482
    })
    train: Dataset({
        features: ['idtwitter', 'text', 'subj', 'opos', 'oneg', 'iro', 'lpos', 'lneg', 'top', 'labels'],
        num_rows: 5928
    })
    test: Dataset({
        features: ['idtwitter', 'text', 'subj', 'opos', 'oneg', 'iro', 'lpos', 'lneg', 'top', 'labels'],
        num_rows: 2000
    })
})

In [13]:
from pysentimiento.training import train_and_eval
from pysentimiento.tuning import get_training_arguments

# Hugging Face Hub id of the Italian BERT checkpoint that gets fine-tuned.
model_name = "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"

# Training arguments for this model/task/language; model selection is driven
# by macro F1.
training_args = get_training_arguments(
    model_name,
    task_name="sentiment",
    lang="it",
    use_defaults_if_not_tuned=True,
    metric_for_best_model="macro_f1",
)

# Fine-tune on `ds` and evaluate; returns the trainer and the test-set results.
trainer, test_results = train_and_eval(
    model_name,
    ds,
    id2label=["pos", "neg"],
    lang="it",
    training_args=training_args,
)



In [14]:
# Show the metrics of the test-set evaluation returned by `train_and_eval`.
test_results.metrics

{'test_loss': 0.5472076535224915,
 'test_pos_f1': 0.5400516795865633,
 'test_pos_precision': 0.495260663507109,
 'test_pos_recall': 0.59375,
 'test_neg_f1': 0.625,
 'test_neg_precision': 0.8158995815899581,
 'test_neg_recall': 0.5064935064935064,
 'test_emr': 0.6425,
 'test_macro_f1': 0.5825258493423462,
 'test_macro_precision': 0.6555801630020142,
 'test_macro_recall': 0.5501217842102051,
 'test_runtime': 3.7927,
 'test_samples_per_second': 527.325,
 'test_steps_per_second': 16.611}

In [15]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint straight from its Hub id with a sequence-classification head.
# NOTE(review): this is the published base checkpoint, not the model held by
# `trainer` from the fine-tuning cell -- confirm that is what is wanted here.
model = AutoModelForSequenceClassification.from_pretrained(
    "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
)

loading configuration file config.json from cache at /users/jmperez/.cache/huggingface/hub/models--m-polignano-uniba--bert_uncased_L-12_H-768_A-12_italian_alb3rt0/snapshots/4454cfbc82952da79729e33e81c37a72dc095b4b/config.json
Model config BertConfig {
  "_name_or_path": "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_ve