## Exploratory Analysis



In [45]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd

# Locations of the SENTIPOLC16 CSV files, relative to this notebook
train_path = "../../data/sentiment/training_set_sentipolc16.csv"
test_path = "../../data/sentiment/test_set_sentipolc16_gold2000.csv"

# Load the training split of SENTIPOLC16.
# Fields are comma-separated and may be wrapped in double quotes.
csv_format = dict(sep=",", quotechar='"')
train_df = pd.read_csv(train_path, **csv_format)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
# Try with csv reader
import csv


# Number of fields every parsed row must end up with
# (matches the nine column names used when building the test DataFrame).
EXPECTED_FIELDS = 9

# Read the raw lines inside a context manager so the file handle is closed,
# and decode explicitly as UTF-8 instead of relying on the platform locale.
with open(test_path, "r", encoding="utf-8") as test_file:
    lines = list(test_file)

reader = csv.reader(lines, quotechar='"', delimiter=',',
                    quoting=csv.QUOTE_ALL, skipinitialspace=True)

test_data = []

for line_number, row in enumerate(reader, start=1):
    if len(row) < EXPECTED_FIELDS:
        # The previous pop/append loop crashed with an opaque IndexError here;
        # fail loudly and point at the offending line instead.
        raise ValueError(
            f"{test_path}, line {line_number}: expected at least "
            f"{EXPECTED_FIELDS} fields, got {len(row)}"
        )
    # Commas inside the last field split it into several fields;
    # glue everything from the last expected position onwards back together.
    row[EXPECTED_FIELDS - 1:] = [",".join(row[EXPECTED_FIELDS - 1:])]
    test_data.append(row)


In [47]:
from collections import Counter
# Histogram of the number of fields per parsed test row (sanity check)
Counter(map(len, test_data))

Counter({9: 2000})

In [48]:
# Column names for the parsed test rows, in the order the fields appear in each row
test_columns = [
    "idtwitter",
    "subj",
    "opos",
    "oneg",
    "iro",
    "lpos",
    "lneg",
    "top",
    "text",
]
df_test = pd.DataFrame(test_data, columns=test_columns)

In [53]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the training data as a dev set (fixed seed for reproducibility).
# NOTE(review): this rebinds `train_df`, so re-running the cell without reloading
# the CSV shrinks the training set again.
split_frames = train_test_split(
    train_df,
    test_size=0.2,
    random_state=2022,
)
train_df, dev_df = split_frames

In [58]:
# For each declared feature name, report whether the training frame has that column.
# NOTE(review): `features` is assigned in a later cell of this notebook, so this
# cell fails on a fresh kernel unless that cell has been run first.
train_columns = train_df.columns
{(name, name in train_columns) for name in features}

{('idtwitter', True),
 ('iro', True),
 ('lneg', True),
 ('lpos', True),
 ('oneg', True),
 ('opos', True),
 ('subj', True),
 ('text', True),
 ('top', True)}

In [61]:
# Feature names that have no matching column in the training frame
# (an empty set means every declared feature is present).
set(features).difference(train_df.columns)

set()

In [64]:
from datasets import DatasetDict, Dataset, Value, Features, ClassLabel

# Schema shared by the three splits: the tweet id and text are strings, the six
# annotation columns are binary class labels, and `top` is kept as an integer.
# For each ClassLabel, index 0 maps to the first name and index 1 to the second.
features = Features({
    'idtwitter': Value('string'),
    'text': Value('string'),
    'subj': ClassLabel(num_classes=2, names=["objective", "subjective"]),
    # Label names fixed: they previously read "obj,  positive" / "obj,  negative"
    # (comma + double space), inconsistent with the "obj. " / "lit. " naming used
    # by every other label in this schema.
    'opos': ClassLabel(num_classes=2, names=["obj. non positive", "obj. positive"]),
    "oneg": ClassLabel(num_classes=2, names=["obj. non negative", "obj. negative"]),
    "iro": ClassLabel(num_classes=2, names=["non ironic", "ironic"]),
    "lpos": ClassLabel(num_classes=2, names=["lit. non positive", "lit. positive"]),
    "lneg": ClassLabel(num_classes=2, names=["lit. non negative", "lit. negative"]),
    "top": Value('int64'),
})

# Convert each pandas frame to a Dataset with the shared schema; the pandas
# index is dropped so it does not become an extra column.
train = Dataset.from_pandas(train_df, features=features, preserve_index=False)
dev = Dataset.from_pandas(dev_df, features=features, preserve_index=False)
test = Dataset.from_pandas(df_test, features=features, preserve_index=False)

# Bundle the three splits under their split names.
ds = DatasetDict(
    train=train,
    dev=dev,
    test=test
)

In [65]:
# Upload all splits to the Hugging Face Hub as a private dataset repository.
ds.push_to_hub(
    repo_id="pysentimiento/it_sentipolc16",
    private=True,
)



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]



Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

See http://www.di.unito.it/~tutreeb/sentipolc-evalita16/sentipolc-guidelines2016UPDATED130916.pdf for the guidelines.

In [68]:
from pysentimiento.sentipolc import load_datasets

# Reload the data through the library helper; this rebinds `ds`, replacing the
# DatasetDict that was assembled by hand earlier in the notebook.
ds = load_datasets()

# Bare trailing expression so the notebook displays the loaded splits.
ds



  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?ex/s]

  0%|          | 0/5928 [00:00<?, ?ex/s]

  0%|          | 0/1482 [00:00<?, ?ex/s]

DatasetDict({
    test: Dataset({
        features: ['idtwitter', 'text', 'subj', 'opos', 'oneg', 'iro', 'lpos', 'lneg', 'top'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['idtwitter', 'text', 'subj', 'opos', 'oneg', 'iro', 'lpos', 'lneg', 'top'],
        num_rows: 5928
    })
    dev: Dataset({
        features: ['idtwitter', 'text', 'subj', 'opos', 'oneg', 'iro', 'lpos', 'lneg', 'top'],
        num_rows: 1482
    })
})

In [69]:
def _polarity_labels(example):
    """Build the `labels` column for one example from its `opos` and `oneg` values."""
    return {"labels": [example["opos"], example["oneg"]]}


# Applied one example at a time (not batched) to every split.
ds = ds.map(_polarity_labels, batched=False)

  0%|          | 0/2000 [00:00<?, ?ex/s]

  0%|          | 0/5928 [00:00<?, ?ex/s]

  0%|          | 0/1482 [00:00<?, ?ex/s]

In [70]:
from transformers import AutoTokenizer
# Hub identifier of the pretrained checkpoint used for the rest of the notebook
model_name = (
    "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
)

# Tokenizer that matches the checkpoint above
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [71]:
from pysentimiento.training import train_and_eval
from pysentimiento.tuning import get_training_arguments


# Training arguments for this checkpoint; defaults are used when no tuned
# configuration exists, and the best model is selected by macro F1.
training_args = get_training_arguments(
    model_name,
    task_name="sentiment",
    lang="it",
    use_defaults_if_not_tuned=True,
    metric_for_best_model="macro_f1",
)

# Fine-tune and evaluate on the dataset splits; the two outputs are named
# "pos" and "neg", in the same order as the entries of the `labels` column.
train_and_eval(
    model_name,
    ds,
    id2label=["pos", "neg"],
    lang="it",
    training_args=training_args,
)



Some weights of the model checkpoint at m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: lpos, oneg, text, opos, idtwitter, iro, top, lneg, subj. If lpos, oneg, text, opos, idtwitter, iro, top, lneg, subj are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5928
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 558
  Number of trainable parameters = 184346882
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjmperez[0m. Use [1m`wandb login --relo