In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd


files = [
    ("NEG", "../../data/sentiment/ttsbr/tweets.neg"),
    ("POS", "../../data/sentiment/ttsbr/tweets.pos"),
    ("NEU", "../../data/sentiment/ttsbr/tweets.neu"),
]


data = []
for label, file in files:
    with open(file) as f:
        for line in f:
            twid, text = line.split(" ", 1)

            data.append({
                "tweet_id": twid, 
                "text": text,
                "label": label
            })

df = pd.DataFrame(data)

df

Unnamed: 0,tweet_id,text,label
0,865572794016378882,tô passada com esse cara quanta merda pode sai...,NEG
1,865566046320832512,coitada da namorada\n,NEG
2,862307799258329089,esse japa não entendi porra nenhuma de orquíde...,NEG
3,864814104745320449,aí vc fica até NUMBER assistindo e acorda cedo...,NEG
4,864665198359183361,imagina que insuportável ter de dar de comer p...,NEG
...,...,...,...
14995,864097252591194112,lazaro falou bale fitness e ana maria braga es...,NEU
14996,863089429656817665,simpatia na trama das seis ingrid guimarães mo...,NEU
14997,864699532961091584,ocidentais tem mta dificuldade pra aceitar com...,NEU
14998,865628931621232640,USERNAME que horas vc chega em belém / aeropor...,NEU


In [2]:

pd.options.display.max_colwidth = 200

df["label"].value_counts()

POS    6648
NEG    4426
NEU    3926
Name: label, dtype: int64

In [3]:
# Perform train test split

from sklearn.model_selection import train_test_split


label2id = {
    "NEG": 0,
    "NEU": 1,
    "POS": 2,
}


df["label"] = df["label"].apply(lambda x: label2id[x])


train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

train, dev = train_test_split(train, test_size=0.2, random_state=42, stratify=train["label"])

train.shape, dev.shape, test.shape

((9600, 3), (2400, 3), (3000, 3))

In [4]:
from datasets import Dataset, Features, Value, ClassLabel, DatasetDict


features = Features({
    'tweet_id': Value('string'),
    'text': Value('string'),
    "label": ClassLabel(num_classes=3, names=["NEG", "NEU", "POS"]),
})


train = Dataset.from_pandas(train, features=features, preserve_index=False)
dev = Dataset.from_pandas(dev, features=features, preserve_index=False)
test = Dataset.from_pandas(test, features=features, preserve_index=False)

ds = DatasetDict({
    "train": train,
    "dev": dev,
    "test": test,
})


ds

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'text', 'label'],
        num_rows: 9600
    })
    dev: Dataset({
        features: ['tweet_id', 'text', 'label'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['tweet_id', 'text', 'label'],
        num_rows: 3000
    })
})

In [5]:
ds.push_to_hub("pysentimiento/pt_sentiment", private=True)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split dev to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [1]:
%load_ext autoreload
%autoreload 2

from datasets import load_dataset
from pysentimiento.preprocessing import preprocess_tweet

ds = load_dataset("pysentimiento/pt_sentiment")


ds = ds.map(
    lambda x: {"text": preprocess_tweet(x["text"], lang="pt", preprocess_handles=False) }, 
    batched=False)



Using custom data configuration pysentimiento--pt_sentiment-76c273a313043bbf
Found cached dataset parquet (/users/jmperez/.cache/huggingface/datasets/pysentimiento___parquet/pysentimiento--pt_sentiment-76c273a313043bbf/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from pysentimiento.training import train_and_eval
from pysentimiento.tuning import get_training_arguments

model_name = "neuralmind/bert-base-portuguese-cased"

training_args = get_training_arguments(model_name, task_name="sentiment", lang="pt", use_defaults_if_not_tuned=True, metric_for_best_model="macro_f1")

id2label = ds["train"].features["label"].names

trainer, test_results = train_and_eval(
    model_name, ds, id2label=id2label, lang="pt", training_args=training_args, 
)



Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

  0%|          | 0/3 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, tweet_id. If text, tweet_id are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 9600
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 900
  Number of trainable parameters = 108927747
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Traceback (most recent call last):
  File "/users/jmperez/.pyenv/versions/3.8.16/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/users/jmperez/.pyenv/versions/3.8.16/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/users/jmperez/projects/pysentimiento/.venv/lib/python3.8/site-packages/wandb/__main__.py", line 3, in <module>
    cli.cli(prog_name="python -m wandb")
  File "/users/jmperez/projects/pysentimiento/.venv/lib/python3.8/site-packages/click/core.py", line 1130, in __call__
    return self.main(*args, **kwargs)
  File "/users/jmperez/projects/pysentimiento/.venv/lib/python3.8/site-packages/click/core.py", line 1055, in main
    rv = self.invoke(ctx)
  File "/users/jmperez/projects/pysentimiento/.venv/lib/python3.8/site-packages/click/core.py", line 1657, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/users/jmperez/projects/pysentimiento/.venv/lib/p

AssertionError: 