In [4]:
%load_ext autoreload
%autoreload 2

import pandas as pd


files = [
    ("NEG", "../../data/sentiment/ttsbr/tweets.neg"),
    ("POS", "../../data/sentiment/ttsbr/tweets.pos"),
    ("NEU", "../../data/sentiment/ttsbr/tweets.neu"),
]


data = []
for label, file in files:
    with open(file) as f:
        for line in f:
            twid, text = line.split(" ", 1)

            data.append({
                "tweet_id": twid, 
                "text": text,
                "label": label
            })

df = pd.DataFrame(data)

df

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Unnamed: 0,tweet_id,text,label
0,865572794016378882,tô passada com esse cara quanta merda pode sai...,NEG
1,865566046320832512,coitada da namorada\n,NEG
2,862307799258329089,esse japa não entendi porra nenhuma de orquíde...,NEG
3,864814104745320449,aí vc fica até NUMBER assistindo e acorda cedo...,NEG
4,864665198359183361,imagina que insuportável ter de dar de comer p...,NEG
...,...,...,...
14995,864097252591194112,lazaro falou bale fitness e ana maria braga es...,NEU
14996,863089429656817665,simpatia na trama das seis ingrid guimarães mo...,NEU
14997,864699532961091584,ocidentais tem mta dificuldade pra aceitar com...,NEU
14998,865628931621232640,USERNAME que horas vc chega em belém / aeropor...,NEU


In [6]:

pd.options.display.max_colwidth = 200

df["label"].value_counts()

POS    6648
NEG    4426
NEU    3926
Name: label, dtype: int64

In [12]:
# Perform train test split

from sklearn.model_selection import train_test_split


label2id = {
    "NEG": 0,
    "NEU": 1,
    "POS": 2,
}


df["label"] = df["label"].apply(lambda x: label2id[x])


train, test = train_test_split(df, test_size=0.2, random_state=42)

train, dev = train_test_split(train, test_size=0.2, random_state=42)

train.shape, dev.shape, test.shape

((9600, 3), (2400, 3), (3000, 3))

In [15]:
from datasets import Dataset, Features, Value, ClassLabel, DatasetDict


features = Features({
    'tweet_id': Value('string'),
    'text': Value('string'),
    "label": ClassLabel(num_classes=3, names=["NEG", "NEU", "POS"]),
})


train = Dataset.from_pandas(train, features=features, preserve_index=False)
dev = Dataset.from_pandas(dev, features=features, preserve_index=False)
test = Dataset.from_pandas(test, features=features, preserve_index=False)

ds = DatasetDict({
    "train": train,
    "dev": dev,
    "test": test,
})


ds

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'text', 'label'],
        num_rows: 9600
    })
    dev: Dataset({
        features: ['tweet_id', 'text', 'label'],
        num_rows: 2400
    })
    test: Dataset({
        features: ['tweet_id', 'text', 'label'],
        num_rows: 3000
    })
})

In [16]:
ds.push_to_hub("pysentimiento/pt_sentiment", private=True)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split dev to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]