In [None]:
!pip -q install pandas==2.1.4 numpy==1.26.4 pyarrow==15.0.2 seaborn==0.13.2 \
    "protobuf<5.0.0" "transformers==4.44.2" "datasets==2.20.0" \
    "sentencepiece>=0.1.99"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Report the library versions the kernel actually loaded, plus where
# transformers was imported from.
import transformers
import datasets
import accelerate
import google.protobuf as gp
import sys

print(f"transformers: {transformers.__version__}")
print(f"datasets: {datasets.__version__}")
print(f"accelerate: {accelerate.__version__}")
print(f"protobuf: {gp.__version__}")
print(f"transformers module path: {transformers.__file__}")

transformers: 4.44.2
datasets: 2.20.0
accelerate: 1.12.0
protobuf: 4.25.8
transformers module path: /usr/local/lib/python3.12/dist-packages/transformers/__init__.py


In [None]:
# Mount Google Drive at /content/drive; BASE_DIR (defined below) lives on this mount.
from google.colab import drive
drive.mount('/content/drive')


import os, json, re, urllib.request, warnings, random
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Run-wide configuration.
# SEED seeds both the stdlib and NumPy global generators here and is reused
# further down for the train/test split and the class resampling.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Input CSV and the output root directory (created on first use).
CSV_PATH = "/content/covid_instagram.csv"
BASE_DIR = "/content/drive/MyDrive/covid-sentiment"
os.makedirs(BASE_DIR, exist_ok=True)


Mounted at /content/drive



###  Select language configuration

In [None]:
# "en","es","pt","hi","id"
LANG = dict(
    code="en",            # matched (caseless) against the "Language Code" column
    full="english",       # matched (caseless) against the "Full Language" column
    stopwords_code="en",  # selects the stopwords-iso list to download
    tag="en",             # sub-directory of BASE_DIR that receives this run's artifacts
)

# Per-language output directory.
SAVE_LANG_DIR = f"{BASE_DIR}/{LANG['tag']}"
os.makedirs(SAVE_LANG_DIR, exist_ok=True)
print("Preparing language:", LANG)

Preparing language: {'code': 'en', 'full': 'english', 'stopwords_code': 'en', 'tag': 'en'}


In [None]:
# Load the full posts table from CSV_PATH; it is narrowed to the selected LANG below.
df = pd.read_csv(CSV_PATH)

def cf(x):
    """Return ``str(x)`` case-folded, for caseless comparison of arbitrary values."""
    as_text = str(x)
    return as_text.casefold()

# Keep rows for the selected language, matched caselessly on either language column.
code_matches = df["Language Code"].astype(str).str.casefold() == LANG["code"]
name_matches = df["Full Language"].astype(str).str.casefold() == LANG["full"]
df = df[code_matches | name_matches].copy()

text_col  = "Post Description"
label_col = "Sentiment"

# Drop rows missing the text or the label, then rows whose text is blank once stripped.
df = df.dropna(subset=[text_col, label_col])
has_text = df[text_col].astype(str).str.strip().str.len() > 0
df = df[has_text].copy()

# Download (once per runtime) and load the stopwords-iso list for the selected language.
STOPWORDS_URL = f"https://raw.githubusercontent.com/stopwords-iso/stopwords-{LANG['stopwords_code']}/master/stopwords-{LANG['stopwords_code']}.txt"
STOPWORDS_PATH = f"/content/stopwords_{LANG['stopwords_code']}.txt"
try:
    if not os.path.exists(STOPWORDS_PATH):
        urllib.request.urlretrieve(STOPWORDS_URL, STOPWORDS_PATH)
    with open(STOPWORDS_PATH, "r", encoding="utf-8") as f:
        # Case-fold each entry: remove_stopwords() looks tokens up by tok.casefold(),
        # so the set must be stored in the same normalized form.
        LANG_STOPWORDS = {w.strip().casefold() for w in f if w.strip()}
except Exception as exc:
    # Best effort: carry on without stopword removal, but make the fallback visible
    # instead of silently changing the preprocessing.
    warnings.warn(
        f"Could not load stopwords from {STOPWORDS_URL} ({exc!r}); "
        "stopword removal is disabled for this run."
    )
    LANG_STOPWORDS = set()

# Patterns used by basic_clean(), compiled once when the cell runs.
URL_RE        = re.compile(r"https?://\S+|www\.\S+")
MENTION_RE    = re.compile(r"@\w+")
HASHTAG_RE    = re.compile(r"#(\w+)", flags=re.UNICODE)
# Named (&amp;), decimal (&#39;) and hexadecimal (&#x27;) character references.
# Numeric references must be matched here: otherwise the hashtag pass below
# strips their '#' and leaves residue such as "&39;" in the text.
HTML_RE       = re.compile(r"&(?:#[0-9]+|#[xX][0-9a-fA-F]+|\w+);")
MULTISPACE_RE = re.compile(r"\s+")

def basic_clean(text: str) -> str:
    """Normalize one raw post for tokenization.

    URLs, @mentions and HTML character references are each replaced by a
    space; hashtags keep their word but lose the leading ``#``; runs of
    whitespace collapse to a single space and the result is stripped.

    Args:
        text: Raw post text. Any non-``str`` value is treated as empty.

    Returns:
        The cleaned text (possibly the empty string).
    """
    t = text if isinstance(text, str) else ""
    t = URL_RE.sub(" ", t)
    t = MENTION_RE.sub(" ", t)
    # Entities are removed before the hashtag pass so "&#39;" is never seen as "#39".
    t = HTML_RE.sub(" ", t)
    t = HASHTAG_RE.sub(r"\1", t)
    t = MULTISPACE_RE.sub(" ", t)
    return t.strip()

def remove_stopwords(text: str, stopwords: set) -> str:
    """Drop whitespace-separated tokens whose case-folded form is in ``stopwords``.

    Surviving tokens keep their original casing and are re-joined with
    single spaces.
    """
    kept = []
    for word in text.split():
        if word.casefold() in stopwords:
            continue
        kept.append(word)
    return " ".join(kept)

# Clean every post; stopwords are stripped only when a list was actually loaded.
cleaned_text = df[text_col].astype(str).map(basic_clean)
if LANG_STOPWORDS:
    cleaned_text = cleaned_text.map(lambda s: remove_stopwords(s, LANG_STOPWORDS))
df["text_clean"] = cleaned_text

# Encode the case-folded sentiment labels as integer ids, in sorted label order.
labels_raw = df[label_col].astype(str).map(cf)
label_names = sorted(labels_raw.unique().tolist())
label2id = dict(zip(label_names, range(len(label_names))))
id2label = dict(enumerate(label_names))
df["label"] = labels_raw.map(label2id)

print("Classes:", label2id)
print("Total rows:", len(df))

Classes: {'negative': 0, 'neutral': 1, 'positive': 2}
Total rows: 343041


In [None]:
# Stratified 75/25 split on the cleaned text and its integer label.
train_df, test_df = train_test_split(
    df.loc[:, ["text_clean", "label"]],
    stratify=df["label"],
    test_size=0.25,
    random_state=SEED,
)

original_train_counts = train_df["label"].value_counts().sort_index().to_dict()
print("Original train counts:", original_train_counts)

def make_class_balanced_train(train_df, label_col="label",
                              target_per_class=None, max_cap_per_class=None,
                              random_state=SEED):
    """Resample ``train_df`` so that every class has the same number of rows.

    Classes larger than the target are down-sampled without replacement.
    Classes smaller than the target keep all of their original rows and are
    topped up with rows drawn with replacement, so no unique minority example
    is lost to the bootstrap.

    Args:
        train_df: Training frame containing ``label_col``.
        label_col: Name of the class-label column.
        target_per_class: Rows per class in the result. ``None`` uses the
            median of the per-class counts.
        max_cap_per_class: Optional upper bound applied to the target.
        random_state: Seed used for every resampling step and the final shuffle.

    Returns:
        A shuffled frame with a fresh 0..n-1 index and ``target_per_class``
        rows for each class present in ``train_df``.

    Raises:
        ValueError: If ``train_df`` has no labelled rows.
    """
    counts = train_df[label_col].value_counts().sort_index()
    if counts.empty:
        # The median of no counts is NaN; fail with an explicit message instead.
        raise ValueError("train_df contains no labelled rows to balance")

    if target_per_class is None:
        target_per_class = int(np.median(counts.values))
    if max_cap_per_class is not None:
        target_per_class = min(target_per_class, int(max_cap_per_class))
    target_per_class = max(target_per_class, 1)

    parts = []
    for cls, cnt in counts.items():
        df_c = train_df[train_df[label_col] == cls]
        if cnt > target_per_class:
            df_c_bal = resample(df_c, replace=False, n_samples=target_per_class,
                                random_state=random_state)
        elif cnt < target_per_class:
            # Keep every original row and draw only the shortfall with replacement.
            shortfall = int(target_per_class - cnt)
            extra = resample(df_c, replace=True, n_samples=shortfall,
                             random_state=random_state)
            df_c_bal = pd.concat([df_c, extra], axis=0)
        else:
            df_c_bal = df_c
        parts.append(df_c_bal)

    out = pd.concat(parts, axis=0).sample(frac=1.0, random_state=random_state).reset_index(drop=True)
    return out

# BAL_CAP = None means no per-class cap; with target_per_class=None the
# balancer falls back to the median class size.
BAL_CAP = None
train_bal_df = make_class_balanced_train(
    train_df.copy(),
    label_col="label",
    target_per_class=None,
    max_cap_per_class=BAL_CAP,
    random_state=SEED,
)

balanced_train_counts = train_bal_df["label"].value_counts().sort_index().to_dict()
print("Balanced train counts:", balanced_train_counts)


Original train counts: {0: 38183, 1: 91271, 2: 127826}
Balanced train counts: {0: 91271, 1: 91271, 2: 91271}


In [None]:
MODEL_NAME = "xlm-roberta-base"
# Truncation limit in tokens. It is also embedded in the TOK_DIR name below, so
# the saved directory name always reflects the limit actually used.
MAX_LENGTH = 256
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(batch):
    """Tokenize a batch of cleaned texts, truncated to MAX_LENGTH tokens.

    No padding is applied here (``padding=False``).
    """
    return tokenizer(batch["text_clean"], truncation=True, padding=False, max_length=MAX_LENGTH)

# Wrap the balanced train frame and the untouched test frame as a DatasetDict.
train_ds = Dataset.from_pandas(train_bal_df.reset_index(drop=True))
test_ds  = Dataset.from_pandas(test_df.reset_index(drop=True))
raw = DatasetDict({"train": train_ds, "test": test_ds})

# Tokenize, drop the raw text column, and expose torch tensors under the
# column names "input_ids", "attention_mask" and "labels".
tok = raw.map(tokenize_batch, batched=True, remove_columns=["text_clean"])
tok = tok.rename_column("label", "labels")
tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

TOK_DIR = f"{SAVE_LANG_DIR}/tok_xlmr_{MAX_LENGTH}"
tok.save_to_disk(TOK_DIR)
print("Saved tokenized dataset to:", TOK_DIR)



Map:   0%|          | 0/273813 [00:00<?, ? examples/s]

Map:   0%|          | 0/85761 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/273813 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/85761 [00:00<?, ? examples/s]

Saved tokenized dataset to: /content/drive/MyDrive/covid-sentiment/en/tok_xlmr_256


In [None]:
def _write_json(path, payload):
    """Serialize ``payload`` to ``path`` as indented UTF-8 JSON (non-ASCII kept as-is)."""
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=2)

# The balanced train frame and the test frame, as passed to Dataset.from_pandas.
train_bal_df.to_parquet(f"{SAVE_LANG_DIR}/train_bal.parquet", index=False)
test_df.to_parquet(f"{SAVE_LANG_DIR}/test.parquet", index=False)

# Label mappings and run metadata. JSON object keys are always strings, so
# id2label's integer keys are written as "0", "1", ...
_write_json(f"{SAVE_LANG_DIR}/label2id.json", label2id)
_write_json(f"{SAVE_LANG_DIR}/id2label.json", id2label)
_write_json(f"{SAVE_LANG_DIR}/meta.json", {"lang": LANG, "seed": SEED, "bal_cap": BAL_CAP})

print("All artifacts saved in:", SAVE_LANG_DIR)

All artifacts saved in: /content/drive/MyDrive/covid-sentiment/en


In [None]:
%cd "/content/drive/MyDrive/covid-sentiment"
!zip -r folder_full.zip . i covid-sentiment

/content/drive/MyDrive/covid-sentiment
  adding: es/ (stored 0%)
  adding: es/tok_xlmr_256/ (stored 0%)
  adding: es/tok_xlmr_256/dataset_dict.json (stored 0%)
  adding: es/tok_xlmr_256/train/ (stored 0%)
  adding: es/tok_xlmr_256/train/data-00000-of-00001.arrow (deflated 65%)
  adding: es/tok_xlmr_256/train/state.json (deflated 39%)
  adding: es/tok_xlmr_256/train/dataset_info.json (deflated 63%)
  adding: es/tok_xlmr_256/test/ (stored 0%)
  adding: es/tok_xlmr_256/test/data-00000-of-00001.arrow (deflated 65%)
  adding: es/tok_xlmr_256/test/state.json (deflated 40%)
  adding: es/tok_xlmr_256/test/dataset_info.json (deflated 63%)
  adding: es/label2id.json (deflated 23%)
  adding: es/id2label.json (deflated 24%)
  adding: es/meta.json (deflated 33%)
  adding: es/train_bal.parquet (deflated 14%)
  adding: es/test.parquet (deflated 14%)
  adding: es/xlmr_es_best.zip (stored 0%)
  adding: es/xlmr_es_best/ (stored 0%)
  adding: es/xlmr_es_best/label2id.json (deflated 23%)
  adding: es/xlmr