Environment check & imports

In [1]:
import torch

# Report whether a CUDA device is visible to PyTorch in this runtime.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)

# get_device_name(0) raises when no CUDA device is present, so only query the
# device name when one exists; otherwise report the CPU-only fallback.
print("GPU:", torch.cuda.get_device_name(0) if cuda_available else "none (CPU only)")


CUDA available: True
GPU: Tesla T4


In [2]:
import os
import pandas as pd
from datasets import Dataset

from transformers import (
    RobertaTokenizerFast,
    RobertaForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)


Mount Google Drive

In [3]:
# Mount Google Drive at /content/drive so that the dataset and the model output
# directory referenced by later cells (under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
# List the processed data directory on the mounted drive to see which CSV files are available.
!ls /content/drive/MyDrive/Mental_Health_Sentiment/data/processed/


reddit_unlabeled_clean.csv   train_5class.csv	 val_5class.csv
reddit_unlabeled_merged.csv  train_baseline.csv  val_baseline.csv


Paths

In [5]:
# Project root on the mounted Google Drive; every other path is built from it.
BASE_DIR = "/content/drive/MyDrive/Mental_Health_Sentiment"

# Input CSV that is read as the training corpus.
DATA_FILE = BASE_DIR + "/data/processed/reddit_unlabeled_merged.csv"
# Directory that receives checkpoints, the final model and the tokenizer files.
OUTPUT_DIR = BASE_DIR + "/models/v1.0/roberta_mlm_adapted"



In [9]:
# Checkpoint name used for both the tokenizer and the masked-LM model.
MODEL_NAME = "roberta-base"

# Optimisation settings passed to TrainingArguments.
LEARNING_RATE = 5.0e-5
EPOCHS = 3
BATCH_SIZE = 16   # per-device train batch size

# Sequences longer than this many tokens are truncated by the tokenizer.
MAX_LENGTH = 256

Sanity check

In [6]:

import os

# Confirm the corpus CSV is visible on the mounted drive before trying to load it.
data_file_found = os.path.exists(DATA_FILE)
print(data_file_found)



True


In [7]:
# Create the model output directory, including any missing parent folders;
# exist_ok=True makes re-running this cell harmless.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)
print("Output dir ready:", OUTPUT_DIR)


Output dir ready: /content/drive/MyDrive/Mental_Health_Sentiment/models/v1.0/roberta_mlm_adapted


Load Reddit data

In [8]:
import pandas as pd

# Load the Reddit corpus and keep only rows that actually contain text.
# pandas parses empty cells as NaN (a float), which is not valid tokenizer input,
# so rows with a missing "text" value are dropped here rather than failing later
# in dataset.map. dropna(subset=["text"]) also raises KeyError right away if the
# expected "text" column is absent. The index is reset so the frame keeps a plain
# 0..n-1 index after rows are removed.
df = (
    pd.read_csv(DATA_FILE)
    .dropna(subset=["text"])
    .reset_index(drop=True)
)
print(df.shape)
df.head()


(50000, 1)


Unnamed: 0,text
0,healthy perfect life lie o you fucking swine o...
1,i'm done why is it that every single thing i d...
2,you have you ever been so depressed that you a...
3,how do i have emotional intimacy if i'm scared...
4,is a high resting heart rate common among peop...


Convert to Hugging Face Dataset

In [10]:
# Only the raw text column is handed to the Hugging Face Dataset.
text_only_frame = df[["text"]]
dataset = Dataset.from_pandas(text_only_frame)

# Display the dataset summary (features and row count) as the cell output.
dataset


Dataset({
    features: ['text'],
    num_rows: 50000
})

Tokenizer

In [11]:
# Load the fast RoBERTa tokenizer for the MODEL_NAME checkpoint, the same
# checkpoint name the masked-LM model is loaded from.
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenization function

In [12]:
def tokenize_function(batch):
    """Tokenize the ``"text"`` entries of a batch.

    Args:
        batch: Mapping with a ``"text"`` key holding the texts to encode
            (called with ``batched=True`` by ``dataset.map``).

    Returns:
        The tokenizer's encoding for the batch, with each sequence truncated
        to at most ``MAX_LENGTH`` tokens. No padding is requested here.
    """
    texts = batch["text"]
    encode_options = {"truncation": True, "max_length": MAX_LENGTH}
    return tokenizer(texts, **encode_options)


Tokenize dataset

In [13]:
# Tokenize the corpus batch-wise and drop the raw "text" column so only the
# tokenizer outputs remain in the resulting dataset.
map_options = {"batched": True, "remove_columns": ["text"]}
tokenized_dataset = dataset.map(tokenize_function, **map_options)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

MLM data collator

In [14]:
# Probability with which the collator selects a token for masking.
MLM_PROBABILITY = 0.15

# Collator configured for masked language modeling (mlm=True) using the
# notebook's tokenizer.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=MLM_PROBABILITY,
    mlm=True,
)


Load MLM model

In [15]:
# Load the MODEL_NAME checkpoint into a RoBERTa masked-language-modeling model;
# this is the model object later handed to the Trainer.
model = RobertaForMaskedLM.from_pretrained(MODEL_NAME)


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/202 [00:00<?, ?it/s]

RobertaForMaskedLM LOAD REPORT from: roberta-base
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Training arguments

In [17]:
# Training configuration for the masked-LM adaptation run.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    # Mixed precision requires a CUDA device; fall back to full precision when
    # none is visible so this cell does not fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    # The whole run is 9,375 optimizer steps (50,000 rows / 16 per batch * 3 epochs),
    # so the previous save_steps=10_000 threshold was never reached mid-run and
    # save_total_limit had nothing to rotate. Checkpoint once per epoch instead, so
    # a dropped Colab session loses at most one epoch of work.
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=500,
    report_to="none"
)



Trainer

In [18]:
# Assemble the Trainer from the pieces built in earlier cells. Only a training
# set is supplied; no evaluation dataset is passed.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "data_collator": data_collator,
    "train_dataset": tokenized_dataset,
}
trainer = Trainer(**trainer_kwargs)


Train

In [19]:
# Run training with the configured Trainer; the returned TrainOutput
# (global step, training loss, runtime metrics) is shown as the cell result.
trainer.train()


Step,Training Loss
500,1.647196
1000,1.164774
1500,1.00704
2000,0.925499
2500,0.854771
3000,0.815984
3500,0.77282
4000,0.73383
4500,0.713631
5000,0.690822


Step,Training Loss
500,1.647196
1000,1.164774
1500,1.00704
2000,0.925499
2500,0.854771
3000,0.815984
3500,0.77282
4000,0.73383
4500,0.713631
5000,0.690822


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=9375, training_loss=0.7831500113932292, metrics={'train_runtime': 3013.2736, 'train_samples_per_second': 49.78, 'train_steps_per_second': 3.111, 'total_flos': 1.97449097472e+16, 'train_loss': 0.7831500113932292, 'epoch': 3.0})

Save model & tokenizer

In [20]:
# Persist the trained model into OUTPUT_DIR.
trainer.save_model(output_dir=OUTPUT_DIR)

# Store the tokenizer files in the same directory; the tuple of written file
# paths returned by save_pretrained is displayed as the cell output.
tokenizer.save_pretrained(save_directory=OUTPUT_DIR)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('/content/drive/MyDrive/Mental_Health_Sentiment/models/v1.0/roberta_mlm_adapted/tokenizer_config.json',
 '/content/drive/MyDrive/Mental_Health_Sentiment/models/v1.0/roberta_mlm_adapted/tokenizer.json')