**Environment**

In [None]:
# Remove the ML packages already present in the runtime (torch, transformers,
# accelerate, datasets and libraries built on top of them) before the cells
# below install specific versions — presumably to avoid version clashes.
!pip uninstall -y peft fastai timm torchvision torchaudio torch transformers accelerate datasets tf-keras


Found existing installation: peft 0.18.1
Uninstalling peft-0.18.1:
  Successfully uninstalled peft-0.18.1
Found existing installation: fastai 2.8.6
Uninstalling fastai-2.8.6:
  Successfully uninstalled fastai-2.8.6
Found existing installation: timm 1.0.24
Uninstalling timm-1.0.24:
  Successfully uninstalled timm-1.0.24
Found existing installation: torchvision 0.24.0+cu126
Uninstalling torchvision-0.24.0+cu126:
  Successfully uninstalled torchvision-0.24.0+cu126
Found existing installation: torchaudio 2.9.0+cu126
Uninstalling torchaudio-2.9.0+cu126:
  Successfully uninstalled torchaudio-2.9.0+cu126
Found existing installation: torch 2.10.0
Uninstalling torch-2.10.0:
  Successfully uninstalled torch-2.10.0
Found existing installation: transformers 4.36.2
Uninstalling transformers-4.36.2:
  Successfully uninstalled transformers-4.36.2
Found existing installation: accelerate 0.25.0
Uninstalling accelerate-0.25.0:
  Successfully uninstalled accelerate-0.25.0
Found existing installation: dat

In [None]:
!pip install transformers==4.36.2 datasets accelerate

Collecting transformers==4.36.2
  Using cached transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting torch>=2.0.0 (from accelerate)
  Using cached torch-2.10.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (31 kB)
Using cached transformers-4.36.2-py3-none-any.whl (8.2 MB)
Downloading datasets-4.5.0-py3-none-any.whl (515 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-1.12.0-py3-none-any.whl (380 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-23.0.0-cp312-cp312-manylinux_2_2

In [None]:
# Pin torch explicitly: the previous install cell only pulls it in as a
# dependency of accelerate, without fixing the version.
!pip install torch==2.10.0



In [None]:
# Replace whatever accelerate release is currently installed with 0.25.0,
# the version that was present together with transformers 4.36.2 before the
# environment was reset.
!pip uninstall -y accelerate
!pip install accelerate==0.25.0

Found existing installation: accelerate 1.12.0
Uninstalling accelerate-1.12.0:
  Successfully uninstalled accelerate-1.12.0
Collecting accelerate==0.25.0
  Using cached accelerate-0.25.0-py3-none-any.whl.metadata (18 kB)
Using cached accelerate-0.25.0-py3-none-any.whl (265 kB)
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [None]:
import torch

# Sanity-check the freshly installed runtime: report the torch build and
# whether a CUDA device is visible to it.
print(f"Torch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")


Torch version: 2.10.0+cu128
CUDA available: True


In [None]:
import pandas as pd
import numpy as np

from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

from sklearn.metrics import accuracy_score, f1_score

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
!unzip /content/final_model.zip -d /content/


Archive:  /content/final_model.zip
   creating: /content/content/final_model/
  inflating: /content/content/final_model/tokenizer_config.json  
  inflating: /content/content/final_model/config.json  
  inflating: /content/content/final_model/vocab.txt  
  inflating: /content/content/final_model/model.safetensors  
  inflating: /content/content/final_model/tokenizer.json  
  inflating: /content/content/final_model/special_tokens_map.json  


**Load Previously Trained DistilBERT Model**

In [None]:
from pathlib import Path

# Locations where the trained model folder may live, in priority order:
#   1. the local project layout (folder copied from the Colab zip), and
#   2. the directory the unzip cell above actually extracts to on Colab
#      (the archive nests its files under "content/").
_MODEL_DIR_CANDIDATES = (
    "../models/final_model",
    "/content/content/final_model",
)

# Use the first candidate that exists; fall back to the original path so a
# missing model still fails with the same error as before.
MODEL_PATH = next(
    (candidate for candidate in _MODEL_DIR_CANDIDATES if Path(candidate).is_dir()),
    _MODEL_DIR_CANDIDATES[0],
)

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)


In [None]:
# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model.to(device)

print("Loaded model on:", device)


Loaded model on: cuda


**Load Reddit Dataset**

In [None]:
# Read the Reddit posts from the working directory and preview the first rows.
reddit_df = pd.read_csv(filepath_or_buffer="reddit_analysis.csv")

reddit_df.head(5)


Unnamed: 0.1,Unnamed: 0,text,title,target
0,0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1


In [None]:
# Keep only the post body, drop rows with missing text, then draw a
# reproducible 25% subsample (within the 20–30% range as discussed).
reddit_df = (
    reddit_df.loc[:, ["text"]]
    .dropna()
    .reset_index(drop=True)
    .sample(frac=0.25, random_state=42)
    .reset_index(drop=True)
)

print("Reddit samples used:", len(reddit_df))


Reddit samples used: 1402


**Generate pseudo-labels**

In [None]:
def bert_predict(texts, batch_size=32, max_length=128):
    """Predict a class and its confidence for each text with the loaded model.

    The notebook-level ``model`` is switched to eval mode and run, without
    gradient tracking, over ``texts`` in mini-batches. Each batch is encoded
    by the notebook-level ``tokenizer`` (padded to the longest item in the
    batch, truncated to ``max_length`` tokens) and moved to ``device``.

    Args:
        texts: The texts to classify. Any iterable of strings is accepted
            (list, pandas Series, ...); a single string is treated as one text.
        batch_size: Number of texts per forward pass. Must be >= 1.
        max_length: Maximum number of tokens kept per text.

    Returns:
        A tuple ``(preds, probs)`` of NumPy arrays with one entry per text:
        ``preds`` holds the index of the highest-scoring class and ``probs``
        the softmax probability assigned to that class.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Fail early and clearly: a zero step makes range() raise an obscure error
    # and a negative one would silently produce no predictions at all.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # A bare string would otherwise be cut into batch_size-character fragments.
    # Materialising a list also allows Series/array inputs, whose slices are
    # not plain lists of strings.
    texts = [texts] if isinstance(texts, str) else list(texts)

    model.eval()
    pred_chunks = []
    prob_chunks = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            logits = model(**enc).logits
            softmax = torch.softmax(logits, dim=1)

        pred_chunks.append(torch.argmax(softmax, dim=1).cpu().numpy())
        prob_chunks.append(torch.max(softmax, dim=1).values.cpu().numpy())

    # Nothing to classify: return correctly typed empty arrays.
    if not pred_chunks:
        return np.array([], dtype=np.int64), np.array([], dtype=np.float32)

    return np.concatenate(pred_chunks), np.concatenate(prob_chunks)


In [None]:
# Score every sampled post: predicted class plus the probability of that class.
pseudo_labels, confidences = bert_predict(
    reddit_df["text"].to_list()
)


**Keep only confident samples**

In [None]:
CONF_THRESHOLD = 0.70

# Attach the predicted class and its probability to each post, then keep only
# the rows whose probability reaches the threshold.
reddit_df = (
    reddit_df.assign(label=pseudo_labels, confidence=confidences)
    .loc[lambda frame: frame["confidence"] >= CONF_THRESHOLD]
    .reset_index(drop=True)
)

print("High-confidence samples:", len(reddit_df))


High-confidence samples: 781


**Convert to HuggingFace Dataset**

In [None]:
# Only the text and its pseudo-label go into the training set; the pandas
# index is not carried over as an extra column.
da_dataset = Dataset.from_pandas(
    df=reddit_df.loc[:, ["text", "label"]],
    preserve_index=False,
)


**Tokenization**

In [None]:
def tokenize(batch):
    """Encode the ``text`` entries of ``batch`` with the loaded tokenizer.

    Inputs are truncated to 128 tokens. No padding is requested here.
    """
    encode_options = {"truncation": True, "max_length": 128}
    return tokenizer(batch["text"], **encode_options)

# Tokenize in batches, drop the raw text once it has been encoded, and have
# the dataset return torch tensors.
da_dataset = (
    da_dataset
    .map(tokenize, batched=True)
    .remove_columns(["text"])
)
da_dataset.set_format(type="torch")

Map:   0%|          | 0/781 [00:00<?, ? examples/s]

**Light fine-tuning (LOW LR, 1 epoch)**

In [None]:
training_args = TrainingArguments(
    # Output location; checkpoint saving and external experiment reporting
    # are both turned off.
    output_dir="./phase6_output",
    save_strategy="no",
    report_to="none",
    # Loss is logged every 100 steps (a run shorter than that logs no
    # intermediate loss).
    logging_steps=100,
    # Deliberately light adaptation: a single epoch (VERY IMPORTANT) at a
    # LOW learning rate.
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
)


**Trainer**

In [None]:
# The dataset was tokenized without padding, so the collator pads each batch
# when it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Fine-tune the already loaded model on the pseudo-labelled Reddit data.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=da_dataset,
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


**Train**

In [None]:
# Run the fine-tuning configured above. The returned TrainOutput (step count,
# training loss and runtime metrics) is shown as the cell's result.
trainer.train()


Step,Training Loss


TrainOutput(global_step=49, training_loss=0.0416382088953135, metrics={'train_runtime': 3.8374, 'train_samples_per_second': 203.521, 'train_steps_per_second': 12.769, 'total_flos': 25866565849344.0, 'train_loss': 0.0416382088953135, 'epoch': 1.0})

**Save Model**

In [None]:
DA_MODEL_PATH = "/content/final_model_domain_adapted"

# Write the fine-tuned model and the tokenizer files into the same folder.
# The tokenizer call comes last so that its tuple of written file paths is
# what the cell displays.
model.save_pretrained(save_directory=DA_MODEL_PATH)
tokenizer.save_pretrained(save_directory=DA_MODEL_PATH)


('/content/final_model_domain_adapted/tokenizer_config.json',
 '/content/final_model_domain_adapted/special_tokens_map.json',
 '/content/final_model_domain_adapted/vocab.txt',
 '/content/final_model_domain_adapted/added_tokens.json',
 '/content/final_model_domain_adapted/tokenizer.json')

In [None]:
# Bundle the saved model folder into a zip archive in the working directory.
# NOTE: because an absolute path is passed, the entries are stored under
# "content/final_model_domain_adapted/"; extracting with "-d /content/" would
# therefore produce a nested /content/content/... folder.
!zip -r final_model_domain_adapted.zip /content/final_model_domain_adapted


  adding: content/final_model_domain_adapted/ (stored 0%)
  adding: content/final_model_domain_adapted/tokenizer_config.json (deflated 74%)
  adding: content/final_model_domain_adapted/config.json (deflated 53%)
  adding: content/final_model_domain_adapted/vocab.txt (deflated 53%)
  adding: content/final_model_domain_adapted/model.safetensors (deflated 8%)
  adding: content/final_model_domain_adapted/tokenizer.json (deflated 71%)
  adding: content/final_model_domain_adapted/special_tokens_map.json (deflated 80%)
