# Main imports and code

In [None]:
# check which gpus we're using
!nvidia-smi

Tue Feb 24 11:47:18 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 591.86                 Driver Version: 591.86         CUDA Version: 13.1     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060      WDDM  |   00000000:06:00.0  On |                  N/A |
|  0%   44C    P8             11W /  170W |     938MiB /  12288MiB |     85%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [2]:
!pip install simpletransformers
!pip install tensorboardx



In [3]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [4]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [5]:
# Use PyTorch for GPU check (same backend as simpletransformers). TensorFlow often
# doesn't see the GPU on Windows (CPU-only install or CUDA mismatch).
if cuda_available:
    device_name = torch.cuda.get_device_name(0)
    print('Found GPU: {}'.format(device_name))
else:
    print('No GPU found; training will use CPU (slower).')

Found GPU: NVIDIA GeForce RTX 3060


# Fetch Don't Patronize Me! data manager module

In [6]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [7]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [8]:
from dont_patronize_me import DontPatronizeMe

In [9]:
dpm = DontPatronizeMe('.', '.')

## Dataset required (run the cell below first)

The notebook only fetches the **Python loader** (`dont_patronize_me.py`). The **actual data files** are not in the public repo:

- **`dontpatronizeme_pcl.tsv`** – Task 1 (binary PCL labels)
- **`dontpatronizeme_categories.tsv`** – Task 2 (category labels)

**How to get them:**  
1. **Course:** If this is for a module, the data may be on the course VLE or shared drive – use that first.  
2. **Otherwise:** Request access from the task organisers: https://forms.gle/VN8hwbdGYkf5KHiKA  

Place both TSV files in the **same folder as this notebook** (e.g. `BestModel/`).  

The cell below downloads the **practice split** CSVs (train/dev paragraph IDs) from the repo and checks that the TSV files are present before you run `dpm.load_task1()`.

In [10]:
# Download practice split CSVs (train/dev paragraph IDs) if missing; check for required TSV data files
import os
from urllib.request import urlretrieve
base_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/practice%20splits"
for name in ["train_semeval_parids-labels.csv", "dev_semeval_parids-labels.csv"]:
    if not os.path.isfile(name):
        urlretrieve(f"{base_url}/{name}", name)
        print(f"Downloaded {name}")
    else:
        print(f"Already have {name}")
# Check for required dataset TSV files (not in the public repo)
pcl_tsv = "dontpatronizeme_pcl.tsv"
cat_tsv = "dontpatronizeme_categories.tsv"
if os.path.isfile(pcl_tsv) and os.path.isfile(cat_tsv):
    print("Dataset TSV files found. You can run dpm.load_task1() and load_task2() next.")
else:
    print("Missing dataset files:", pcl_tsv, "and/or", cat_tsv)
    print("Get them from your course materials or request at: https://forms.gle/VN8hwbdGYkf5KHiKA")
    print("Place both TSV files in this notebook's folder (e.g. BestModel/).")

Already have train_semeval_parids-labels.csv
Already have dev_semeval_parids-labels.csv
Missing dataset files: dontpatronizeme_pcl.tsv and/or dontpatronizeme_categories.tsv
Get them from your course materials or request at: https://forms.gle/VN8hwbdGYkf5KHiKA
Place both TSV files in this notebook's folder (e.g. BestModel/).


In [11]:
dpm.load_task1()


# Load paragraph IDs

In [12]:
trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

In [13]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

In [14]:
data=dpm.train_task1_df

In [28]:
data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4




# Rebuild training set (Task 1)

In [16]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [17]:
import random

In [18]:
trdf1 = pd.DataFrame(rows)

In [19]:
trdf1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
8370,8380,refugee,Rescue teams search for survivors on the rubbl...,0
8371,8381,hopeless,The launch of ' Happy Birthday ' took place la...,0
8372,8382,homeless,"The unrest has left at least 20,000 people dea...",0
8373,8383,hopeless,You have to see it from my perspective . I may...,0


# Rebuild test set (Task 1)

In [20]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [21]:
len(rows)

2094

In [22]:
tedf1 = pd.DataFrame(rows)

In [23]:
# Shuffle rows (random.shuffle() is for lists; use .sample() for DataFrames)
tedf1 = tedf1.sample(frac=1, random_state=42).reset_index(drop=True)

# RoBERTa Baseline for Task 1

In [24]:
# downsample negative instances
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])

In [26]:
training_set1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
2377,1775,refugee,Last but not the least element of culpability ...,0
2378,1776,refugee,"Then , taking the art of counter-intuitive non...",0
2379,1777,refugee,Kagunga village was reported to lack necessary...,0
2380,1778,vulnerable,"""After her parents high-profile divorce after ...",0


In [33]:
from sklearn.metrics import f1_score


# Epoch search centred at 10 (Pérez-Almendros et al., 2020); also tune lr and class weight.
configs = [
    {"num_train_epochs": 8,  "learning_rate": 2e-5, "weight": [1.0, 2.0]},
    {"num_train_epochs": 10, "learning_rate": 2e-5, "weight": [1.0, 2.0]},
    {"num_train_epochs": 12, "learning_rate": 2e-5, "weight": [1.0, 2.0]},
    {"num_train_epochs": 10, "learning_rate": 1e-5, "weight": [1.0, 2.0]},
    {"num_train_epochs": 10, "learning_rate": 3e-5, "weight": [1.0, 2.0]},
    {"num_train_epochs": 10, "learning_rate": 2e-5, "weight": [1.0, 3.0]},
    {"num_train_epochs": 10, "learning_rate": 2e-5, "weight": [1.0, 1.5]},
]

best_f1 = 0
best_config = None
best_preds = None

for config in configs:
    print(f"\nTrying config: {config}")
    
    model_args = ClassificationArgs(
        num_train_epochs=config["num_train_epochs"],
        learning_rate=config["learning_rate"],
        no_save=True,
        no_cache=True,
        overwrite_output_dir=True,
        max_seq_length=256,
        weight_decay=0.01,
        warmup_ratio=0.1,
        silent=False
    )
    
    model = ClassificationModel(
        "roberta",
        "cardiffnlp/twitter-roberta-base-hate",
        args=model_args,
        num_labels=2,
        weight=config["weight"],
        use_cuda=cuda_available
    )
    
    model.train_model(training_set1[['text', 'label']])
    preds, _ = model.predict(tedf1.text.tolist())
    
    f1 = f1_score(tedf1.label.tolist(), preds)
    print(f"F1: {f1:.4f}")
    
    if f1 > best_f1:
        best_f1 = f1
        best_config = config
        best_preds = preds
        best_model = model

print(f"\nBest config: {best_config}")
print(f"Best F1: {best_f1:.4f}")

# Save dev predictions to submission/
import os
os.makedirs('submission', exist_ok=True)
labels2file([[k] for k in best_preds], 'submission/dev.txt')


Trying config: {'num_train_epochs': 8, 'learning_rate': 2e-05, 'weight': [1.0, 2.0]}


  return torch.load(checkpoint_file, map_location=map_location)
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
5it [00:15,  3.06s/it]                       
  scaler = amp.GradScaler()
  with amp.autocast():
Epochs 1/8. Running Loss:    0.7021: 100%|██████████| 298/298 [00:43<00:00,  6.79it/s]
Epochs 2/8. Running Loss:    0.2742: 100%|██████████| 298/298 [00:43<00:00,  6.81it/s]
Epochs 3/8. Running Loss:    0.1434: 100%|██████████| 298/298 [00:43<00:00,  6.79it/s]
Epochs 4/8. Running Loss:    0.9526: 100%|██████████| 298/298 [00:44<00:00,  6.74it/s]
Epochs 5/8. Running Loss:    0.0026: 100%|██████████| 298/298 [00:44<00:00,  6.63it/s]
Epochs 6/8. Running Loss:    0.0005: 100%|██████████| 298/298 [00:44<00:00,  6.75it/s]
Epochs 7/8. Running Loss:    0.0004: 100%|██████████| 298/298 [00:44<00:00,  6.74it/s]
Epochs 8/8. Running Loss:    0.0003: 100%|██████████| 298/298 [00:44<00:00,  6.74it/s]
Epoch 8 of 8: 100%|██████████| 

F1: 0.5468

Trying config: {'num_train_epochs': 10, 'learning_rate': 2e-05, 'weight': [1.0, 2.0]}


  return torch.load(checkpoint_file, map_location=map_location)
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
5it [00:12,  2.54s/it]                       
  scaler = amp.GradScaler()
  with amp.autocast():
Epochs 1/10. Running Loss:    0.3941: 100%|██████████| 298/298 [00:43<00:00,  6.79it/s]
Epochs 2/10. Running Loss:    0.0895: 100%|██████████| 298/298 [00:43<00:00,  6.83it/s]
Epochs 3/10. Running Loss:    0.1243: 100%|██████████| 298/298 [00:43<00:00,  6.84it/s]
Epochs 4/10. Running Loss:    0.0005: 100%|██████████| 298/298 [00:43<00:00,  6.83it/s]
Epochs 5/10. Running Loss:    0.0005: 100%|██████████| 298/298 [00:44<00:00,  6.76it/s]
Epochs 6/10. Running Loss:    0.0003: 100%|██████████| 298/298 [00:43<00:00,  6.82it/s]
Epochs 7/10. Running Loss:    0.0010: 100%|██████████| 298/298 [00:43<00:00,  6.82it/s]
Epochs 8/10. Running Loss:    0.0001: 100%|██████████| 298/298 [00:43<00:00,  6.82it/s]
Epochs 9/10. Running Lo

F1: 0.5284

Trying config: {'num_train_epochs': 12, 'learning_rate': 2e-05, 'weight': [1.0, 2.0]}


  return torch.load(checkpoint_file, map_location=map_location)
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
5it [00:12,  2.48s/it]                       
  scaler = amp.GradScaler()
  with amp.autocast():
Epochs 1/12. Running Loss:    1.2566: 100%|██████████| 298/298 [00:43<00:00,  6.87it/s]
Epochs 2/12. Running Loss:    0.1746: 100%|██████████| 298/298 [00:43<00:00,  6.87it/s]
Epochs 3/12. Running Loss:    0.8479: 100%|██████████| 298/298 [00:43<00:00,  6.86it/s]
Epochs 4/12. Running Loss:    0.0035: 100%|██████████| 298/298 [00:43<00:00,  6.87it/s]
Epochs 5/12. Running Loss:    0.4214: 100%|██████████| 298/298 [00:43<00:00,  6.85it/s]
Epochs 6/12. Running Loss:    0.0005: 100%|██████████| 298/298 [00:43<00:00,  6.87it/s]
Epochs 7/12. Running Loss:    0.0001: 100%|██████████| 298/298 [00:43<00:00,  6.86it/s]
Epochs 8/12. Running Loss:    0.0001: 100%|██████████| 298/298 [00:43<00:00,  6.86it/s]
Epochs 9/12. Running Lo

F1: 0.5299

Trying config: {'num_train_epochs': 10, 'learning_rate': 1e-05, 'weight': [1.0, 2.0]}


  return torch.load(checkpoint_file, map_location=map_location)
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
5it [00:12,  2.43s/it]                       
  scaler = amp.GradScaler()
  with amp.autocast():
Epochs 1/10. Running Loss:    0.5549: 100%|██████████| 298/298 [00:43<00:00,  6.85it/s]
Epochs 2/10. Running Loss:    0.1390: 100%|██████████| 298/298 [00:43<00:00,  6.85it/s]
Epochs 3/10. Running Loss:    0.1488: 100%|██████████| 298/298 [00:43<00:00,  6.85it/s]
Epochs 4/10. Running Loss:    0.0023: 100%|██████████| 298/298 [00:43<00:00,  6.84it/s]
Epochs 5/10. Running Loss:    0.0018: 100%|██████████| 298/298 [00:43<00:00,  6.85it/s]
Epochs 6/10. Running Loss:    0.0009: 100%|██████████| 298/298 [00:43<00:00,  6.84it/s]
Epochs 7/10. Running Loss:    0.0008: 100%|██████████| 298/298 [00:43<00:00,  6.84it/s]
Epochs 8/10. Running Loss:    0.0004: 100%|██████████| 298/298 [00:43<00:00,  6.84it/s]
Epochs 9/10. Running Lo

F1: 0.5468

Trying config: {'num_train_epochs': 10, 'learning_rate': 3e-05, 'weight': [1.0, 2.0]}


  return torch.load(checkpoint_file, map_location=map_location)
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
5it [00:12,  2.42s/it]                       
  scaler = amp.GradScaler()
  with amp.autocast():
Epochs 1/10. Running Loss:    0.7384: 100%|██████████| 298/298 [00:44<00:00,  6.72it/s]
Epochs 2/10. Running Loss:    0.5465: 100%|██████████| 298/298 [00:43<00:00,  6.87it/s]
Epochs 3/10. Running Loss:    0.7121: 100%|██████████| 298/298 [00:43<00:00,  6.86it/s]
Epochs 4/10. Running Loss:    0.0027: 100%|██████████| 298/298 [00:43<00:00,  6.85it/s]
Epochs 5/10. Running Loss:    0.0012: 100%|██████████| 298/298 [00:43<00:00,  6.85it/s]
Epochs 6/10. Running Loss:    0.0011: 100%|██████████| 298/298 [00:43<00:00,  6.86it/s]
Epochs 7/10. Running Loss:    0.0002: 100%|██████████| 298/298 [00:43<00:00,  6.85it/s]
Epochs 8/10. Running Loss:    0.0003: 100%|██████████| 298/298 [00:43<00:00,  6.85it/s]
Epochs 9/10. Running Lo

F1: 0.5393

Trying config: {'num_train_epochs': 10, 'learning_rate': 2e-05, 'weight': [1.0, 3.0]}


  return torch.load(checkpoint_file, map_location=map_location)
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
5it [00:12,  2.46s/it]                       
  scaler = amp.GradScaler()
  with amp.autocast():
Epochs 1/10. Running Loss:    0.8848: 100%|██████████| 298/298 [00:43<00:00,  6.87it/s]
Epochs 2/10. Running Loss:    0.4961: 100%|██████████| 298/298 [00:43<00:00,  6.89it/s]
Epochs 3/10. Running Loss:    0.3554: 100%|██████████| 298/298 [00:43<00:00,  6.87it/s]
Epochs 4/10. Running Loss:    0.3207: 100%|██████████| 298/298 [00:43<00:00,  6.87it/s]
Epochs 5/10. Running Loss:    0.0830: 100%|██████████| 298/298 [00:43<00:00,  6.87it/s]
Epochs 6/10. Running Loss:    0.0005: 100%|██████████| 298/298 [00:43<00:00,  6.87it/s]
Epochs 7/10. Running Loss:    0.0015: 100%|██████████| 298/298 [00:44<00:00,  6.74it/s]
Epochs 8/10. Running Loss:    0.0005: 100%|██████████| 298/298 [00:43<00:00,  6.89it/s]
Epochs 9/10. Running Lo

F1: 0.5523

Trying config: {'num_train_epochs': 10, 'learning_rate': 2e-05, 'weight': [1.0, 1.5]}


  return torch.load(checkpoint_file, map_location=map_location)
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
5it [00:11,  2.39s/it]                       
  scaler = amp.GradScaler()
  with amp.autocast():
Epochs 1/10. Running Loss:    0.0886: 100%|██████████| 298/298 [00:43<00:00,  6.90it/s]
Epochs 2/10. Running Loss:    0.0620: 100%|██████████| 298/298 [00:43<00:00,  6.91it/s]
Epochs 3/10. Running Loss:    0.2307: 100%|██████████| 298/298 [00:43<00:00,  6.91it/s]
Epochs 4/10. Running Loss:    0.0054: 100%|██████████| 298/298 [00:43<00:00,  6.91it/s]
Epochs 5/10. Running Loss:    0.0007: 100%|██████████| 298/298 [00:43<00:00,  6.91it/s]
Epochs 6/10. Running Loss:    0.5366: 100%|██████████| 298/298 [00:43<00:00,  6.90it/s]
Epochs 7/10. Running Loss:    0.0002: 100%|██████████| 298/298 [00:43<00:00,  6.91it/s]
Epochs 8/10. Running Loss:    0.0001: 100%|██████████| 298/298 [00:43<00:00,  6.91it/s]
Epochs 9/10. Running Lo

F1: 0.5078

Best config: {'num_train_epochs': 10, 'learning_rate': 2e-05, 'weight': [1.0, 3.0]}
Best F1: 0.5523





In [34]:
Counter(best_preds)

Counter({np.int64(0): 1786, np.int64(1): 308})

In [35]:
# Load official test set (no labels) and get predictions from best model
test_df = pd.read_csv('task4_test.tsv', sep='\t', header=None,
                         names=['par_id', 'art_id', 'keyword', 'country_code', 'text'])
test_preds, _ = best_model.predict(test_df.text.tolist())
labels2file([[k] for k in test_preds], 'submission/test.txt')
print(f'submission/dev.txt: {len(best_preds)} predictions')
print(f'submission/test.txt: {len(test_preds)} predictions')

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


8it [00:15,  1.95s/it]                       
  with amp.autocast():
Predicting: 100%|██████████| 39/39 [00:14<00:00,  2.70it/s]

dev.txt: 2094 predictions
test.txt: 3832 predictions



