# Main imports and code

In [1]:
# check which gpu we're using
!nvidia-smi

Tue Feb 25 17:54:58 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install simpletransformers
!pip install tensorboardx

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.42.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets-

In [4]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [5]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import numpy as np
import logging
import torch
from collections import Counter
from ast import literal_eval

from transformers import AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import AdamW
from transformers import AutoModelForSequenceClassification
from transformers import get_scheduler

import evaluate

from datasets import Dataset

from torch.utils.data import DataLoader

import nltk
from nltk.corpus import wordnet
import random

In [6]:
# Download WordNet corpus
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [7]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [8]:
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


# Fetch Don't Patronize Me! data manager module

In [None]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [None]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [None]:
from dont_patronize_me import DontPatronizeMe

In [None]:
dpm = DontPatronizeMe('./Dont_Patronize_Me_Trainingset', '.')

In [None]:
dpm.load_task1()
dpm.load_task2(return_one_hot=True)

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


# Load paragraph IDs

In [None]:
trids = pd.read_csv('./Train_Dev_Split/train_semeval_parids-labels.csv')
teids = pd.read_csv('./Train_Dev_Split/dev_semeval_parids-labels.csv')

In [None]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

In [None]:
data=dpm.train_task1_df

In [None]:
data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4




# Rebuild training set (Task 1)

In [None]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [None]:
import random

In [None]:
trdf1 = pd.DataFrame(rows)

In [None]:
trdf1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
8370,8380,refugee,Rescue teams search for survivors on the rubbl...,0
8371,8381,hopeless,The launch of ' Happy Birthday ' took place la...,0
8372,8382,homeless,"The unrest has left at least 20,000 people dea...",0
8373,8383,hopeless,You have to see it from my perspective . I may...,0


# Rebuild test set (Task 1)

In [None]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })

In [None]:
len(rows)

2094

In [None]:
tedf1 = pd.DataFrame(rows)
tedf1

Unnamed: 0,par_id,community,text,label
0,4046,hopeless,We also know that they can benefit by receivin...,1
1,1279,refugee,Pope Francis washed and kissed the feet of Mus...,1
2,8330,refugee,Many refugees do n't want to be resettled anyw...,1
3,4063,in-need,"""Budding chefs , like """" Fred """" , """" Winston ...",1
4,4089,homeless,"""In a 90-degree view of his constituency , one...",1
...,...,...,...,...
2089,10462,homeless,"The sad spectacle , which occurred on Saturday...",0
2090,10463,refugee,""""""" The Pakistani police came to our house and...",0
2091,10464,disabled,"""When Marie O'Donoghue went looking for a spec...",0
2092,10465,women,"""Sri Lankan norms and culture inhibit women fr...",0


In [None]:
# random.shuffle(tedf1)

# RoBERTa Baseline for Task 1

In [None]:
# downsample negative instances
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])
test_set1 = tedf1

In [None]:
training_set1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
2377,1775,refugee,Last but not the least element of culpability ...,0
2378,1776,refugee,"Then , taking the art of counter-intuitive non...",0
2379,1777,refugee,Kagunga village was reported to lack necessary...,0
2380,1778,vulnerable,"""After her parents high-profile divorce after ...",0


In [None]:
test_set1

Unnamed: 0,par_id,community,text,label
0,4046,hopeless,We also know that they can benefit by receivin...,1
1,1279,refugee,Pope Francis washed and kissed the feet of Mus...,1
2,8330,refugee,Many refugees do n't want to be resettled anyw...,1
3,4063,in-need,"""Budding chefs , like """" Fred """" , """" Winston ...",1
4,4089,homeless,"""In a 90-degree view of his constituency , one...",1
...,...,...,...,...
2089,10462,homeless,"The sad spectacle , which occurred on Saturday...",0
2090,10463,refugee,""""""" The Pakistani police came to our house and...",0
2091,10464,disabled,"""When Marie O'Donoghue went looking for a spec...",0
2092,10465,women,"""Sri Lankan norms and culture inhibit women fr...",0


In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Synonym Replacement
def replace_synonyms(text, num_replacements=1):
    words = text.split()
    new_words = words.copy()
    words_to_replace = random.sample(words, min(num_replacements, len(words)))

    for word in words_to_replace:
        synonyms = set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name().replace("_", " "))
        synonyms.discard(word)

        if synonyms:
            new_word = random.choice(list(synonyms))
            new_words[new_words.index(word)] = new_word

    return " ".join(new_words)

In [None]:
class AugmentedDataset(Dataset):
    def _init_(self, texts, labels, augment=True):
        self.texts = texts
        self.labels = labels
        self.augment = augment

    def _len_(self):
        return len(self.texts)

    def _getitem_(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        if self.augment:
            if random.random() > 0.5:
                text = replace_synonyms(text)
            # if random.random() > 0.5:
            #     text = random_swap(text)

        return text, label

In [None]:

# task1_model_args = ClassificationArgs(num_train_epochs=1,
                                    #   no_save=True,
                                    #   no_cache=True,
                                    #   overwrite_output_dir=True)
# task1_model = ClassificationModel("deberta",
                                #   'microsoft/deberta-v2-base',
                                #   args = task1_model_args,
                                #   num_labels=2,
                                #   use_cuda=cuda_available)
# Loading the pretrained model

def getModel(checkpoint):
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    model = model.to(device)

    return model

def getDataLoaders(checkpoint):
    # Perform tokenization
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    hf_dataset_train = AugmentedDataset.from_pandas(training_set1)
    hf_dataset_test = AugmentedDataset.from_pandas(test_set1)

    # Tokenization function
    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True, padding=True, max_length=512)

    # Apply tokenization
    tokenized_datasets_train = hf_dataset_train.map(tokenize_function)
    tokenized_datasets_test = hf_dataset_test.map(tokenize_function)

    # Rename label column and set format for PyTorch
    tokenized_datasets_train = tokenized_datasets_train.remove_columns(["par_id", "text", "community", "__index_level_0__"])
    tokenized_datasets_train = tokenized_datasets_train.rename_column("label", "labels")
    tokenized_datasets_train.set_format("torch")

    tokenized_datasets_test = tokenized_datasets_test.remove_columns(["par_id", "text", "community"])
    tokenized_datasets_test = tokenized_datasets_test.rename_column("label", "labels")
    tokenized_datasets_test.set_format("torch")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_dataloader = DataLoader(
        tokenized_datasets_train, shuffle=True, batch_size=8, collate_fn=data_collator
    )

    eval_dataloader = DataLoader(
        tokenized_datasets_test, batch_size=8, collate_fn=data_collator
    )

    return train_dataloader, eval_dataloader


In [None]:
# Deberta
deberta = "microsoft/deberta-v3-base"

# RoBERTa
roberta = "FacebookAI/roberta-base"

checkpoint = deberta

model = getModel(checkpoint)
train_dataloader, eval_dataloader = getDataLoaders(checkpoint)


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 2382/2382 [00:01<00:00, 2208.73 examples/s]
Map: 100%|██████████| 2094/2094 [00:00<00:00, 2474.91 examples/s]


In [None]:
# Setting up model training for fine-tuning
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)



In [None]:
# Setting the model to training mode
model.train()

# Running the training
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
    print("Epoch: " + epoch)

0
1
2
3
4


In [None]:
# labels2file([[k] for k in preds_task1], 'task1.txt')

In [None]:
# Setting the model to evaluation mode
model.eval()

# Running evaluation
metric = evaluate.load("f1")
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # print("Unique Predictions:", torch.unique(predictions))
    metric.add_batch(predictions=predictions, references=batch["labels"])

print(metric.compute())

{'f1': 0.40238450074515647}


# Ensemble method implementation

In [None]:
class EnsembleModel(torch.nn.Module):
    def __init__(self, model1, model2, hidden_size=768):
        super(EnsembleModel, self).__init__()
        self.model1 = model1
        self.model2 = model2
        self.classifier = torch.nn.Linear(4, 2)

    def forward(self, input_ids1, input_ids2):
        outputs1 = self.model1(input_ids=input_ids1).logits
        outputs2 = self.model2(input_ids=input_ids2).logits
        concatenated = torch.cat((outputs1, outputs2), dim=0)
        logits = self.classifier(concatenated)
        return logits

def getEnsembleData():
    # Perform tokenization
    tokenizer1 = AutoTokenizer.from_pretrained(deberta)
    tokenizer2 = AutoTokenizer.from_pretrained(roberta)

    hf_dataset_train = AugmentedDataset.from_pandas(training_set1)
    hf_dataset_test = AugmentedDataset.from_pandas(test_set1)

    # Tokenization function
    def tokenize_function(example):
        return {
            "input_ids1": tokenizer1(example["text"], truncation=True, padding="max_length", max_length=128)["input_ids"],
            "input_ids2": tokenizer2(example["text"], truncation=True, padding="max_length", max_length=128)["input_ids"],
        }

    # Apply tokenization
    tokenized_datasets_train = hf_dataset_train.map(tokenize_function)
    tokenized_datasets_test = hf_dataset_test.map(tokenize_function)

    # Rename label column and set format for PyTorch
    tokenized_datasets_train = tokenized_datasets_train.remove_columns(["par_id", "text", "community", "__index_level_0__"])
    tokenized_datasets_train = tokenized_datasets_train.rename_column("label", "labels")
    tokenized_datasets_train.set_format("torch")

    tokenized_datasets_test = tokenized_datasets_test.remove_columns(["par_id", "text", "community"])
    tokenized_datasets_test = tokenized_datasets_test.rename_column("label", "labels")
    tokenized_datasets_test.set_format("torch")

    return tokenized_datasets_train, tokenized_datasets_test

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)


In [None]:
torch.cuda.empty_cache()
train_ds, test_ds = getEnsembleData()
torch.cuda.empty_cache()
model1 = getModel(deberta)
model2 = getModel(roberta)
ensemble_model = EnsembleModel(model1, model2)

Map: 100%|██████████| 2382/2382 [00:01<00:00, 1369.39 examples/s]
Map: 100%|██████████| 2094/2094 [00:01<00:00, 1421.36 examples/s]
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 376.00 MiB. GPU 0 has a total capacity of 7.92 GiB of which 117.56 MiB is free. Including non-PyTorch memory, this process has 7.53 GiB memory in use. Of the allocated memory 7.02 GiB is allocated by PyTorch, and 422.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:


def collate_fn(batch):
    # Assuming each item in batch is a dict of tensors
    return {key: torch.stack([item[key] for item in batch]).to(device) for key in batch[0]}

from torch.utils.data import DataLoader
train_loader = DataLoader(train_ds, batch_size=2, collate_fn=collate_fn)

for batch in train_loader:
    # Now batch['input_ids1'], batch['input_ids2'], etc. are on GPU
    output1 = model1(batch["input_ids1"]).logits
    print(output1.shape)
    output2 = model2(batch["input_ids2"]).logits
    print(output2.shape)
    concatenated = torch.cat((output1, output2), dim=0)
    print(concatenated.shape)
    break

Map: 100%|██████████| 2382/2382 [00:01<00:00, 1314.86 examples/s]
Map: 100%|██████████| 2094/2094 [00:01<00:00, 1425.08 examples/s]
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 376.00 MiB. GPU 0 has a total capacity of 7.92 GiB of which 103.12 MiB is free. Including non-PyTorch memory, this process has 7.53 GiB memory in use. Of the allocated memory 7.02 GiB is allocated by PyTorch, and 422.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to=[],
)


trainer = Trainer(
    model=ensemble_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

trainer.train()




RuntimeError: mat1 and mat2 shapes cannot be multiplied (8x2 and 4x2)

# Rebuild training set (Task 2)

In [None]:
rows2 = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  label = trids.label[idx]
  # select row from original dataset to retrieve the `text` value
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  rows2.append({
      'par_id':parid,
      'text':text,
      'label':label
  })


In [None]:
trdf2 = pd.DataFrame(rows2)

In [None]:
trdf2

Unnamed: 0,par_id,text,label
0,4341,"The scheme saw an estimated 150,000 children f...","[1, 0, 0, 1, 0, 0, 0]"
1,4136,Durban 's homeless communities reconciliation ...,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,The next immediate problem that cropped up was...,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,Far more important than the implications for t...,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,To strengthen child-sensitive social protectio...,"[1, 0, 0, 1, 1, 1, 0]"
...,...,...,...
8370,8380,Rescue teams search for survivors on the rubbl...,"[0, 0, 0, 0, 0, 0, 0]"
8371,8381,The launch of ' Happy Birthday ' took place la...,"[0, 0, 0, 0, 0, 0, 0]"
8372,8382,"The unrest has left at least 20,000 people dea...","[0, 0, 0, 0, 0, 0, 0]"
8373,8383,You have to see it from my perspective . I may...,"[0, 0, 0, 0, 0, 0, 0]"


In [None]:
trdf2.label = trdf2.label.apply(literal_eval)

# Rebuild test set (Task 2)

In [None]:
rows2 = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = teids.par_id[idx]
  label = teids.label[idx]
  #print(parid)
  # select row from original dataset to access the `text` value
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  rows2.append({
      'par_id':parid,
      'text':text,
      'label':label
  })


In [None]:
tedf2 = pd.DataFrame(rows2)

In [None]:
tedf2

Unnamed: 0,par_id,text,label
0,4046,We also know that they can benefit by receivin...,"[1, 0, 0, 1, 0, 0, 0]"
1,1279,Pope Francis washed and kissed the feet of Mus...,"[0, 1, 0, 0, 0, 0, 0]"
2,8330,Many refugees do n't want to be resettled anyw...,"[0, 0, 1, 0, 0, 0, 0]"
3,4063,"""Budding chefs , like """" Fred """" , """" Winston ...","[1, 0, 0, 1, 1, 1, 0]"
4,4089,"""In a 90-degree view of his constituency , one...","[1, 0, 0, 0, 0, 0, 0]"
...,...,...,...
2089,10462,"The sad spectacle , which occurred on Saturday...","[0, 0, 0, 0, 0, 0, 0]"
2090,10463,""""""" The Pakistani police came to our house and...","[0, 0, 0, 0, 0, 0, 0]"
2091,10464,"""When Marie O'Donoghue went looking for a spec...","[0, 0, 0, 0, 0, 0, 0]"
2092,10465,"""Sri Lankan norms and culture inhibit women fr...","[0, 0, 0, 0, 0, 0, 0]"


In [None]:
tedf2.label = tedf2.label.apply(literal_eval)

# RoBERTa baseline for Task 2

In [None]:
all_negs = trdf2[trdf2.label.apply(lambda x:sum(x) == 0)]
all_pos = trdf2[trdf2.label.apply(lambda x:sum(x) > 0)]

training_set2 = pd.concat([all_pos,all_negs[:round(len(all_pos)*0.5)]])

In [None]:
training_set2

Unnamed: 0,par_id,text,label
0,4341,"The scheme saw an estimated 150,000 children f...","[1, 0, 0, 1, 0, 0, 0]"
1,4136,Durban 's homeless communities reconciliation ...,"[0, 1, 0, 0, 0, 0, 0]"
2,10352,The next immediate problem that cropped up was...,"[1, 0, 0, 0, 0, 1, 0]"
3,8279,Far more important than the implications for t...,"[0, 0, 0, 1, 0, 0, 0]"
4,1164,To strengthen child-sensitive social protectio...,"[1, 0, 0, 1, 1, 1, 0]"
...,...,...,...
1186,434,""""""" I was absolutely useless at school , hopel...","[0, 0, 0, 0, 0, 0, 0]"
1187,435,I also noticed the change in socio-economic le...,"[0, 0, 0, 0, 0, 0, 0]"
1188,436,"Can Donald Trump win ? It 's possible , but ce...","[0, 0, 0, 0, 0, 0, 0]"
1189,437,He added that any introduction of new law must...,"[0, 0, 0, 0, 0, 0, 0]"


In [None]:
task2_model_args = MultiLabelClassificationArgs(num_train_epochs=1,
                                                no_save=True,
                                                no_cache=True,
                                                overwrite_output_dir=True
                                                )
task2_model = MultiLabelClassificationModel("roberta",
                                            'roberta-base',
                                            num_labels=7,
                                            args = task2_model_args,
                                            use_cuda=cuda_available)
# train model
task2_model.train_model(training_set2[['text', 'label']])
# run predictions
preds_task2, _ = task2_model.predict(tedf2.text.tolist())

Some weights of RobertaForMultiLabelSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` 

Process ForkPoolWorker-22:
Process ForkPoolWorker-14:
Process ForkPoolWorker-28:
Process ForkPoolWorker-25:
Process ForkPoolWorker-11:
Process ForkPoolWorker-24:
Process ForkPoolWorker-10:
  0%|          | 0/2 [04:37<?, ?it/s]Process ForkPoolWorker-21:
Process ForkPoolWorker-30:
Process ForkPoolWorker-29:
Traceback (most recent call last):
Process ForkPoolWorker-8:
Process ForkPoolWorker-23:
Process ForkPoolWorker-12:
Process ForkPoolWorker-27:
Traceback (most recent call last):
Process ForkPoolWorker-20:
Process ForkPoolWorker-26:
Process ForkPoolWorker-18:



KeyboardInterrupt: 

In [None]:
labels2file(preds_task2, 'task2.txt')

## Prepare submission

In [None]:
!cat task1.txt | head -n 10

1
1
0
1
0
0
1
1
0
1


In [None]:
!cat task2.txt | head -n 10

1,0,0,0,0,0,0
1,0,0,0,0,0,0
0,0,0,0,0,0,0
1,0,0,0,0,1,0
0,0,0,0,0,0,0
0,0,0,0,0,0,0
1,0,0,0,0,0,0
1,0,0,0,0,0,0
0,0,0,0,0,0,0
0,0,1,0,0,1,0


In [None]:
!zip submission.zip task1.txt task2.txt

  adding: task1.txt (deflated 92%)
  adding: task2.txt (deflated 97%)
