In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from urllib import request
import datasets
import pandas as pd
import torch
from torch.utils.data import DataLoader
import transformers
from transformers.models.auto.modeling_auto import AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import pickle
import evaluate

In [3]:
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(device)

cuda


In [4]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))
from dont_patronize_me import DontPatronizeMe
dpm = DontPatronizeMe('.', '.')
dpm.load_task1()

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [5]:
trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

data_tr = dpm.train_task1_df
data_tr.head()
data_te = dpm.train_task1_df.copy(deep=True)



# Rebuild training set (Task 1)

In [6]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):  
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  keyword = data_tr.loc[data_tr.par_id == parid].keyword.values[0]
  text = data_tr.loc[data_tr.par_id == parid].text.values[0]
  label = data_tr.loc[data_tr.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })

# Concatenating the community column to the text column
trdf1 = pd.DataFrame(rows)
trdf1["text"] = trdf1["community"] + " " + trdf1["text"]
# Split into train and internal dev set
train_data, internal_dev_data = train_test_split(trdf1, test_size=0.2, random_state=69)

In [7]:
# Upsample positive pcl class
pcldf = train_data[train_data.label==1]
upsample_factor = 9 # 9 = 1:1 ratio approximately

#npos = len(pcldf)
augmented_data = pd.read_pickle("augmented_data.pkl")
augmented_data_df = pd.DataFrame(augmented_data, columns=['text'])
augmented_data_df['label'] = 1
augmented_data_df['text'] = augmented_data_df['text'].str.replace('\n\n', '')

upsampled = pd.concat([pcldf]*upsample_factor)

#training_set_even = pd.concat([pcldf,train_data[train_data.label==0][:npos*10]])
training_set_even = pd.concat([pcldf, upsampled, train_data[train_data.label==0]])

#training_set_even = pd.concat([training_set_even, augmented_data_df.sample(n=500, random_state=42)], ignore_index=True)

In [8]:
training_set_even

Unnamed: 0,par_id,community,text,label
376,1052,homeless,"homeless There is infinitely more to be done ,...",1
545,496,refugee,refugee Hundreds of thousands of Rohingya refu...,1
711,2509,homeless,"homeless ""2015 donation drives , feed the hung...",1
686,3598,in-need,in-need For her unwavering commitment to aidin...,1
314,9923,hopeless,hopeless The first point worth mentioning is l...,1
...,...,...,...,...
5472,5158,immigrant,"immigrant Jiverly Antares Wong , 42 , a Vietna...",0
1830,1160,poor-families,poor-families Without the evidence and measure...,0
4374,3962,disabled,disabled Democrats also criticized Price for h...,0
4041,3597,women,women It is thought that he may have been invo...,0


In [9]:
training_set_even["label"].value_counts()

1    6370
0    6063
Name: label, dtype: int64

In [10]:
training_set_even.head()

Unnamed: 0,par_id,community,text,label
376,1052,homeless,"homeless There is infinitely more to be done ,...",1
545,496,refugee,refugee Hundreds of thousands of Rohingya refu...,1
711,2509,homeless,"homeless ""2015 donation drives , feed the hung...",1
686,3598,in-need,in-need For her unwavering commitment to aidin...,1
314,9923,hopeless,hopeless The first point worth mentioning is l...,1


# Rebuild official dev set (Task 1)

In [11]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):  
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  keyword = data_te.loc[data_te.par_id == parid].keyword.values[0]
  text = data_te.loc[data_te.par_id == parid].text.values[0]
  label = data_te.loc[data_te.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })
tedf1 = pd.DataFrame(rows)
# Concatenating the community column to the text column
tedf1["text"] = tedf1["community"] + " " + tedf1["text"]

# Rebuild official test set

In [12]:
rows=[]
with open("task4_test.tsv") as f:
  for line in f:
    t=line.strip().split('\t')
    rows.append(t)
official_test_set = pd.DataFrame(rows, columns="par_id art_id community country text".split())

In [13]:
official_test_set["text"] = official_test_set["community"] + " " + official_test_set["text"]

# Hyperparameters

In [14]:
batch_size = 64
learning_rate = 1e-5
pretrained_model = "distilbert-base-cased"
num_epochs = 1

In [15]:
tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model, num_labels=2)
def tokenize_function(examples):
  return tokenizer(examples["text"], padding="max_length", truncation=True)

In [16]:
training_set_even = training_set_even[['text', 'label']]
training_set_even = datasets.Dataset.from_pandas(training_set_even)
training_set_even = training_set_even.map(tokenize_function, batched=True)
training_set_even = training_set_even.remove_columns(["text", "__index_level_0__"])
training_set_even = training_set_even.rename_column("label", "labels")
training_set_even.set_format("torch")
train_even_dataloader = DataLoader(training_set_even, shuffle=True, batch_size=batch_size)

internal_dev_data = internal_dev_data[['text', 'label']]
internal_dev_data = datasets.Dataset.from_pandas(internal_dev_data)
internal_dev_data = internal_dev_data.map(tokenize_function, batched=True)
internal_dev_data = internal_dev_data.remove_columns(["text", "__index_level_0__"])
internal_dev_data = internal_dev_data.rename_column("label", "labels")
internal_dev_data.set_format("torch")
internal_dev_dataloader = DataLoader(internal_dev_data, shuffle=True, batch_size=batch_size)

official_dev_set = tedf1[['text', 'label']]
official_dev_set = datasets.Dataset.from_pandas(official_dev_set)
official_dev_set = official_dev_set.map(tokenize_function, batched=True)
official_dev_set = official_dev_set.remove_columns(["text"])
official_dev_set = official_dev_set.rename_column("label", "labels")
official_dev_set.set_format("torch")
# need to submit predictions for the official dev set so shuffle is set to False to preserve order
official_dev_set_dataloader = DataLoader(official_dev_set, shuffle=False, batch_size=batch_size)

official_test_set = official_test_set[["text"]]
official_test_set = datasets.Dataset.from_pandas(official_test_set)
official_test_set = official_test_set.map(tokenize_function, batched=True)
official_test_set = official_test_set.remove_columns(["text"])
official_test_set.set_format("torch")
# need to submit predictions for the official test set so shuffle is set to False to preserve order
official_test_set_dataloader = DataLoader(official_test_set, shuffle=False, batch_size=batch_size)

Map:   0%|          | 0/12433 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Map:   0%|          | 0/2094 [00:00<?, ? examples/s]

Map:   0%|          | 0/3832 [00:00<?, ? examples/s]

# Loading in the transformer

In [54]:
model = transformers.AutoModelForSequenceClassification.from_pretrained(pretrained_model)
model.to(device)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bia

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [55]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [56]:
from tqdm.auto import tqdm
num_training_steps = num_epochs * len(train_even_dataloader)
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_even_dataloader:
        #print(batch.items())
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    
    validation_metric = evaluate.combine(["accuracy", "precision", "f1", "recall"])
    model.eval()
    
    for batch in internal_dev_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        max_pred = max(0, torch.max(predictions))
        min_pred = min(1, torch.min(predictions))
        validation_metric.add_batch(predictions=predictions, references=batch["labels"])

    print(validation_metric.compute())

  0%|          | 0/195 [00:00<?, ?it/s]

{'accuracy': 0.8495522388059702, 'precision': 0.35384615384615387, 'f1': 0.4771784232365146, 'recall': 0.732484076433121}


In [None]:
metric = evaluate.combine(["accuracy", "precision", "f1", "recall"])
model.eval()
for batch in official_dev_set_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    max_pred = max(0, torch.max(predictions))
    min_pred = min(1, torch.min(predictions))
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

In [None]:
#torch.save(model.state_dict(), "model")

In [None]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(str(pi)+'\n')

In [None]:
# loading + reronning
metric = evaluate.combine(["accuracy", "precision", "f1", "recall"])
model = transformers.AutoModelForSequenceClassification.from_pretrained(pretrained_model)
model.to(device)

predictions_out = []

model.load_state_dict(torch.load("model"))
model.eval()
for batch in official_dev_set_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    predictions_out = predictions_out + predictions.flatten().tolist()
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.w

{'accuracy': 0.9049665711556829,
 'precision': 0.5,
 'f1': 0.5382830626450117,
 'recall': 0.5829145728643216}

In [None]:
with open("dev.txt", "r") as f:
    dev = f.readlines()

dev = [x.strip() for x in dev]
dev = [int(x) for x in dev]

In [None]:
dev == predictions_out

True

In [None]:
predictions_out[:5]

[0, 0, 0, 0, 1]

In [None]:
labels2file(predictions_out, "dev.txt")

In [None]:
predictions_out = []
model.eval()
for batch in official_test_set_dataloader:
  batch = {k: v.to(device) for k, v in batch.items()}
  with torch.no_grad():
    outputs = model(**batch)
  
  logits = outputs.logits
  predictions = torch.argmax(logits, dim=-1)
  predictions_out = predictions_out + predictions.flatten().tolist()

In [None]:
labels2file(predictions_out, "test.txt")