In [None]:
import os
from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')
# Make the shared-drive project folder the working directory; relative paths
# used later in the notebook (the CSV file, output_dir './') resolve from here.
%cd /content/drive/Shareddrives/CS263

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/Shareddrives/CS263


In [None]:
#!pip install datasets
#!pip install transformers[torch]

In [None]:
import csv

# Task A: binary label string -> class id.
label_dict = {'not sexist': 0, 'sexist': 1}

# Task B: category string -> class id. 'none' has its own id (4), separate
# from the four real categories (0-3).
category_dict = {'1. threats, plans to harm and incitement': 0, '2. derogation': 1, '3. animosity': 2, '4. prejudiced discussions': 3, 'none': 4}

# Task C: fine-grained vector string -> class id. The eleven real vectors use
# ids 0-10, matching the 11-way classifier head built for Task C.
vector_dict = {
    '1.1 threats of harm': 0,
    '1.2 incitement and encouragement of harm': 1,
    '2.1 descriptive attacks': 2,
    '2.2 aggressive and emotive attacks': 3,
    '2.3 dehumanising attacks & overt sexual objectification': 4,
    '3.1 casual use of gendered slurs, profanities, and insults': 5,
    '3.2 immutable gender differences and gender stereotypes': 6,
    '3.3 backhanded gendered compliments': 7,
    '3.4 condescending explanations or unwelcome advice': 8,
    '4.1 supporting mistreatment of individual women': 9,
    '4.2 supporting systemic discrimination against women as a group': 10,
    # 'none' previously mapped to 0, which made it indistinguishable from
    # '1.1 threats of harm'. It now gets its own id outside the 0-10 range,
    # mirroring how category_dict treats 'none'.
    'none': 11,
    }

# Read the aggregated EDOS CSV into memory. newline='' is what the csv module
# asks for so that line breaks inside quoted text fields are handed to the
# parser untouched instead of being translated by the file object first.
with open('edos_labelled_aggregated.csv', newline='') as file:
  data = list(csv.reader(file, quotechar='"'))

# One list of row dicts per (task, split). Task A receives every row of a
# split; Tasks B and C receive only the rows whose label_sexist is 1.
taskA_train, taskA_dev, taskA_test = [], [], []
taskB_train, taskB_dev, taskB_test = [], [], []
taskC_train, taskC_dev, taskC_test = [], [], []

# data[0] is skipped as the header row; columns are read by position.
for row in data[1:]:
  record = {
      'rewire_id': row[0],
      'text': row[1],
      'label_sexist': label_dict[row[2]],
      'label_category': category_dict[row[3]],
      'label_vector': vector_dict[row[4]],
      'split': row[5]
      }

  # Pick the destination lists for this row's split. Any split value other
  # than 'train' or 'dev' is treated as test, as before.
  if record['split'] == 'train':
    task_a, task_b, task_c = taskA_train, taskB_train, taskC_train
  elif record['split'] == 'dev':
    task_a, task_b, task_c = taskA_dev, taskB_dev, taskC_dev
  else:
    task_a, task_b, task_c = taskA_test, taskB_test, taskC_test

  task_a.append(record)
  if record['label_sexist'] == 1:
    # The same dict object is shared by the Task A/B/C lists.
    task_b.append(record)
    task_c.append(record)


In [None]:
# Row counts of the Task A train / dev / test splits, in that order.
print(*(len(split) for split in (taskA_train, taskA_dev, taskA_test)))

14000 2000 4000


In [None]:
# Row counts of the Task B train / dev / test splits, in that order.
print(*(len(split) for split in (taskB_train, taskB_dev, taskB_test)))

3398 486 970


In [None]:
# Row counts of the Task C train / dev / test splits, in that order.
print(*(len(split) for split in (taskC_train, taskC_dev, taskC_test)))

3398 486 970


In [None]:
from sklearn.metrics import (accuracy_score,
                             auc,
                             confusion_matrix,
                              precision_score,
                              recall_score,
                              f1_score
                            )

def get_acc(y_test, y_pred):
    """Print accuracy and weighted precision, recall and F1 for predictions.

    Args:
        y_test: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_test``.

    Returns:
        None. The four scores are written to stdout; accuracy is rounded to
        five decimals, the other three are printed unrounded.
    """
    # A class with no predicted (or no true) samples has an undefined
    # precision/recall. scikit-learn's default scores it as 0 and emits a
    # warning; zero_division=0 keeps the same value without the warning.
    weighted = {'average': 'weighted', 'zero_division': 0}

    acc = round(accuracy_score(y_test, y_pred), 5)
    precision = precision_score(y_test, y_pred, **weighted)
    recall = recall_score(y_test, y_pred, **weighted)
    f1score = f1_score(y_test, y_pred, **weighted)

    print(f"Accuracy: {acc}")
    print(f"precision: {precision}")
    print(f"recall: {recall}")
    print(f"f1 score: {f1score}")

In [None]:
from datasets import Dataset, load_metric
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np

# Replace each split's list of row dicts with a `datasets.Dataset` built from
# it, so later cells can read whole columns (e.g. taskA_train['text']).
# The names are rebound in place: after this cell they no longer refer to lists.
taskA_train, taskA_dev, taskA_test = [
    Dataset.from_list(rows) for rows in (taskA_train, taskA_dev, taskA_test)
]
taskB_train, taskB_dev, taskB_test = [
    Dataset.from_list(rows) for rows in (taskB_train, taskB_dev, taskB_test)
]
taskC_train, taskC_dev, taskC_test = [
    Dataset.from_list(rows) for rows in (taskC_train, taskC_dev, taskC_test)
]

In [None]:
# Display the Task A training split to check its columns and row count.
taskA_train

Dataset({
    features: ['rewire_id', 'text', 'label_sexist', 'label_category', 'label_vector', 'split'],
    num_rows: 14000
})

In [None]:
class EDOS(torch.utils.data.Dataset):
  """Torch dataset pairing per-example encodings with integer labels.

  Args:
    encodings: Mapping from feature name to a sequence indexable by example
      position (presumably the tokenizer output -- confirm against caller).
    labels: Sequence of labels; its length defines the dataset length.
  """

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __len__(self):
    # The label sequence is the source of truth for the number of examples.
    return len(self.labels)

  def __getitem__(self, idx):
    # Convert every encoded feature of example `idx` to a tensor, then attach
    # the label under the 'labels' key.
    sample = {}
    for name, column in self.encodings.items():
      sample[name] = torch.tensor(column[idx])
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample

def compute_metrics(eval_pred):
  """Compute accuracy for a (logits, labels) evaluation pair.

  Args:
    eval_pred: Two-item iterable of (predictions, labels). Predictions are
      per-class scores with classes on axis 1; labels are class ids.

  Returns:
    dict with a single key 'accuracy' holding the fraction of examples whose
    arg-max prediction equals the label, as a Python float.
  """
  preds, labels = eval_pred
  preds = np.argmax(preds, axis = 1)
  # Accuracy is computed directly so this function does not rely on a
  # module-level `metric` object that is only created in a later cell.
  return {'accuracy': float(np.mean(preds == np.asarray(labels)))}

In [None]:
model_path = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_path)


def _encode_split(split, label_column):
  """Tokenize one split's 'text' column and wrap it with the given labels.

  Returns the (encodings, EDOS dataset) pair for that split.
  """
  encodings = tokenizer(split['text'], truncation = True, padding = True)
  return encodings, EDOS(encodings, split[label_column])


# Task A is trained on the binary 'label_sexist' column.
taskA_train_encodings, taskA_train_dataset = _encode_split(taskA_train, 'label_sexist')
taskA_dev_encodings, taskA_dev_dataset = _encode_split(taskA_dev, 'label_sexist')
taskA_test_encodings, taskA_test_dataset = _encode_split(taskA_test, 'label_sexist')

# Task B is trained on the 'label_category' column.
taskB_train_encodings, taskB_train_dataset = _encode_split(taskB_train, 'label_category')
taskB_dev_encodings, taskB_dev_dataset = _encode_split(taskB_dev, 'label_category')
taskB_test_encodings, taskB_test_dataset = _encode_split(taskB_test, 'label_category')

# Task C is trained on the 'label_vector' column.
taskC_train_encodings, taskC_train_dataset = _encode_split(taskC_train, 'label_vector')
taskC_dev_encodings, taskC_dev_dataset = _encode_split(taskC_dev, 'label_vector')
taskC_test_encodings, taskC_test_dataset = _encode_split(taskC_test, 'label_vector')

# Accuracy metric object read by compute_metrics during evaluation.
metric = load_metric('accuracy')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  metric = load_metric('accuracy')


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
batch_size = 64
epoch = 10

# Task A hyper-parameters, collected first and then handed to
# TrainingArguments in one go.
# NOTE(review): every task in this notebook uses output_dir './' -- confirm
# that checkpoints from different runs are not expected to stay separate.
taskA_training_kwargs = dict(
    output_dir = './',
    # Optimisation schedule.
    learning_rate = 1e-5,
    num_train_epochs = epoch,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    # Log and evaluate on the dev set every 100 optimisation steps.
    logging_steps = 100,
    evaluation_strategy = 'steps',
    eval_steps = 100,
    # Reload the checkpoint with the best dev accuracy once training ends.
    metric_for_best_model = 'eval_accuracy',
    load_best_model_at_end = True,
)
taskA_training_args = TrainingArguments(**taskA_training_kwargs)

In [None]:
# Load the checkpoint named by model_path with a 2-class head for Task A.
taskA_model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels = 2,
)

# Train on the Task A training split, evaluating on the dev split with
# compute_metrics.
taskA_trainer_kwargs = dict(
    model = taskA_model,
    args = taskA_training_args,
    train_dataset = taskA_train_dataset,
    eval_dataset = taskA_dev_dataset,
    compute_metrics = compute_metrics,
)
taskA_trainer = Trainer(**taskA_trainer_kwargs)

taskA_trainer.train()

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.

Step,Training Loss,Validation Loss,Accuracy
100,0.5326,0.445942,0.7905
200,0.4031,0.366152,0.8455
300,0.3348,0.36385,0.8555
400,0.3228,0.33418,0.858
500,0.2924,0.335001,0.858
600,0.2709,0.343653,0.8645
700,0.2616,0.36987,0.86
800,0.2269,0.346753,0.856
900,0.2372,0.346371,0.8635
1000,0.1961,0.361598,0.854


TrainOutput(global_step=2190, training_loss=0.21450593983201677, metrics={'train_runtime': 1258.9888, 'train_samples_per_second': 111.2, 'train_steps_per_second': 1.739, 'total_flos': 3368604551760000.0, 'train_loss': 0.21450593983201677, 'epoch': 10.0})

In [None]:
# Score the Task A test split: per-class scores first, then the arg-max class.
taskA_test_scores = taskA_trainer.predict(taskA_test_dataset).predictions
y_pred = taskA_test_scores.argmax(axis = 1)

# Report accuracy / weighted precision / recall / F1 against the gold labels.
get_acc(taskA_test['label_sexist'], y_pred)

Accuracy: 0.86025
precision: 0.8570944188054948
recall: 0.86025
f1 score: 0.8582942310087377


In [None]:
epoch = 20

# Task B hyper-parameters, collected first and then handed to
# TrainingArguments in one go. batch_size is reused from the Task A cell.
taskB_training_kwargs = dict(
    output_dir = './',
    # Optimisation schedule.
    learning_rate = 1e-5,
    num_train_epochs = epoch,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    # Log and evaluate on the dev set every 100 optimisation steps.
    logging_steps = 100,
    evaluation_strategy = 'steps',
    eval_steps = 100,
    # Reload the checkpoint with the best dev accuracy once training ends.
    metric_for_best_model = 'eval_accuracy',
    load_best_model_at_end = True,
)
taskB_training_args = TrainingArguments(**taskB_training_kwargs)

In [None]:
# Load the checkpoint named by model_path with a 4-class head for Task B.
taskB_model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels = 4,
)

# Train on the Task B training split, evaluating on the dev split with
# compute_metrics.
taskB_trainer_kwargs = dict(
    model = taskB_model,
    args = taskB_training_args,
    train_dataset = taskB_train_dataset,
    eval_dataset = taskB_dev_dataset,
    compute_metrics = compute_metrics,
)
taskB_trainer = Trainer(**taskB_trainer_kwargs)

taskB_trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.

Step,Training Loss,Validation Loss,Accuracy
100,1.1425,1.044932,0.520576
200,0.9338,0.934399,0.569959
300,0.7842,0.904523,0.590535
400,0.656,0.91179,0.609053
500,0.5457,0.924867,0.625514
600,0.4408,0.970606,0.615226
700,0.3631,1.018975,0.615226
800,0.3025,1.064995,0.613169
900,0.2537,1.092072,0.62963
1000,0.229,1.115481,0.627572


TrainOutput(global_step=1080, training_loss=0.5397217732888681, metrics={'train_runtime': 541.2493, 'train_samples_per_second': 125.561, 'train_steps_per_second': 1.995, 'total_flos': 1635275218812480.0, 'train_loss': 0.5397217732888681, 'epoch': 20.0})

In [None]:
# Score the Task B test split: per-class scores first, then the arg-max class.
taskB_test_scores = taskB_trainer.predict(taskB_test_dataset).predictions
y_pred = taskB_test_scores.argmax(axis = 1)

# Report accuracy / weighted precision / recall / F1 against the gold labels.
get_acc(taskB_test['label_category'], y_pred)

Accuracy: 0.56598
precision: 0.5636350300031324
recall: 0.565979381443299
f1 score: 0.5646602343237517


In [None]:
epoch = 20

# Task C hyper-parameters, collected first and then handed to
# TrainingArguments in one go. batch_size is reused from the Task A cell.
taskC_training_kwargs = dict(
    output_dir = './',
    # Optimisation schedule.
    learning_rate = 1e-5,
    num_train_epochs = epoch,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    # Log and evaluate on the dev set every 100 optimisation steps.
    logging_steps = 100,
    evaluation_strategy = 'steps',
    eval_steps = 100,
    # Reload the checkpoint with the best dev accuracy once training ends.
    metric_for_best_model = 'eval_accuracy',
    load_best_model_at_end = True,
)
taskC_training_args = TrainingArguments(**taskC_training_kwargs)

In [None]:
# Load the checkpoint named by model_path with an 11-class head for Task C.
taskC_model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels = 11,
)

# Train on the Task C training split, evaluating on the dev split with
# compute_metrics.
taskC_trainer_kwargs = dict(
    model = taskC_model,
    args = taskC_training_args,
    train_dataset = taskC_train_dataset,
    eval_dataset = taskC_dev_dataset,
    compute_metrics = compute_metrics,
)
taskC_trainer = Trainer(**taskC_trainer_kwargs)

taskC_trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.

Step,Training Loss,Validation Loss,Accuracy
100,2.068,1.826728,0.368313
200,1.6741,1.605201,0.44856
300,1.4623,1.519725,0.469136
400,1.3076,1.474046,0.485597
500,1.1813,1.457698,0.504115
600,1.0656,1.456246,0.502058
700,0.9716,1.46373,0.493827
800,0.9037,1.472634,0.495885
900,0.8424,1.479632,0.502058
1000,0.7883,1.476512,0.504115


TrainOutput(global_step=1080, training_loss=1.1935647364015933, metrics={'train_runtime': 545.9267, 'train_samples_per_second': 124.486, 'train_steps_per_second': 1.978, 'total_flos': 1635479351215920.0, 'train_loss': 1.1935647364015933, 'epoch': 20.0})

In [None]:
# Score the Task C test split: per-class scores first, then the arg-max class.
taskC_test_scores = taskC_trainer.predict(taskC_test_dataset).predictions
y_pred = taskC_test_scores.argmax(axis = 1)

# Report accuracy / weighted precision / recall / F1 against the gold labels.
get_acc(taskC_test['label_vector'], y_pred)

Accuracy: 0.47113
precision: 0.43584772718842824
recall: 0.4711340206185567
f1 score: 0.44931764494500387


  _warn_prf(average, modifier, msg_start, len(result))
