In [None]:
!pip install transformers
!pip install lightning
# on mac
!pip3 install torch torchvision torchaudio
!pip install bayesian-optimization

In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():        
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2060


In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from torchmetrics.functional import accuracy, f1_score, precision, recall

In [3]:
davidson = pd.read_csv("datasets/davidson.csv")
davidson

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [4]:
davidson_classes = pd.read_csv("datasets/edited/davidson_with_classes.csv")
davidson_classes.head()

Unnamed: 0,care.virtue,care.vice,authority.virtue,fairness.vice,fairness.virtue,loyalty.vice,loyalty.virtue,sanctity.virtue,authority.vice,sanctity.vice,positive,neutral,negative,target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1.0,0.0679,0.4113,0.5208,Partially Negative
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0326,0.4309,0.5365,Neutral but Negative Sentiment
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0031,0.0268,0.9701,Morally Negative
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0609,0.731,0.2081,Neutral
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0119,0.1853,0.8027,Morally Negative


In [5]:
dataframe_davidson = pd.DataFrame(davidson['tweet']).join(davidson_classes['target'])
dataframe_davidson = dataframe_davidson.rename(columns={'tweet': 'text'})
dataframe_davidson.head()

Unnamed: 0,text,target
0,!!! RT @mayasolovely: As a woman you shouldn't...,Partially Negative
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Neutral but Negative Sentiment
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Morally Negative
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Neutral
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Morally Negative


In [6]:
mfrc = pd.read_csv("datasets/edited/mfrc_reduced.csv")
mfrc.head()

Unnamed: 0,text,subreddit,bucket,annotator,annotation,confidence,care.virtue,care.vice,fairness.virtue,fairness.vice,loyalty.virtue,loyalty.vice,authority.virtue,authority.vice,sanctity.virtue,sanctity.vice,cleaned_text
0,That particular part of the debate is especial...,europe,French politics,annotator03,Non-Moral,Confident,0,0,0,0,0,0,0,0,0,0,particular debate especially funny macron expl...
1,"/r/france is pretty lively, with it's own ling...",europe,French politics,annotator03,Non-Moral,Confident,0,0,0,0,0,0,0,0,0,0,r france pretty lively lingo usually delibera...
2,TBH Marion Le Pen would be better. Closet fasc...,neoliberal,French politics,annotator03,Non-Moral,Somewhat Confident,0,0,0,0,0,0,0,0,0,0,tbh marion le pen well closet fascist vs flamb...
3,it really is a very unusual situation isn't it...,europe,French politics,annotator03,Non-Moral,Confident,0,0,0,0,0,0,0,0,0,0,unusual situation fillon affair influence vote...
4,The Le Pen brand of conservatism and classical...,europe,French politics,annotator03,Authority,Somewhat Confident,0,0,0,0,0,0,0,0,0,0,le pen brand conservatism classical right wing...


In [7]:
mfrc_classes = pd.read_csv("datasets/edited/mfrc_with_classes.csv")
mfrc_classes.head()

Unnamed: 0,care.virtue,care.vice,authority.virtue,fairness.vice,fairness.virtue,loyalty.vice,loyalty.virtue,sanctity.virtue,authority.vice,sanctity.vice,positive,neutral,negative,target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1067,0.3189,0.5743,Neutral but Negative Sentiment
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1703,0.559,0.2707,Neutral
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0658,0.446,0.4882,Neutral but Negative Sentiment
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0204,0.3472,0.6324,Neutral but Negative Sentiment
4,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.666667,0.0,0.0,0.0455,0.7794,0.1751,Morally Positive


In [8]:
dataframe_mfrc = pd.DataFrame(mfrc['text']).join(mfrc_classes['target'])
dataframe_mfrc.head()

Unnamed: 0,text,target
0,That particular part of the debate is especial...,Neutral but Negative Sentiment
1,"/r/france is pretty lively, with it's own ling...",Neutral
2,TBH Marion Le Pen would be better. Closet fasc...,Neutral but Negative Sentiment
3,it really is a very unusual situation isn't it...,Neutral but Negative Sentiment
4,The Le Pen brand of conservatism and classical...,Morally Positive


In [9]:
df = pd.concat([dataframe_davidson, dataframe_mfrc]).reset_index()
del df['index']
df

Unnamed: 0,text,target
0,!!! RT @mayasolovely: As a woman you shouldn't...,Partially Negative
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Neutral but Negative Sentiment
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Morally Negative
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Neutral
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Morally Negative
...,...,...
42664,My job is actually ok and I work with loads of...,Neutral but Negative Sentiment
42665,Someone dying of a disease doesn't change that...,Morally Negative
42666,Some of these politicians and commentators tak...,Morally Negative
42667,"No, it's ""truth over facts! Wait where am I ag...",Neutral but Negative Sentiment


In [10]:
df = df.join(pd.get_dummies(df['target'], dtype = float))
del df['target']
df

Unnamed: 0,text,Morally Negative,Morally Positive,Neutral,Neutral but Negative Sentiment,Neutral but Positive Sentiment,Partially Negative,Partially Neutral,Partially Positive
0,!!! RT @mayasolovely: As a woman you shouldn't...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
42664,My job is actually ok and I work with loads of...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
42665,Someone dying of a disease doesn't change that...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42666,Some of these politicians and commentators tak...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42667,"No, it's ""truth over facts! Wait where am I ag...",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [11]:
LABEL_COLUMNS = df.columns.tolist()[1:]
LABEL_COLUMNS

['Morally Negative',
 'Morally Positive',
 'Neutral',
 'Neutral but Negative Sentiment',
 'Neutral but Positive Sentiment',
 'Partially Negative',
 'Partially Neutral',
 'Partially Positive']

In [12]:
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.trainer.trainer import Trainer

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5)

checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint-{val_loss:.2f}",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

trainer = Trainer(
  callbacks=[early_stopping_callback, checkpoint_callback],
  max_epochs=15,
  accelerator="gpu",
  devices="auto"
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
from utils.dataset_torch import CustomDataModule, CustomDataset, Model
from transformers import AutoTokenizer
from lightning.pytorch.tuner import Tuner


BERT_MODEL_NAME = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
model = Model(BERT_MODEL_NAME, lr = 2e-5, weigth_decay=0.001)
datamodule = CustomDataModule(df, tokenizer, batch_size=10)

tuner = Tuner(trainer)
lr_finder = tuner.lr_find(model, datamodule=datamodule, early_stop_threshold=None)
model.lr = lr_finder.suggestion()
model = Model(BERT_MODEL_NAME, lr = lr_finder.suggestion(), weigth_decay=0.001)

trainer.fit(model=model, 
            datamodule=datamodule)
trainer.test(model=model, datamodule=datamodule)
trainer.validate(model, datamodule=datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(
  rank_zero_warn(


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.2754228703338169
Restoring states from the checkpoint path at /home/roberto/Desktop/git-repos/Fair-NLP/.lr_find_59b12775-83ce-4db3-a39a-badebae12905.ckpt
Restored all states from the checkpoint at /home/roberto/Desktop/git-repos/Fair-NLP/.lr_find_59b12775-83ce-4db3-a39a-badebae12905.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type                | Params
-----------------------------------------------------------
0 | bert               | BertModel           | 109 M 
1 | classifier         | Linear              | 6.2 K 
2 | criterion          | CrossEntropyLoss    | 0     
3 | weighted_accuracy  | MulticlassAccuracy  | 0     
4 | weighted_precision | MulticlassPrecision | 0     
5 | weighted_recall    | MulticlassRecall    | 0     
6 | weighted_f1        | MulticlassF1Score   | 0     
-----------------------------------------------------------
109 M     Trainable params
0         Non-trai

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 2986: 'val_loss' reached 1.81101 (best 1.81101), saving model to '/home/roberto/Desktop/git-repos/Fair-NLP/checkpoints/best-checkpoint-val_loss=1.81.ckpt' as top 1
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

In [None]:
from utils.dataset_torch import CustomDataModule, CustomDataset, Model
from transformers import AutoTokenizer
from lightning.pytorch.tuner import Tuner

BERT_MODEL_NAME = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
model = Model(BERT_MODEL_NAME, lr = 2e-5, weigth_decay=0.001)
datamodule = CustomDataModule(df, tokenizer, batch_size=10)

tuner = Tuner(trainer)
lr_finder = tuner.lr_find(model, datamodule=datamodule, early_stop_threshold=None)
model.lr = lr_finder.suggestion()
model = Model(BERT_MODEL_NAME, lr = lr_finder.suggestion(), weigth_decay=0.001)

trainer.fit(model=model, 
            datamodule=datamodule)
trainer.test(model=model, datamodule=datamodule)
trainer.validate(model, datamodule=datamodule)

In [None]:
from utils.dataset_torch import CustomDataModule, CustomDataset, Model
from transformers import AutoTokenizer
from lightning.pytorch.tuner import Tuner

BERT_MODEL_NAME = 'roberta-base'

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
model = Model(BERT_MODEL_NAME, lr = 2e-5, weigth_decay=0.001)
datamodule = CustomDataModule(df, tokenizer, batch_size=10)

tuner = Tuner(trainer)
lr_finder = tuner.lr_find(model, datamodule=datamodule, early_stop_threshold=None)
model.lr = lr_finder.suggestion()
model = Model(BERT_MODEL_NAME, lr = lr_finder.suggestion(), weigth_decay=0.001)

trainer.fit(model=model, 
            datamodule=datamodule)
trainer.test(model=model, datamodule=datamodule)
trainer.validate(model, datamodule=datamodule)

In [None]:
from utils.dataset_torch import CustomDataModule, CustomDataset, Model
from transformers import AutoTokenizer
from lightning.pytorch.tuner import Tuner

BERT_MODEL_NAME = 'deberta-base-v3'

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
model = Model(BERT_MODEL_NAME, lr = 2e-5, weigth_decay=0.001)
datamodule = CustomDataModule(df, tokenizer, batch_size=10)

tuner = Tuner(trainer)
lr_finder = tuner.lr_find(model, datamodule=datamodule, early_stop_threshold=None)
model.lr = lr_finder.suggestion()
model = Model(BERT_MODEL_NAME, lr = lr_finder.suggestion(), weigth_decay=0.001)

trainer.fit(model=model, 
            datamodule=datamodule)
trainer.test(model=model, datamodule=datamodule)
trainer.validate(model, datamodule=datamodule)