# Main imports and code

In [None]:
# check which gpu we're using
!nvidia-smi

Sun Mar  3 17:00:22 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!pip install simpletransformers
!pip install tensorboardx



In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval

In [None]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [None]:
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


# Fetch Don't Patronize Me! data manager module

In [None]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [None]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [None]:
from dont_patronize_me import DontPatronizeMe

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
file_path = '/content/drive/MyDrive/NLP CW1/'
filename = 'dontpatronizeme_pcl.tsv'

In [None]:
dpm = DontPatronizeMe(file_path, filename)

In [None]:
dpm.load_task1()
dpm.load_task2(return_one_hot=True)

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


# Load paragraph IDs

In [None]:
file1 = '/content/drive/My Drive/NLP CW1/train_semeval_parids-labels.csv'
file2 = '/content/drive/My Drive/NLP CW1/dev_semeval_parids-labels.csv'

In [None]:
trids = pd.read_csv(file1)
teids = pd.read_csv(file2)

In [None]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

In [None]:
data=dpm.train_task1_df

In [None]:
data



# Rebuild training set (Task 1)

In [None]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [None]:
import random

In [None]:
trdf1 = pd.DataFrame(rows)

In [None]:
trdf1

# Rebuild test set (Task 1)

In [None]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [None]:
len(rows)

2094

In [None]:
tedf1 = pd.DataFrame(rows)

In [None]:
tedf1=tedf1.sample(frac=1)

In [None]:
tedf1

# RoBERTa Baseline for Task 1

In [None]:
# downsample negative instances
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)


training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])

In [None]:
training_set1

In [None]:

task1_model_args = ClassificationArgs(num_train_epochs=1,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)

# Adjust Hyperparameters
task1_model_args.num_train_epochs = 3 # Number of epochs
task1_model_args.learning_rate = 0.00005 # Learning rate
task1_model_args.train_batch_size = 32 # Batch size
task1_model_args.weight_decay = 0.01 # Regularization Strength
task1_model_args.warmup_steps = 100 # lr starts at 0
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size


task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(training_set1[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(tedf1.text.tolist())

In [None]:
Counter(preds_task1)

Counter({0: 1729, 1: 365})

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('accuracy: ' + str(accuracy_score(tedf1.label.values, preds_task1)))
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1)))
print('recall： ' + str(recall_score(tedf1.label.values, preds_task1)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1)))

accuracy: 0.8720152817574021
precision: 0.4054794520547945
recall： 0.7437185929648241
f1_score: 0.524822695035461


In [None]:
labels2file([[k] for k in preds_task1], 'task1.txt')

# Rebuild training set (Task 2)

In [None]:
rows2 = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  label = trids.label[idx]
  # select row from original dataset to retrieve the `text` value
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  rows2.append({
      'par_id':parid,
      'text':text,
      'label':label
  })


In [None]:
trdf2 = pd.DataFrame(rows2)

In [None]:
trdf2

In [None]:
trdf2.label = trdf2.label.apply(literal_eval)

# Rebuild test set (Task 2)

In [None]:
rows2 = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = teids.par_id[idx]
  label = teids.label[idx]
  #print(parid)
  # select row from original dataset to access the `text` value
  text = dpm.train_task1_df.loc[dpm.train_task1_df.par_id == parid].text.values[0]
  rows2.append({
      'par_id':parid,
      'text':text,
      'label':label
  })


In [None]:
tedf2 = pd.DataFrame(rows2)

In [None]:
tedf2

In [None]:
tedf2.label = tedf2.label.apply(literal_eval)

# RoBERTa baseline for Task 2

In [None]:
all_negs = trdf2[trdf2.label.apply(lambda x:sum(x) == 0)]
all_pos = trdf2[trdf2.label.apply(lambda x:sum(x) > 0)]

training_set2 = pd.concat([all_pos,all_negs[:round(len(all_pos)*0.5)]])

In [None]:
training_set2

In [None]:
task2_model_args = MultiLabelClassificationArgs(num_train_epochs=1,
                                                no_save=True,
                                                no_cache=True,
                                                overwrite_output_dir=True
                                                )

# Adjust Hyperparameters
task2_model_args.num_train_epochs = 3 # Number of epochs
task2_model_args.learning_rate = 0.00005 # Learning rate
task2_model_args.train_batch_size = 32 # Batch size
task2_model_args.weight_decay = 0.01 # Regularization Strength
task2_model_args.warmup_steps = 100 # lr starts at 0
task2_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size

task2_model = MultiLabelClassificationModel("roberta",
                                            'roberta-base',
                                            num_labels=7,
                                            args = task2_model_args,
                                            use_cuda=cuda_available)
# train model
task2_model.train_model(training_set2[['text', 'label']])
# run predictions
preds_task2, _ = task2_model.predict(tedf2.text.tolist())

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

y_true = mlb.fit_transform(tedf2.label.values)
y_pred = mlb.transform(preds_task2)

print('accuracy: ' + str(accuracy_score(y_true, y_pred)))
print('precision: ' + str(precision_score(y_true, y_pred, average='samples')))
print('recall： ' + str(recall_score(y_true, y_pred, average='samples')))
print('f1_score: ' + str(f1_score(y_true, y_pred, average='samples')))

accuracy: 0.6270296084049666
precision: 0.8223495702005731
recall： 0.9911652340019103
f1_score: 0.8756765361349887


In [None]:
labels2file(preds_task2, 'task2.txt')

## Prepare submission

In [None]:
!cat task1.txt | head -n 10

0
0
0
0
0
0
0
0
0
0


In [None]:
!cat task2.txt | head -n 10

1,0,0,0,0,0,0
1,0,0,0,0,0,0
0,0,0,0,0,0,0
1,0,0,0,0,0,0
0,0,0,0,0,0,0
0,0,0,0,0,0,0
1,0,0,0,0,0,0
1,0,0,0,0,0,0
0,0,0,0,0,0,0
1,0,0,0,0,0,0


In [None]:
!zip submission.zip task1.txt task2.txt

  adding: task1.txt (deflated 93%)
  adding: task2.txt (deflated 97%)


## Data Augmentation
Reference: https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb


In [None]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn.functional as F

data = data.dropna()
data

In [None]:
!pip install nlpaug


import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/410.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m266.2/410.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


# Data augmentation and preprocessing

## Trying aug = naw.SynonymAug(aug_src='wordnet')

In [None]:
# Initialize the synonym augmenter
aug = naw.SynonymAug(aug_src='wordnet')

# Function to apply augmentation
def augment_text(df):
    augmented_texts = []
    for text in df['text']:
        augmented_text = aug.augment(text)
        augmented_texts.append(augmented_text)
    return augmented_texts

# Apply augmentation to the training set
training_set1['augmented_text'] = augment_text(training_set1)

data_out = training_set1[['augmented_text', 'label']].rename(columns={'augmented_text': 'text'})


In [None]:

task1_model_args = ClassificationArgs(num_train_epochs=1,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)


# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory




task1_model_data_aug = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model_data_aug.train_model(data_out[['text', 'label']])
# run predictions
preds_task1, _ = task1_model_data_aug.predict(tedf1.text.tolist())


In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('accuracy: ' + str(accuracy_score(tedf1.label.values, preds_task1)))
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1)))
print('recall： ' + str(recall_score(tedf1.label.values, preds_task1)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1)))

accuracy: 0.8767908309455588
precision: 0.40390879478827363
recall： 0.6231155778894473
f1_score: 0.49011857707509887


## Trying aug = naw.AntonymAug()

In [None]:
aug = naw.AntonymAug()
# Function to apply augmentation
def augment_text(df):
    augmented_texts = []
    for text in df['text']:
        augmented_text = aug.augment(text)
        augmented_texts.append(augmented_text)
    return augmented_texts

# Apply augmentation to the training set
training_set1['augmented_text'] = augment_text(training_set1)

data_out = training_set1[['augmented_text', 'label']].rename(columns={'augmented_text': 'text'})


In [None]:

task1_model_args = ClassificationArgs(num_train_epochs=1,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)


# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory




task1_model_data_aug = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model_data_aug.train_model(data_out[['text', 'label']])
# run predictions
preds_task1, _ = task1_model_data_aug.predict(tedf1.text.tolist())


In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('accuracy: ' + str(accuracy_score(tedf1.label.values, preds_task1)))
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1)))
print('recall： ' + str(recall_score(tedf1.label.values, preds_task1)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1)))

accuracy: 0.8500477554918816
precision: 0.3566084788029925
recall： 0.7185929648241206
f1_score: 0.4766666666666667


## Roberta base

In [None]:
# Run Bert Base case
torch.manual_seed(2)


# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory



task1_model_data_aug = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model_data_aug.train_model(training_set1[['text', 'label']])
# run predictions
preds_task1_bert, _ = task1_model.predict(tedf1.text.tolist())

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('accuracy: ' + str(accuracy_score(tedf1.label.values, preds_task1)))
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1)))
print('recall： ' + str(recall_score(tedf1.label.values, preds_task1)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1)))

accuracy: 0.8519579751671442
precision: 0.36561743341404357
recall： 0.7587939698492462
f1_score: 0.49346405228758167


## Bert model

In [None]:
# Run Bert Base case
torch.manual_seed(2)


# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory


task1_model_bert_aug = ClassificationModel("bert",
                                    'bert-base-uncased',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model_bert_aug.train_model(training_set1[['text', 'label']])
# run predictions
preds_task1_bert, _ = task1_model.predict(tedf1.text.tolist())



In [None]:
# Evaluate Bert model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1_bert)))
print( 'recall： ' + str(recall_score(tedf1.label.values, preds_task1_bert)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1_bert)))


precision: 0.36561743341404357
recall： 0.7587939698492462
f1_score: 0.49346405228758167


## XLnet

In [None]:
# Run XLNet Base case
torch.manual_seed(2)


# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory


task1_model_xlnet = ClassificationModel("xlnet",
                                    'xlnet-base-cased',
                                    args = task1_model_args,
                                    num_labels=2,
                                    use_cuda=cuda_available)

# train model
task1_model_xlnet.train_model(training_set1[['text', 'label']])
# run predictions
preds_task1_xlnet, _ = task1_model_xlnet.predict(tedf1.text.tolist())


In [None]:
# Evaluate XLNet model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1_xlnet)))
print( 'recall： ' + str(recall_score(tedf1.label.values, preds_task1_xlnet)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1_xlnet)))

precision: 0.421875
recall： 0.678391959798995
f1_score: 0.5202312138728324


## DistilBERT model

In [None]:
# Run DistilBert Base case
torch.manual_seed(2)
# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory

task1_model_distilbert = ClassificationModel("distilbert",
                                  'distilbert-base-uncased',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)

# train model
task1_model_distilbert.train_model(training_set1[['text', 'label']])
# run predictions
preds_task1_distilbert, _ = task1_model_distilbert.predict(tedf1.text.tolist())

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1_distilbert)))
print( 'recall： ' + str(recall_score(tedf1.label.values, preds_task1_distilbert)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1_distilbert)))

precision: 0.40978593272171254
recall： 0.6733668341708543
f1_score: 0.5095057034220531


# Data preprocessing
## backtranslation

In [None]:
!pip install googletrans
!pip install -U deep-translator
from deep_translator import GoogleTranslator
GoogleTranslator(source='auto', target='de').translate("it is a sunny day today")

In [None]:
import pandas as pd
from googletrans import Translator
from deep_translator import GoogleTranslator

# Load the data
df = trdf1 #pd.read_csv("data.csv")

# Get the examples with label = 1
label_1 = df[df["label"] == 1]

# Initialize the Translator
translator = Translator()

# Create a list to store the translated text
translated_text = []

# Loop over the examples with label = 1
for index, row in label_1.iterrows():
    # Translate the text from English to French
    # translated = translator.translate(row["text"], dest="fr").text
    translated=GoogleTranslator(source='en', target='fr').translate(row["text"])

    # Translate the text back from French to English
    # back_translated = translator.translate(translated, dest="en").text
    back_translated=GoogleTranslator(source='fr', target='en').translate(translated)
    # Add the back-translated text to the list
    translated_text.append(back_translated)

# Add the back-translated text to the original dataframe
label_1["back_translated_text"] = translated_text

# Concatenate the original dataframe and the back-translated dataframe
df = pd.concat([df, label_1])

Label_1_cleaned = label_1.drop(['text'], axis=1)

#rename the column back_translated_text to text
Label_1_cleaned.rename(columns = {'back_translated_text':'text'}, inplace = True)
# Concatenate Label_1_cleaned below the training_set1
training_set1_backtranslate_french = pd.concat([trdf1, Label_1_cleaned])
training_set1_backtranslate_french


In [None]:
# Show the minority class from training_set1
training_set1_backtranslate_french[training_set1_backtranslate_french["label"] == 1]
# Print the number of examples in the minority class and the majority class
print("No. of examples in minority class:", len(training_set1_backtranslate_french[training_set1_backtranslate_french["label"] == 1]))
print("No. of examples in majority class:", len(training_set1_backtranslate_french[training_set1_backtranslate_french["label"] == 0]))
duplicates= training_set1_backtranslate_french[training_set1_backtranslate_french["label"] == 1].text.duplicated().sum()
print("Number of duplicates in the minority class:", duplicates)



No. of examples in minority class: 1588
No. of examples in majority class: 7581
Number of duplicates in the minority class: 1


In [None]:
# downsample negative instances
pcldf_french = training_set1_backtranslate_french[training_set1_backtranslate_french.label==1]
npos = len(pcldf_french)
print(npos)

training_set1_french = pd.concat([pcldf_french,training_set1_backtranslate_french[training_set1_backtranslate_french.label==0][:npos*2]])

1588


In [None]:
#print the number of positive and negative instances in the training set
print(training_set1_french.label.value_counts())

# Print by how much the negative instances are downsampled in absolute terms
print(f'Number of negative instances downsampled: {len(training_set1_backtranslate_french[training_set1_backtranslate_french.label==0]) - len(training_set1_backtranslate_french[training_set1_backtranslate_french.label==0])}')

0    3176
1    1588
Name: label, dtype: int64
Number of negative instances downsampled: 0


In [None]:
# Run Roberta Base case

torch.manual_seed(2)

# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory

task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(training_set1_french[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(tedf1.text.tolist())

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1)))
print( 'recall： ' + str(recall_score(tedf1.label.values, preds_task1)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1)))

precision: 0.4981949458483754
recall： 0.6934673366834171
f1_score: 0.5798319327731092


# Implementing a learning rate scheduler

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

enable_lr_scheduling = True

# Set up early stopping and other parameters
task1_model_args = ClassificationArgs()
task1_model_args.num_train_epochs = 5
task1_model_args.learning_rate = 3.76e-05
task1_model_args.train_batch_size = 16
task1_model_args.weight_decay = 0.0300861
task1_model_args.warmup_steps = 222
task1_model_args.gradient_accumulation_steps = 2
task1_model_args.use_early_stopping = True
task1_model_args.early_stopping_patience = 3
task1_model_args.early_stopping_delta = 0.01
task1_model_args.early_stopping_metric = "f1"
task1_model_args.early_stopping_metric_minimize = False
task1_model_args.save_steps = -1  # Prevents saving checkpoints
task1_model_args.save_model_every_epoch = False
task1_model_args.no_save = True  # Disable saving model after training
task1_model_args.overwrite_output_dir = True

# Conditionally set the learning rate scheduler
if enable_lr_scheduling:
    task1_model_args.scheduler = "linear_schedule_with_warmup"
else:
    task1_model_args.scheduler = "constant_schedule"

# Train the model with the conditional learning rate scheduler
model = ClassificationModel(
    "roberta",
    'roberta-base',
    args=task1_model_args,
    num_labels=2,
    use_cuda=cuda_available
)

# Train and evaluate the model
model.train_model(training_set1_french[['text', 'label']], eval_df=tedf1[['text', 'label']])
results, model_outputs, wrong_predictions = model.eval_model(tedf1[['text', 'label']])


In [None]:
# Arguments for training without a learning rate scheduler
task1_model_args_no_scheduler = ClassificationArgs()
task1_model_args_no_scheduler.num_train_epochs = 5
task1_model_args_no_scheduler.learning_rate = 3.76e-05
task1_model_args_no_scheduler.train_batch_size = 16
task1_model_args_no_scheduler.weight_decay = 0.0300861
task1_model_args_no_scheduler.use_early_stopping = True
task1_model_args_no_scheduler.early_stopping_patience = 3
task1_model_args_no_scheduler.early_stopping_delta = 0.01
task1_model_args_no_scheduler.early_stopping_metric = "f1"
task1_model_args_no_scheduler.early_stopping_metric_minimize = False
task1_model_args_no_scheduler.save_steps = -1
task1_model_args_no_scheduler.save_model_every_epoch = False
task1_model_args_no_scheduler.no_save = True
task1_model_args_no_scheduler.overwrite_output_dir = True

# Train the model without a learning rate scheduler and collect the results
model_without_scheduler = ClassificationModel(
    "roberta",
    'roberta-base',
    args=task1_model_args_no_scheduler,  # Arguments without scheduler settings
    num_labels=2,
    use_cuda=cuda_available
)
# Train and evaluate the model
model_without_scheduler.train_model(training_set1_french[['text', 'label']], eval_df=tedf1[['text', 'label']])
result_without_scheduler, model_outputs, wrong_predictions = model_without_scheduler.eval_model(tedf1[['text', 'label']])



In [None]:
import matplotlib.pyplot as plt
import numpy as np

loss_with_scheduler = results['eval_loss']
f1_with_scheduler = results['f1_score']
loss_without_scheduler = result_without_scheduler['eval_loss']
f1_without_scheduler = result_without_scheduler['f1_score']

metrics = ['Loss', 'F1 Score']
values_with_scheduler = [loss_with_scheduler, f1_with_scheduler]
values_without_scheduler = [loss_without_scheduler, f1_without_scheduler]

x = np.arange(len(metrics))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
rects1 = ax.bar(x - width/2, values_with_scheduler, width, label='With Scheduler')
rects2 = ax.bar(x + width/2, values_without_scheduler, width, label='Without Scheduler')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Scores')
ax.set_title('Scores by metric and scheduler use')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()

ax.bar_label(rects1, padding=3)
ax.bar_label(rects2, padding=3)

fig.tight_layout()

plt.show()


In [None]:
!pip install wandb




In [None]:
import wandb
wandb.login()


In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import wandb

wandb.login()

enable_lr_scheduling = True

# Set up all parameters including WandB project name in a single ClassificationArgs configuration
task1_model_args = ClassificationArgs(
    num_train_epochs=5,
    learning_rate=3.76e-05,
    train_batch_size=16,
    weight_decay=0.0300861,
    warmup_steps=222,
    gradient_accumulation_steps=2,
    use_early_stopping=True,
    early_stopping_patience=3,
    early_stopping_delta=0.01,
    early_stopping_metric="f1",
    early_stopping_metric_minimize=False,
    save_steps=-1,  # Prevents saving checkpoints
    save_model_every_epoch=False,
    no_save=True,  # Disable saving model after training
    overwrite_output_dir=True,
    wandb_project="learningratescheduler",
)

# Conditional setting for the learning rate scheduler
task1_model_args.scheduler = "linear_schedule_with_warmup" if enable_lr_scheduling else "constant_schedule"

# Initialize and train the model with the configured settings
model = ClassificationModel(
    "roberta",
    'roberta-base',
    args=task1_model_args,
    num_labels=2,
    use_cuda=torch.cuda.is_available()  # Automatically use CUDA if available
)

model.train_model(training_set1_french[['text', 'label']])
results, model_outputs, wrong_predictions = model.eval_model(tedf1[['text', 'label']])


In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import wandb

# Ensure you're logged into WandB; this might prompt for an API key if you're not already logged in.
wandb.login()

enable_lr_scheduling = False

# Set up all parameters including WandB project name in a single ClassificationArgs configuration
task1_model_args_no_scheduler = ClassificationArgs(
    num_train_epochs=5,
    learning_rate=3.76e-05,
    train_batch_size=16,
    weight_decay=0.0300861,
    use_early_stopping=True,
    early_stopping_patience=3,
    early_stopping_delta=0.01,
    early_stopping_metric="f1",
    early_stopping_metric_minimize=False,
    save_steps=-1,  # Prevents saving checkpoints
    save_model_every_epoch=False,
    no_save=True,  # Disable saving model after training
    overwrite_output_dir=True,
    wandb_project="learningratescheduler_no_scheduler",  # A different WandB project name for clarity
)


# Initialize and train the model without a learning rate scheduler
model_without_scheduler = ClassificationModel(
    "roberta",
    'roberta-base',
    args=task1_model_args_no_scheduler,
    num_labels=2,
    use_cuda=torch.cuda.is_available()  # Automatically use CUDA if available
)

model_without_scheduler.train_model(training_set1_french[['text', 'label']])
results_without_scheduler, model_outputs, wrong_predictions = model_without_scheduler.eval_model(tedf1[['text', 'label']])


## Random upsampling

In [None]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = training_set1[training_set1.label==0]
df_minority = training_set1[training_set1.label==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123) # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_upsampled.label.value_counts()


0    1588
1    1588
Name: label, dtype: int64

In [None]:
# Run Roberta Base case

torch.manual_seed(2)

# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory

task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(df_upsampled[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(tedf1.text.tolist())

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1)))
print( 'recall： ' + str(recall_score(tedf1.label.values, preds_task1)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1)))

precision: 0.4794007490636704
recall： 0.6432160804020101
f1_score: 0.5493562231759657


In [None]:
# Run Roberta Base case

torch.manual_seed(2)

# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory

task1_model_distilbert = ClassificationModel("distilbert",
                                  'distilbert-base-uncased',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model_distilbert.train_model(df_upsampled[['text', 'label']])
# run predictions
preds_task1, _ = task1_model_distilbert.predict(tedf1.text.tolist())

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1)))
print( 'recall： ' + str(recall_score(tedf1.label.values, preds_task1)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1)))

precision: 0.4406779661016949
recall： 0.6532663316582915
f1_score: 0.5263157894736842


In [None]:


torch.manual_seed(2)

# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory

task1_model_xlnet = ClassificationModel("xlnet",
                                    'xlnet-base-cased',
                                    args = task1_model_args,
                                    num_labels=2,
                                    use_cuda=cuda_available)

# train model
task1_model_xlnet.train_model(df_upsampled[['text', 'label']])
# run predictions
preds_task1, _ = task1_model_xlnet.predict(tedf1.text.tolist())

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1)))
print( 'recall： ' + str(recall_score(tedf1.label.values, preds_task1)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1)))

precision: 0.4099722991689751
recall： 0.7437185929648241
f1_score: 0.5285714285714287


In [None]:
torch.manual_seed(2)

# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory

task1_model_bert_aug = ClassificationModel("bert",
                                    'bert-base-uncased',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model_bert_aug.train_model(training_set1[['text', 'label']])
# run predictions
preds_task1_bert, _ = task1_model_bert_aug.predict(tedf1.text.tolist())


In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1)))
print( 'recall： ' + str(recall_score(tedf1.label.values, preds_task1)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1)))

precision: 0.4099722991689751
recall： 0.7437185929648241
f1_score: 0.5285714285714287




## random upsampling and downsampling

In [None]:
# Downsample majority class
df_majority_downsampled = resample(df_majority,
                                   replace=False,    # sample without replacement
                                   n_samples=len(df_minority_upsampled), # match minority class
                                   random_state=123) # reproducible results

# Combine downsampled majority class with upsampled minority class
df_balanced = pd.concat([df_minority_upsampled, df_majority_downsampled])



In [None]:
# Run Roberta Base case

torch.manual_seed(2)

# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory

task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(df_balanced[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(tedf1.text.tolist())

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1)))
print( 'recall： ' + str(recall_score(tedf1.label.values, preds_task1)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1)))

precision: 0.4154727793696275
recall： 0.7286432160804021
f1_score: 0.5291970802919709


Removing stop words and punctuation

In [None]:
import nltk
from nltk.corpus import stopwords
import string

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to remove stopwords and punctuation
def clean_text(text):

    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

# Apply the function to your text data
trdf1['text'] = trdf1['text'].apply(clean_text)
tedf1['text'] = tedf1['text'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:

data_out = trdf1

torch.manual_seed(2)

# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory

# Initialize the model with the cleaned and preprocessed dataset
task1_model_data_aug = ClassificationModel(
    "roberta",
    'roberta-base',
    args=task1_model_args,
    num_labels=2,
    use_cuda=cuda_available
)

# Train the model with the preprocessed data
task1_model_data_aug.train_model(data_out[['text', 'label']])

preds_task1, _ = task1_model_data_aug.predict(tedf1['text'].tolist())

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1)))
print( 'recall： ' + str(recall_score(tedf1.label.values, preds_task1)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1)))

precision: 0.5833333333333334
recall： 0.4221105527638191
f1_score: 0.489795918367347


## Adding categorical data

In [None]:
# Concatenate the categorical data with the text data
trdf1['text'] = trdf1['community'] + ' ' + trdf1['text']
tedf1['text'] = tedf1['community'] + ' ' + tedf1['text']

torch.manual_seed(2)

# Best Hyperparameters
task1_model_args.num_train_epochs = 5 # Number of epochs
task1_model_args.learning_rate = 3.76e-05 # Learning rate
task1_model_args.train_batch_size = 16 # Batch size
task1_model_args.weight_decay = 0.0300861 # Regularization Strength
task1_model_args.warmup_steps = 222 # lr starts at 0 - goes to 0.0001 over steps
task1_model_args.gradient_accumulation_steps = 2 # Fancy hyperparameter related to batch size and memory

# Initialize the model with the cleaned and preprocessed dataset
task1_model_data_aug = ClassificationModel(
    "roberta",
    'roberta-base',
    args=task1_model_args,
    num_labels=2,
    use_cuda=cuda_available
)


# Continue with your model training and prediction as before
task1_model_data_aug.train_model(trdf1[['text', 'label']])

# Make predictions
preds_task1, _ = task1_model_data_aug.predict(tedf1['text'].tolist())

In [None]:
# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print('precision: ' + str(precision_score(tedf1.label.values, preds_task1)))
print( 'recall： ' + str(recall_score(tedf1.label.values, preds_task1)))
print('f1_score: ' + str(f1_score(tedf1.label.values, preds_task1)))

precision: 0.5930232558139535
recall： 0.5125628140703518
f1_score: 0.5498652291105123
