# Data Preprocessing

In [None]:
import pandas as pd

In [None]:
# Download both datasets from Google Drive by file ID (requires the `gdown` command-line tool).
# This ID is saved as train-balanced-sarcasm.csv (the file later read into `sarc_df`).
!gdown 1ORwdbXsIrWgauiMFEuaq6LTcT0xo1eur
# This ID is saved as GEN-sarc-notsarc.csv (the file later read into `sarcasm_corpus_df`).
!gdown 1_Y4HaQMTp7471uX4CjAMgl0zGQXyH7bu

Downloading...
From (original): https://drive.google.com/uc?id=1ORwdbXsIrWgauiMFEuaq6LTcT0xo1eur
From (redirected): https://drive.google.com/uc?id=1ORwdbXsIrWgauiMFEuaq6LTcT0xo1eur&confirm=t&uuid=5e5783df-9077-4210-bcb0-155f1874dcf4
To: /content/train-balanced-sarcasm.csv
100% 255M/255M [00:02<00:00, 104MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1_Y4HaQMTp7471uX4CjAMgl0zGQXyH7bu
To: /content/GEN-sarc-notsarc.csv
100% 1.72M/1.72M [00:00<00:00, 71.8MB/s]


In [None]:
# Read the smaller sarcasm corpus from the downloaded CSV and preview it.
# Its `class` column holds the strings 'sarc' / 'notsarc'.
sarcasm_corpus_df = pd.read_csv(filepath_or_buffer="GEN-sarc-notsarc.csv")
sarcasm_corpus_df

Unnamed: 0,class,id,text
0,notsarc,1,"If that's true, then Freedom of Speech is doom..."
1,notsarc,2,Neener neener - is it time to go in from the p...
2,notsarc,3,"Just like the plastic gun fear, the armour pie..."
3,notsarc,4,So geology is a religion because we weren't he...
4,notsarc,5,Well done Monty. Mark that up as your first ev...
...,...,...,...
6515,sarc,6516,depends on when the baby bird died. run alon...
6516,sarc,6517,"ok, sheesh, to clarify, women who arent aborti..."
6517,sarc,6518,so.. eh?? hows this sound? will it fly w...
6518,sarc,6519,"I think we should put to a vote, the right of ..."


In [None]:
# Read the large SARC dataset from the downloaded CSV and preview it.
# Label encoding: 1 is sarcastic, 0 is not.
sarc_df = pd.read_csv(filepath_or_buffer="train-balanced-sarcasm.csv")
sarc_df

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...
...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,TwarkMain,reddit.com,2,2,0,2009-04,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,..."
1010822,1,"whatever you do, don't vote green!",BCHarvey,climate,1,1,0,2009-05,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...
1010823,1,Perhaps this is an atheist conspiracy to make ...,rebelcommander,atheism,1,1,0,2009-01,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...
1010824,1,The Slavs got their own country - it is called...,catsi,worldnews,1,1,0,2009-01,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...


In [None]:
# Process and trim the SARC dataframe to just (text, label), keeping a
# class-balanced subset of 50 000 examples: the first 25 000 non-sarcastic
# and the first 25 000 sarcastic comments.
SARC_EXAMPLES_PER_CLASS = 25000

sarc_df_processed = sarc_df[['comment', 'label']].rename(columns={'comment': 'text'})

# Discard rows with a missing comment *before* trimming. Removing them only
# afterwards (as the split step does) leaves fewer than 25 000 usable rows in
# whichever class they fall into, so the subset is no longer exactly balanced.
sarc_df_processed = sarc_df_processed[sarc_df_processed['text'].notnull()]

first_half = sarc_df_processed.loc[sarc_df_processed['label'] == 0][:SARC_EXAMPLES_PER_CLASS]
second_half = sarc_df_processed.loc[sarc_df_processed['label'] == 1][:SARC_EXAMPLES_PER_CLASS]

# Non-sarcastic rows first, sarcastic rows second; shuffling happens later.
sarc_df_processed = pd.concat([first_half, second_half], axis=0)
print(len(sarc_df_processed))

50000


In [None]:
# Process and trim the sarcasm corpus dataframe to just (text, label).
#
# The string classes are converted to integer labels (1 = 'sarc', 0 = 'notsarc')
# with an explicit mapping followed by an integer cast. Assigning 0/1 into the
# string column in place would leave the column with object dtype; mapping and
# casting yields a real integer column. Any class value other than
# 'sarc' / 'notsarc' maps to NaN and makes the cast fail loudly instead of
# slipping through unconverted.
sarcasm_corpus_df_processed = (
    sarcasm_corpus_df[['text', 'class']]
    .rename(columns={'class': 'label'})
    .assign(label=lambda frame: frame['label'].map({'sarc': 1, 'notsarc': 0}).astype(int))
)
sarcasm_corpus_df_processed

Unnamed: 0,text,label
0,"If that's true, then Freedom of Speech is doom...",0
1,Neener neener - is it time to go in from the p...,0
2,"Just like the plastic gun fear, the armour pie...",0
3,So geology is a religion because we weren't he...,0
4,Well done Monty. Mark that up as your first ev...,0
...,...,...
6515,depends on when the baby bird died. run alon...,1
6516,"ok, sheesh, to clarify, women who arent aborti...",1
6517,so.. eh?? hows this sound? will it fly w...,1
6518,"I think we should put to a vote, the right of ...",1


In [None]:
# Report the class balance of each dataset: first the sarcasm corpus, then SARC.
# For each one, print the number of label-0 rows followed by the number of label-1 rows.
for frame in (sarcasm_corpus_df_processed, sarc_df_processed):
    for label_value in (0, 1):
        matching_rows = frame.loc[frame['label'] == label_value]
        print(len(matching_rows))

3260
3260
25000
25000


In [None]:
# split datasets into train, val and test (60%/20%/20%)
SPLIT_SEED = 42  # fixed seed so a fresh Restart & Run All reproduces the same splits


def _shuffle_and_split(df, seed):
    """Drop rows without text, shuffle, and cut into 60% / 20% / 20% partitions.

    Args:
        df: dataframe with a 'text' column.
        seed: value passed as ``random_state`` to ``DataFrame.sample`` for the shuffle.

    Returns:
        ``(shuffled, train, val, test)``: the cleaned, shuffled frame followed by
        three consecutive, non-overlapping slices of it.
    """
    shuffled = df[df['text'].notnull()].sample(frac=1, random_state=seed)
    # Derive the cut points from the row count that remains after cleaning
    # instead of hard-coding them, so the ratios still hold when rows were
    # dropped. Integer arithmetic avoids float rounding at the boundaries.
    train_end = len(shuffled) * 6 // 10
    val_end = len(shuffled) * 8 // 10
    return shuffled, shuffled[:train_end], shuffled[train_end:val_end], shuffled[val_end:]


(sarcasm_corpus_df_processed,
 train_data_corpus,
 val_data_corpus,
 test_data_corpus) = _shuffle_and_split(sarcasm_corpus_df_processed, SPLIT_SEED)

(sarc_df_processed,
 train_data_sarc,
 val_data_sarc,
 test_data_sarc) = _shuffle_and_split(sarc_df_processed, SPLIT_SEED)

# combine training and validation sets
train_data_all = pd.concat([train_data_corpus, train_data_sarc], axis=0)
val_data_all = pd.concat([val_data_corpus, val_data_sarc], axis=0)

# shuffle datasets randomly (seeded, so the row order is reproducible too)
train_data = train_data_all.sample(frac=1, random_state=SPLIT_SEED)
val_data = val_data_all.sample(frac=1, random_state=SPLIT_SEED)
test_data_corpus = test_data_corpus.sample(frac=1, random_state=SPLIT_SEED)
test_data_sarc = test_data_sarc.sample(frac=1, random_state=SPLIT_SEED)

# dataset schema: pandas dataframe (text, label)
print(train_data)

                                                    text label
15904                             Who goes in his place?     0
24774  Dang combine this super move with a pokemon th...     0
32878                                     You forgot the     1
19396                                                 in     0
54246                             no, he's not a puppet.     1
...                                                  ...   ...
39976  Maybe your babelfish is broke, refund and get ...     0
3786            I got mine too, Pidgey in my apartment !     0
14880  PPV buys, social media engagement, merchandise...     0
43485  You may have noticed faction scrams have three...     1
41074                       only if certain teams did it     1

[33912 rows x 2 columns]


Find the maximum sequence length in the training set, measured in number of words.

In [None]:
# Length of the longest training example, counted in whitespace-separated
# words (0 when there are no training examples). Each text is assumed to be a string.
# NOTE(review): this is a word count, yet it is later passed as max_seq_length;
# that setting is presumably counted in tokenizer tokens, which can outnumber
# words - TODO confirm and widen if examples are being truncated.
max_length = max((len(text.split()) for text in train_data['text']), default=0)
print(max_length)

200


# Transformer Models:

In [None]:
# Install the `transformers` and `simpletransformers` packages (-q keeps pip's output short).
# NOTE(review): no versions are pinned, so a later run may pull different releases - consider pinning.
!pip install -q transformers
!pip install -q simpletransformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m267.1/267.1 kB[0m [31m30.4

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging
import torch

# Use the GPU when PyTorch reports one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

# Request DEBUG-level logging in general, but raise the 'transformers' logger
# to WARNING.
logging.basicConfig(level=logging.DEBUG)
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.WARNING)

cuda


In [None]:
def print_metrics(x, model_name="Standard Roberta Transformer"):
    """Print accuracy plus per-class precision and recall for one evaluation result.

    Args:
        x: mapping holding the confusion-matrix counts under 'tp', 'tn', 'fp'
            and 'fn', and the overall accuracy under 'accuracy'. The sarcastic
            class is treated as the positive class.
        model_name: label used in the header line. The default reproduces the
            header this function has always printed.

    A precision or recall whose denominator is zero (for example when a class
    was never predicted) is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # An undefined ratio is reported as 0.0 rather than aborting the report.
        return numerator / denominator if denominator else 0.0

    # Calculate Evaluation Metrics:
    tp = x['tp']
    tn = x['tn']
    fp = x['fp']
    fn = x['fn']
    acc = x['accuracy']
    precision_sarcastic = _ratio(tp, tp + fp)
    precision_non_sarcastic = _ratio(tn, tn + fn)
    recall_sarcastic = _ratio(tp, tp + fn)
    recall_non_sarcastic = _ratio(tn, fp + tn)
    print(f"Metrics for {model_name}")
    print(f'Accuracy:{acc}')
    print(f'Precision_sarcastic: {precision_sarcastic}')
    print(f'Precision_nonsarcastic :{precision_non_sarcastic}')
    print(f'Recall_sarcastic :{recall_sarcastic }')
    print(f'Recall_nonsarcastic :{recall_non_sarcastic}')

First we'll train only the classification head on top of the pretrained RoBERTa base model (loaded through Simple Transformers), leaving the pretrained encoder weights unchanged.

In [None]:
# Learning rate for the classifier-head-only run; used both as the global
# learning rate and for the custom parameter group below.
HEAD_ONLY_LEARNING_RATE = 0.000004

# Define training arguments
model_args = ClassificationArgs(num_train_epochs=2,
                                train_batch_size=32,
                                learning_rate=HEAD_ONLY_LEARNING_RATE,
                                max_seq_length=max_length,
                                overwrite_output_dir=True,
                                evaluate_during_training=True,
                                # The classification head is randomly initialised when the
                                # checkpoint loads, so fix the seed to make runs repeatable.
                                manual_seed=42)

# Train only the classification head (the parameters reported as newly
# initialized when the checkpoint is loaded); everything else is not trained.
model_args.train_custom_parameters_only = True
model_args.custom_parameter_groups = [
    {"params": ["classifier.dense.weight",
                "classifier.dense.bias",
                "classifier.out_proj.weight",
                "classifier.out_proj.bias"],
     "lr": HEAD_ONLY_LEARNING_RATE}
]

# Load the RoBERTa model
use_cuda = 'cuda' in device
model = ClassificationModel('roberta', 'roberta-base', num_labels=2, args=model_args, use_cuda=use_cuda)

model.train_model(train_data, eval_df=val_data)

# Evaluate on the combined validation set and on each dataset's held-out test set.
# x / y / z hold the result dictionaries that are printed in the next cell.
x, model_outputs, wrong_predictions = model.eval_model(val_data)
y, model_outputs, wrong_predictions = model.eval_model(test_data_corpus)
z, model_outputs, wrong_predictions = model.eval_model(test_data_sarc)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/67 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/1060 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/22 [00:00<?, ?it/s]

Running Epoch 2 of 2:   0%|          | 0/1060 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/22 [00:00<?, ?it/s]

  self.pid = os.fork()
  self.pid = os.fork()


  0%|          | 0/22 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/22 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/114 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/2 [00:00<?, ?it/s]

  self.pid = os.fork()


Running Evaluation:   0%|          | 0/14 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/19 [00:00<?, ?it/s]

  self.pid = os.fork()


Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Print one metrics report per evaluation split, each preceded by its heading.
for heading, result in (("Validation Metrics", x),
                        ("\nCorpus Metrics", y),
                        ("\nSarc Metrics", z)):
    print(heading)
    print_metrics(result)

Validation Metrics
Metrics for Standard Roberta Transformer
Accuracy:0.5667020523708421
Precision_sarcastic: 0.6137549892539147
Precision_nonsarcastic :0.5476575121163166
Recall_sarcastic :0.3544954779216173
Recall_nonsarcastic :0.7779346866725507

Corpus Metrics
Metrics for Standard Roberta Transformer
Accuracy:0.5927914110429447
Precision_sarcastic: 0.8
Precision_nonsarcastic :0.553016453382084
Recall_sarcastic :0.2557077625570776
Recall_nonsarcastic :0.9350850077279753

Sarc Metrics
Metrics for Standard Roberta Transformer
Accuracy:0.5638127625525104
Precision_sarcastic: 0.6036036036036037
Precision_nonsarcastic :0.546779031566919
Recall_sarcastic :0.3631071858691289
Recall_nonsarcastic :0.7631578947368421


Now we fine-tune the entire RoBERTa model, using the pretrained weights as initialization.

In [None]:
# Define training arguments
model_args = ClassificationArgs(num_train_epochs=2,
                                train_batch_size=32,
                                learning_rate=0.00005,
                                max_seq_length=max_length,
                                overwrite_output_dir=True,
                                evaluate_during_training=True,
                                # The classification head is randomly initialised when the
                                # checkpoint loads, so fix the seed to make runs repeatable.
                                manual_seed=42)

# Load the RoBERTa model
use_cuda = 'cuda' in device
model = ClassificationModel('roberta', 'roberta-base', num_labels=2, args=model_args, use_cuda=use_cuda)

# Train and validate on the shuffled combined frames, the same ones the
# classifier-head-only run uses, so the runs are set up consistently.
model.train_model(train_data, eval_df=val_data)

# Evaluate on the combined validation set and on each dataset's held-out test set.
# x / y / z hold the result dictionaries that are printed in the next cell.
x, model_outputs, wrong_predictions = model.eval_model(val_data)
y, model_outputs, wrong_predictions = model.eval_model(test_data_corpus)
z, model_outputs, wrong_predictions = model.eval_model(test_data_sarc)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/67 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/1060 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/22 [00:00<?, ?it/s]

Running Epoch 2 of 2:   0%|          | 0/1060 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/22 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/22 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/22 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/114 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/14 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/19 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
print("Validation Metrics")
print_metrics(x)
print("\nCorpus Metrics")
print_metrics(y)
print("\nSarc Metrics")
print_metrics(z)

Validation Metrics
Metrics for Standard Roberta Transformer
Accuracy:0.7443382873319179
Precision_sarcastic: 0.7591929096737695
Precision_nonsarcastic :0.7312114647558741
Recall_sarcastic :0.7139563752438376
Recall_nonsarcastic :0.7745807590467785

Corpus Metrics
Metrics for Standard Roberta Transformer
Accuracy:0.7906441717791411
Precision_sarcastic: 0.8380281690140845
Precision_nonsarcastic :0.7540760869565217
Recall_sarcastic :0.7245053272450532
Recall_nonsarcastic :0.8578052550231839

Sarc Metrics
Metrics for Standard Roberta Transformer
Accuracy:0.7283456691338268
Precision_sarcastic: 0.7440758293838863
Precision_nonsarcastic :0.7147124719940254
Recall_sarcastic :0.6932958651144119
Recall_nonsarcastic :0.7631578947368421


In [None]:
# Define training arguments
model_args = ClassificationArgs(num_train_epochs=2,
                                train_batch_size=32,
                                learning_rate=0.00005,
                                max_seq_length=max_length,
                                overwrite_output_dir=True,
                                evaluate_during_training=True,
                                # The classification head is randomly initialised when the
                                # checkpoint loads, so fix the seed to make runs repeatable.
                                manual_seed=42)

# Define the class weights for sarcastic and non-sarcastic examples
weight_sarcastic = 2.0  # Example: higher weight for the sarcastic class
weight_non_sarcastic = 1.0  # Example: lower weight for the non-sarcastic class
# Ordered by label id: index 0 = not sarcastic, index 1 = sarcastic.
weights = [weight_non_sarcastic, weight_sarcastic]

# Load the RoBERTa model
use_cuda = 'cuda' in device
model = ClassificationModel('roberta', 'roberta-base', num_labels=2, args=model_args, use_cuda=use_cuda, weight=weights)

# Train and validate on the shuffled combined frames, the same ones the
# classifier-head-only run uses, so the runs are set up consistently.
model.train_model(train_data, eval_df=val_data)

# Evaluate on the combined validation set and on each dataset's held-out test set.
# x / y / z hold the result dictionaries that are printed in the next cell.
x, model_outputs, wrong_predictions = model.eval_model(val_data)
y, model_outputs, wrong_predictions = model.eval_model(test_data_corpus)
z, model_outputs, wrong_predictions = model.eval_model(test_data_sarc)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/67 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/1060 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/22 [00:00<?, ?it/s]

Running Epoch 2 of 2:   0%|          | 0/1060 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/22 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/22 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/22 [00:00<?, ?it/s]

  self.pid = os.fork()


Running Evaluation:   0%|          | 0/114 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/14 [00:00<?, ?it/s]

  self.pid = os.fork()


  0%|          | 0/19 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
print("Validation Metrics")
print_metrics(x)
print("\nCorpus Metrics")
print_metrics(y)
print("\nSarc Metrics")
print_metrics(z)

Validation Metrics
Metrics for Standard Roberta Transformer
Accuracy:0.7254953998584572
Precision_sarcastic: 0.6830022075055188
Precision_nonsarcastic :0.7895320470170769
Recall_sarcastic :0.8302325581395349
Recall_nonsarcastic :0.6230311515575779

Corpus Metrics
Metrics for Standard Roberta Transformer
Accuracy:0.8044478527607362
Precision_sarcastic: 0.7821782178217822
Precision_nonsarcastic :0.830820770519263
Recall_sarcastic :0.845565749235474
Recall_nonsarcastic :0.7630769230769231

Sarc Metrics
Metrics for Standard Roberta Transformer
Accuracy:0.707741548309662
Precision_sarcastic: 0.6601689408706952
Precision_nonsarcastic :0.7839666840187403
Recall_sarcastic :0.8304045770331018
Recall_nonsarcastic :0.5901253918495298
