In [1]:
! pip install emoji contractions



In [2]:
import pandas as pd
from tqdm.notebook import trange, tqdm

In [3]:
data1 = pd.read_csv('isarcasm_train.csv')
data2 = pd.read_csv('sarcasmheadlines_train.csv', index_col=0)
data3 = pd.read_csv('semeval2018_train.csv', index_col=0)
data4 = pd.read_csv('isarcasmeval_train.csv', usecols=['tweet', 'sarcastic'])
data4 = data4.rename(columns={"tweet": "text", "sarcastic": "sarcasm_label"})

data = pd.concat([data1, data2, data3, data4], ignore_index=True)
data

Unnamed: 0,text,sarcasm_label
0,sorry but sue from the mediweight advert looks...,0
1,@davesnyder since this is fantasy can it be an...,0
2,"Good times, fun times here in Grand old Britan...",1
3,I send light and love to all impacted by the #...,0
4,Another year closer to death,0
...,...,...
38713,The population spike in Chicago in 9 months is...,0
38714,You'd think in the second to last English clas...,0
38715,I’m finally surfacing after a holiday to Scotl...,0
38716,Couldn't be prouder today. Well done to every ...,0


In [4]:
# replace user tags with @user, consecutive whitespaces to single space,
# surround punctuations with space as in https://aclanthology.org/2022.semeval-1.133.pdf
# also replace emojis with text desc using 'emoji'
# !pip install emoji contractions

import re
import emoji
import contractions

def expand_contractions(text):
    expanded_text = contractions.fix(text)
    return expanded_text

def preprocess_tweet(tweet):
  # Replace tagged users with @user
  tweet = re.sub(r'@[\w]+', '@user', str(tweet))

  # Replace multiple white spaces with a single space
  tweet = re.sub(r'\s+', ' ', tweet)

  # Remove more than two successive occurrences of any punctuation
  tweet = re.sub(r'(\s*([.,!?;:()<>{}[\]\-+=~_$%^&]\s){2,})', r'\1', tweet)

  # Replace emoji with emoji descriptor
  tweet = emoji.demojize(tweet)

  # Put a single white space around punctuations
  tweet = re.sub(r'([.,!?;:()<>{}[\]\-+=~_$%^&*])', r' \1 ', tweet)

  # expand contractions
  tweet = expand_contractions(tweet)

  return tweet

In [5]:
data['text'] = data['text'].apply(preprocess_tweet)
data

Unnamed: 0,text,sarcasm_label
0,sorry but sue from the mediweight advert looks...,0
1,@user since this is fantasy can it be any indi...,0
2,"Good times , fun times here in Grand old Brit...",1
3,I send light and love to all impacted by the #...,0
4,Another year closer to death,0
...,...,...
38713,The population spike in Chicago in 9 months is...,0
38714,You would think in the second to last English ...,0
38715,I am finally surfacing after a holiday to Scot...,0
38716,Could not be prouder today . Well done to eve...,0


## LLM = GPT3

In [6]:
from transformers import pipeline
import numpy as np
import torch

device = 0 if torch.cuda.is_available() else 'cpu'

model_name = "EleutherAI/gpt-neo-1.3B"
zero_shot_classifier = pipeline("zero-shot-classification", model=model_name, device=device)

prompt = '''Here are 4 examples of Sarcastic and Not Sarcastic sentences:
            Sentence: The only thing I got from college is a caffeine addiction
            Output: Sarcastic
            
            Sentence: I love it when professors draw a big question mark next to my answer on an exam because I am always like yeah I do not either ¯\ _  ( ツ )  _ /¯ 
            Output: Sarcastic

            Sentence: 2br apt ,  Notts city centre https : //t . co/yFj989BdXR
            Output: Not Sarcastic
            
            Sentence: Motivation Wednesday Night :  I AM the powerhouse of the cell .
            Output: Not Sarcastic
            
            For the task of Sarcasm Detection, choose whether the following sentence is Sarcastic OR Not Sarcastic. \n'''

candidate_labels = ['Sarcastic','Not Sarcastic']

count = 0
for i in trange(len(data)):
  input_text = data['text'][i]
  true = data['sarcasm_label'][i]
  
  result = zero_shot_classifier(prompt+input_text, candidate_labels)
  pred = result['labels'][np.argmax(result['scores'])]

  if pred == 'Sarcastic' and true == 1:
    count+=1
  elif pred == 'Not Sarcastic' and true == 0:
    count+=1
  # print(np.argmax(result['scores']))
  # print(result)
  # print(pred)

2023-12-19 06:06:23.571974: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-19 06:06:24.264479: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of GPTNeoForSequenceClassification were not initialized from the model checkpoint at EleutherAI/gpt-neo-1.3B and are newly initialized: ['transformer.h.3.attn.attention.bias', 'transformer.h.7.attn.attention.bias', 'transformer.h.21.attn.attention.bias', 'score.weight', 'transformer.h.5.attn.attention.bias', 'transformer.h.9.attn.attention.bias', 'transformer

  0%|          | 0/38718 [00:00<?, ?it/s]

Using pad_token, but it is not set yet.
Tokenizer was not supporting padding necessary for zero-shot, attempting to use  `pad_token=eos_token`


In [7]:
print(count)

21143


In [8]:
print(count/len(data))

0.5460767601632316
