# Compiling data

In [13]:
#Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [14]:
#Importing survey data
df_survey = pd.read_excel('survey_results_data.xlsx')
df_survey.head()

Unnamed: 0,participant_no,over_18,use_emojis,gender,age,original_text,class_survey,edited_text
0,1,Yes,Yes,Female,18-24,The only thing I got from college was a caffei...,It is sarcastic,The only thing I got from college was a caffei...
1,1,Yes,Yes,Female,18-24,@WalesOnline Riveting news.,I don't know,@WalesOnline Riveting news.
2,1,Yes,Yes,Female,18-24,"""My (extended) fam was discussing going on a t...",It is sarcastic,"""My (extended) fam was discussing going on a t..."
3,1,Yes,Yes,Female,18-24,i love shoegaze sm,It is sarcastic,i love shoegaze sm 😁
4,1,Yes,Yes,Female,18-24,men are so grimey,It is sarcastic,men are so grimey 😢


In [15]:
#Drop rows where the response to the survey did not yield a label
df_survey = df_survey.drop(df_survey[df_survey['class_survey'] == "I don't know"].index)
df_survey.shape

(704, 8)

In [16]:
#Check distribution of data for each label
df_survey["class_survey"].value_counts()

class_survey
It is not sarcastic    359
It is sarcastic        345
Name: count, dtype: int64

In [17]:
#Avoid bias from qc questions- drop duplicate text prompts and only keep one of each
df_survey = df_survey.drop_duplicates(subset=['original_text'])

In [18]:
#Check the percentage of results that have emojis

#Convert non-string values to string format
df_survey['edited_text'] = df_survey['edited_text'].astype(str)

#Create a regex pattern to match emojis
emoji_pattern = re.compile("[\U0001F600-\U0001F64F" #Emoticons
                           "\U0001F300-\U0001F5FF"  #Symbols & pictographs
                           "\U0001F680-\U0001F6FF"  #Transport & map symbols
                           "\U0001F1E0-\U0001F1FF"  #Flags
                           "\U00002702-\U000027B0"  #Dingbats
                           "\U000024C2-\U0001F251"  #Enclosed characters
                           "]+", flags=re.UNICODE)

#Use the regex pattern to check if the survey outputs have emojis
df_survey['has_emoji'] = df_survey['edited_text'].apply(lambda x: bool(re.search(emoji_pattern, x)))

#Calculate the percentage of rows that contain emojis
percentage_with_emojis = (df_survey["has_emoji"].sum() / len(df_survey)) * 100

#Print the result
print(f"Percentage of rows with emojis: {percentage_with_emojis:.2f}%")

Percentage of rows with emojis: 63.64%


In [20]:
#Convert labels to 0 and 1 for classification
df_survey['label'] = df_survey['class_survey'].replace({'It is sarcastic': 1, 'It is not sarcastic': 0})
df_survey.head()

Unnamed: 0,participant_no,over_18,use_emojis,gender,age,original_text,class_survey,edited_text,has_emoji,label
0,1,Yes,Yes,Female,18-24,The only thing I got from college was a caffei...,It is sarcastic,The only thing I got from college was a caffei...,True,1
2,1,Yes,Yes,Female,18-24,"""My (extended) fam was discussing going on a t...",It is sarcastic,"""My (extended) fam was discussing going on a t...",True,1
3,1,Yes,Yes,Female,18-24,i love shoegaze sm,It is sarcastic,i love shoegaze sm 😁,True,1
4,1,Yes,Yes,Female,18-24,men are so grimey,It is sarcastic,men are so grimey 😢,True,1
5,1,Yes,Yes,Female,18-24,okay but like the say so song aint that bad. I...,It is sarcastic,okay but like the say so song aint that bad. I...,True,1


In [21]:
#Rename the edited_text column
df_survey = df_survey.rename(columns={'edited_text': 'tweet'})
df_survey.head()

Unnamed: 0,participant_no,over_18,use_emojis,gender,age,original_text,class_survey,tweet,has_emoji,label
0,1,Yes,Yes,Female,18-24,The only thing I got from college was a caffei...,It is sarcastic,The only thing I got from college was a caffei...,True,1
2,1,Yes,Yes,Female,18-24,"""My (extended) fam was discussing going on a t...",It is sarcastic,"""My (extended) fam was discussing going on a t...",True,1
3,1,Yes,Yes,Female,18-24,i love shoegaze sm,It is sarcastic,i love shoegaze sm 😁,True,1
4,1,Yes,Yes,Female,18-24,men are so grimey,It is sarcastic,men are so grimey 😢,True,1
5,1,Yes,Yes,Female,18-24,okay but like the say so song aint that bad. I...,It is sarcastic,okay but like the say so song aint that bad. I...,True,1


In [27]:
#Drop all unnecessary columns for the classification task
classification_cols = ['tweet', 'label']
df = df_survey[classification_cols]
df.head()

Unnamed: 0,tweet,label
0,The only thing I got from college was a caffei...,1
2,"""My (extended) fam was discussing going on a t...",1
3,i love shoegaze sm 😁,1
4,men are so grimey 😢,1
5,okay but like the say so song aint that bad. I...,1


In [28]:
#Importing iSarcasm dataset
df_isarc = pd.read_csv('isarcasm2022.csv')
df_isarc.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0


In [29]:
#Drop all unnecessary columns for the classification task
classification_cols = ['tweet', 'sarcastic']
df_isarc = df_isarc[classification_cols]

#Rename the edited_text column
df_isarc = df_isarc.rename(columns={'sarcastic': 'label'})

#Show transformation
df_isarc.head()

Unnamed: 0,tweet,label
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [30]:
#Check distribution of labels
df_isarc['label'].value_counts()

label
0    2601
1     867
Name: count, dtype: int64

In [31]:
#Drop any tweets from the iSarcasm dataset that were used in the survey as prompts to avoid duplicated entries

#Get a list of unique tweets from the survey 
unique_tweets = df_survey['original_text'].unique()

#Remove rows which would give duplicates of text
df_isarc = df_isarc[~df_isarc['tweet'].isin(unique_tweets)]

#Check distribution of labels
df_isarc.shape

(3291, 2)

In [32]:
#Add the iSarcasm data to the survey data
df = pd.concat([df, df_isarc], ignore_index=True)

#Check the current quantity and distribution of data
df.shape, df['label'].value_counts()

((3995, 2),
 label
 0    2885
 1    1110
 Name: count, dtype: int64)

# Checking data augmentation methods

In [46]:
#Importing libraries 
import random
import nlpaug.augmenter.word as naw
from transformers import pipeline

In [37]:
#Define function to perform augmentation
def augment_text(text):
    aug = naw.SynonymAug(aug_src='wordnet')
    augmented_text = aug.augment(text)
    return augmented_text

In [39]:
#Randomly select 25 rows from the df to check the effects of augmentation
random.seed(42)
selected_rows = random.sample(range(len(df)), 25)

#Create a variable to store results
augmented_data = {'original_text': [], 'augmented_text': []}

#Augment text sample
for row in selected_rows:
    original_text = df.loc[row, 'tweet']
    augmented_text = augment_text(original_text)
    
    augmented_data['original_text'].append(original_text)
    augmented_data['augmented_text'].append(augmented_text)

#Convert to df
augmented_df = pd.DataFrame(augmented_data)

#Check results
augmented_df.head()

Unnamed: 0,original_text,augmented_text
0,"“Oven ready, shove it in the microwave” and th...","[“ Oven ready, thrust it in the microwave oven..."
1,@DarkenerWoW @wochinimen @MissPurplePixie shit...,[@ DarkenerWoW @ wochinimen @ MissPurplePixie ...
2,i have this reoccurring nightmare where i acci...,[i make this reoccurring incubus where i by ch...
3,"Happy Trans Visibility Day all my beautiful, s...","[Happy Trans Visibility Day all my beautiful, ..."
4,"""Should I wear my lime green pants?"" dad its 4...","["" Should Iodin wear my lime light green pants..."


In [44]:
#Check if it kept the emojis

#Convert non-string values to string format
augmented_df['augmented_text'] = augmented_df['augmented_text'].astype(str)
augmented_df['original_text'] = augmented_df['original_text'].astype(str)

#Use the regex pattern to check if the survey outputs have emojis
augmented_df['has_emoji'] = augmented_df['augmented_text'].apply(lambda x: bool(re.search(emoji_pattern, x)))
augmented_df['has_emoji_orig'] = augmented_df['original_text'].apply(lambda x: bool(re.search(emoji_pattern, x)))

#Check distribution with emojis
augmented_df['has_emoji'].value_counts(), augmented_df['has_emoji_orig'].value_counts()

(has_emoji
 False    14
 True     11
 Name: count, dtype: int64,
 has_emoji_orig
 False    14
 True     11
 Name: count, dtype: int64)

In [45]:
#Manually evaluate results
print('Before:')
print(augmented_df['original_text'].iloc[0])
print('After:')
print(augmented_df['augmented_text'].iloc[0])
print()
print('Before:')
print(augmented_df['original_text'].iloc[1])
print('After:')
print(augmented_df['augmented_text'].iloc[1])
print()
print('Before:')
print(augmented_df['original_text'].iloc[2])
print('After:')
print(augmented_df['augmented_text'].iloc[2])
print()
print('Before:')
print(augmented_df['original_text'].iloc[3])
print('After:')
print(augmented_df['augmented_text'].iloc[3])
print()
print('Before:')
print(augmented_df['original_text'].iloc[4])
print('After:')
print(augmented_df['augmented_text'].iloc[4])
print()
print('Before:')
print(augmented_df['original_text'].iloc[5])
print('After:')
print(augmented_df['augmented_text'].iloc[5])
print()
print('Before:')
print(augmented_df['original_text'].iloc[6])
print('After:')
print(augmented_df['augmented_text'].iloc[6])
print()
print('Before:')
print(augmented_df['original_text'].iloc[7])
print('After:')
print(augmented_df['augmented_text'].iloc[7])
print()
print('Before:')
print(augmented_df['original_text'].iloc[8])
print('After:')
print(augmented_df['augmented_text'].iloc[8])
print()
print('Before:')
print(augmented_df['original_text'].iloc[9])
print('After:')
print(augmented_df['augmented_text'].iloc[9])
print()

Before:
“Oven ready, shove it in the microwave” and this man is our PM✌🏼
After:
['“ Oven ready, thrust it in the microwave oven ” and this man follow our PM ✌ 🏼']

Before:
@DarkenerWoW @wochinimen @MissPurplePixie shit healer
After:
['@ DarkenerWoW @ wochinimen @ MissPurplePixie shit healer']

Before:
i have this reoccurring nightmare where i accidentally forget to put on shoes before school and i have to spend the whole day barefoot
After:
['i make this reoccurring incubus where i by chance forget to put on shoes before school day and i have to spend the whole daytime barefoot']

Before:
Happy Trans Visibility Day all my beautiful, strong, and sexy trans peeps!! We are unapologetically ourselves 💜
After:
['Happy Trans Visibility Day all my beautiful, unattackable, and aphrodisiac trans peeps! ! We are unapologetically ourselves 💜']

Before:
"Should I wear my lime green pants?" dad its 4th of july...
After:
['" Should Iodin wear my lime light green pants? " dad its fourth of july. ..']

In [50]:
#Try use of paraphrasing to augment text

#Randomly select 25 rows from the df to check the effects of augmentation
random.seed(51)
selected_rows = random.sample(range(len(df)), 25)

#Initialise text generation pipeline with pre-trained model
generator = pipeline("text-generation", model="EleutherAI/gpt-neo-1.3B")

#Function to perform text paraphrasing
def paraphrase_text(text, num_paraphrase=5):
    paraphrased_texts = []
    for _ in range(num_paraphrase):
        paraphrased_text = generator(text, max_length=200, num_return_sequences=1, do_sample=True)[0]["generated_text"]
        paraphrased_texts.append(paraphrased_text)
    return paraphrased_texts

#Create a variable to store results
paraphrased_data = {'original_text': [], 'paraphrased_text': []}

#Augment text sample
for row in selected_rows:
    original_text = df.loc[row, 'tweet']
    paraphrased_text = paraphrase_text(original_text)
    
    paraphrased_data['original_text'].append(original_text)
    paraphrased_data['paraphrased_text'].append(paraphrased_text)

#Convert to df
paraphrased_df = pd.DataFrame(paraphrased_data)

#Check results
paraphrased_df.head()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 64, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 64, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 64, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 64, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 64, but

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Unnamed: 0,original_text,paraphrased_text
0,"My dad just said to me, “Hey, you’re kinda ski...","[My dad just said to me, “Hey, you’re kinda sk..."
1,Post university anxiety is creeping in now lik...,[Post university anxiety is creeping in now li...
2,My department is hiring in metaphysics and epi...,[My department is hiring in metaphysics and ep...
3,Seeing my friends shine is so heartwarming 🥺🥺,[Seeing my friends shine is so heartwarming 🥺🥺...
4,If your website still has a google plus share ...,[If your website still has a google plus share...


In [51]:
#Check if it kept the emojis

#Convert non-string values to string format
paraphrased_df['paraphrased_text'] = paraphrased_df['paraphrased_text'].astype(str)
paraphrased_df['original_text'] = paraphrased_df['original_text'].astype(str)

#Use the regex pattern to check if the survey outputs have emojis
paraphrased_df['has_emoji'] = paraphrased_df['paraphrased_text'].apply(lambda x: bool(re.search(emoji_pattern, x)))
paraphrased_df['has_emoji_orig'] = paraphrased_df['original_text'].apply(lambda x: bool(re.search(emoji_pattern, x)))

#Check distribution with emojis
paraphrased_df['has_emoji'].value_counts(), paraphrased_df['has_emoji_orig'].value_counts()

(has_emoji
 False    20
 True      5
 Name: count, dtype: int64,
 has_emoji_orig
 False    23
 True      2
 Name: count, dtype: int64)

In [52]:
#Manually evaluate results
print('Before:')
print(paraphrased_df['original_text'].iloc[0])
print('After:')
print(paraphrased_df['paraphrased_text'].iloc[0])
print()
print('Before:')
print(paraphrased_df['original_text'].iloc[1])
print('After:')
print(paraphrased_df['paraphrased_text'].iloc[1])
print()
print('Before:')
print(paraphrased_df['original_text'].iloc[2])
print('After:')
print(paraphrased_df['paraphrased_text'].iloc[2])
print()
print('Before:')
print(paraphrased_df['original_text'].iloc[3])
print('After:')
print(paraphrased_df['paraphrased_text'].iloc[3])
print()
print('Before:')
print(paraphrased_df['original_text'].iloc[4])
print('After:')
print(paraphrased_df['paraphrased_text'].iloc[4])
print()
print('Before:')
print(paraphrased_df['original_text'].iloc[5])
print('After:')
print(paraphrased_df['paraphrased_text'].iloc[5])
print()
print('Before:')
print(paraphrased_df['original_text'].iloc[6])
print('After:')
print(paraphrased_df['paraphrased_text'].iloc[6])
print()
print('Before:')
print(paraphrased_df['original_text'].iloc[7])
print('After:')
print(paraphrased_df['paraphrased_text'].iloc[7])
print()
print('Before:')
print(paraphrased_df['original_text'].iloc[8])
print('After:')
print(paraphrased_df['paraphrased_text'].iloc[8])
print()
print('Before:')
print(paraphrased_df['original_text'].iloc[9])
print('After:')
print(paraphrased_df['paraphrased_text'].iloc[9])

Before:
My dad just said to me, “Hey, you’re kinda skinny, can you slip behind the fridge to look at something?” and it might be the nicest compliment I’ve ever gotten. So guys, go tell your girl you think she’s kinda skinny🥰
After:
['My dad just said to me, “Hey, you’re kinda skinny, can you slip behind the fridge to look at something?” and it might be the nicest compliment I’ve ever gotten. So guys, go tell your girl you think she’s kinda skinny🥰 (', 'My dad just said to me, “Hey, you’re kinda skinny, can you slip behind the fridge to look at something?” and it might be the nicest compliment I’ve ever gotten. So guys, go tell your girl you think she’s kinda skinny🥰\n', 'My dad just said to me, “Hey, you’re kinda skinny, can you slip behind the fridge to look at something?” and it might be the nicest compliment I’ve ever gotten. So guys, go tell your girl you think she’s kinda skinny🥰�', 'My dad just said to me, “Hey, you’re kinda skinny, can you slip behind the fridge to look at some

# Collect datasets for compilation with current data

In [68]:
#Importing SarcEval dataset
df_eval = pd.read_csv('english_task_a.csv')
df_eval.head()

Unnamed: 0,text,sarcasm_ref,human_aggregated,#humans_sarcasm
0,"Size on the the Toulouse team, That pack is mo...",0,0,1
1,Pinball!,0,0,0
2,So the Scottish Government want people to get ...,1,1,4
3,villainous pro tip : change the device name on...,0,0,0
4,I would date any of these men 🥺,0,0,1


In [69]:
#Format for the evaluation

#Rename the label column
df_eval = df_eval.rename(columns={'sarcasm_ref': 'label'})

#Drop all unnecessary columns
classification_cols = ['text', 'label']
df_eval = df_eval[classification_cols]

#Check df format and size
df_eval.head(), df_eval.shape

(                                                text  label
 0  Size on the the Toulouse team, That pack is mo...      0
 1                                           Pinball!      0
 2  So the Scottish Government want people to get ...      1
 3  villainous pro tip : change the device name on...      0
 4                    I would date any of these men 🥺      0,
 (1400, 2))

In [70]:
#Importing #sarcasm and #irony dataset
df_add = pd.read_csv('train.csv')
df_add.head()

Unnamed: 0,tweets,class
0,Be aware dirty step to get money #staylight ...,figurative
1,#sarcasm for #people who don't understand #diy...,figurative
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative
3,@wilw Why do I get the feeling you like games?...,figurative
4,-@TeacherArthurG @rweingarten You probably jus...,figurative


In [71]:
#Format for the evaluation

#Rename the label column
df_add = df_add.rename(columns={'tweets': 'text'})

#All tweets are sarcastic- relabel them all as 1
df_add['label'] = 1

#Drop all unnecessary columns
classification_cols = ['text', 'label']
df_add = df_add[classification_cols]

#Check df format and size
df_add.head(), df_add.shape

(                                                text  label
 0  Be aware  dirty step to get money  #staylight ...      1
 1  #sarcasm for #people who don't understand #diy...      1
 2  @IminworkJeremy @medsingle #DailyMail readers ...      1
 3  @wilw Why do I get the feeling you like games?...      1
 4  -@TeacherArthurG @rweingarten You probably jus...      1,
 (81408, 2))

In [72]:
#Remove specific hashtags from the text
df_add['text'] = df_add['text'].str.replace(r'#sarcasm\b', '', regex=True)
df_add['text'] = df_add['text'].str.replace(r'#irony\b', '', regex=True)
df_add['text'] = df_add['text'].str.replace(r'#sarcastic\b', '', regex=True)

#Check transformation
df_add.head()

Unnamed: 0,text,label
0,Be aware dirty step to get money #staylight ...,1
1,for #people who don't understand #diy #artatt...,1
2,@IminworkJeremy @medsingle #DailyMail readers ...,1
3,@wilw Why do I get the feeling you like games?,1
4,-@TeacherArthurG @rweingarten You probably jus...,1


In [73]:
#Repeat using the test data

#Importing #sarcasm and #irony dataset
df_add1 = pd.read_csv('test.csv')
df_add1.head()

Unnamed: 0,tweets,class
0,no one ever predicted this was going to happen...,figurative
1,@Stooshie its as closely related as Andrews or...,figurative
2,I find it ironic when Vegans say they love foo...,figurative
3,Quick rt that throwing money vine I've not see...,figurative
4,"yep, keep adding me to your #devops lists.... ...",figurative


In [74]:
#Format for the evaluation

#Rename the label column
df_add1 = df_add1.rename(columns={'tweets': 'text'})

#All tweets are sarcastic- relabel them all as 1
df_add1['label'] = 1

#Drop all unnecessary columns
classification_cols = ['text', 'label']
df_add1 = df_add1[classification_cols]

#Remove specific hashtags from the text
df_add1['text'] = df_add1['text'].str.replace(r'#sarcasm\b', '', regex=True)
df_add1['text'] = df_add1['text'].str.replace(r'#irony\b', '', regex=True)
df_add1['text'] = df_add1['text'].str.replace(r'#sarcastic\b', '', regex=True)

#Check df format and size
df_add1.head(), df_add1.shape

(                                                text  label
 0  no one ever predicted this was going to happen...      1
 1  @Stooshie its as closely related as Andrews or...      1
 2  I find it ironic when Vegans say they love foo...      1
 3  Quick rt that throwing money vine I've not see...      1
 4  yep, keep adding me to your #devops lists.... ...      1,
 (8128, 2))

In [75]:
#Combine the dfs using the same collection strategy
df_add = pd.concat([df_add, df_add1], ignore_index=True)

#Check df size and form
df_add.head(), df_add.shape

(                                                text  label
 0  Be aware  dirty step to get money  #staylight ...      1
 1   for #people who don't understand #diy #artatt...      1
 2  @IminworkJeremy @medsingle #DailyMail readers ...      1
 3    @wilw Why do I get the feeling you like games?       1
 4  -@TeacherArthurG @rweingarten You probably jus...      1,
 (89536, 2))

In [81]:
#Check distribution of labels
df_add['label'].value_counts()

label
1    89536
Name: count, dtype: int64

In [76]:
#Specify the file path to read in next dataset- dataset collected and annotated using weak labels -> #not and offensive vocab
file_path = 'Train_v1.txt' 

#Read the data into a df
df_add1 = pd.read_csv(file_path, delimiter='\t', header=None, names=['Column1', 'Column2', 'Column3'])

#Check the df
df_add1.head()

Unnamed: 0,Column1,Column2,Column3
0,TrainSen,0,@0430yes i hope youre lurking rn. i want to li...
1,TrainSen,0,05 really taught me a valuable lesson I'm neve...
2,TrainSen,0,"@098BERRY Never had a voice to protest, so you..."
3,TrainSen,0,@0hMySt4rs Rest in peace & love to you and you...
4,TrainSen,0,100 days until Christmas! 🌲 #too soon #not rea...


In [77]:
#Rename cols based on the contents
df_add1 = df_add1.rename(columns={'Column2': 'label'})
df_add1 = df_add1.rename(columns={'Column3': 'text'})

#Drop unnecessary columns
classification_cols = ['text', 'label']
df_add1 = df_add1[classification_cols]

#Check df form and size
df_add1.head(), df_add1.shape

(                                                text  label
 0  @0430yes i hope youre lurking rn. i want to li...      0
 1  05 really taught me a valuable lesson I'm neve...      0
 2  @098BERRY Never had a voice to protest, so you...      0
 3  @0hMySt4rs Rest in peace & love to you and you...      0
 4  100 days until Christmas! 🌲 #too soon #not rea...      0,
 (39780, 2))

In [79]:
#Dataset collected and annotated using weak labels -> #not and offensive vocab

#Specify the file path to read in next dataset 
file_path = 'Test_v1.txt' 

#Read the data into a df
df_add2 = pd.read_csv(file_path, delimiter='\t', header=None, names=['Column1', 'label', 'text'])

#Drop unnecessary columns
classification_cols = ['text', 'label']
df_add2 = df_add2[classification_cols]

#Check the df
df_add2.head(), df_add2.shape

(                                                text  label
 0  I loovee when people text back ... 😒 #sarcasti...      1
 1  Don't you love it when your parents are Pissed...      1
 2      So many useless classes , great to be student      1
 3  Oh how I love getting home from work at 3am an...      1
 4          I just love having grungy ass hair 😑 #not      1,
 (1975, 2))

In [80]:
#Combine the dfs using the same collection strategy
df_add1 = pd.concat([df_add1, df_add2], ignore_index=True)

#Check df size and form
df_add1.head(), df_add1.shape

(                                                text  label
 0  @0430yes i hope youre lurking rn. i want to li...      0
 1  05 really taught me a valuable lesson I'm neve...      0
 2  @098BERRY Never had a voice to protest, so you...      0
 3  @0hMySt4rs Rest in peace & love to you and you...      0
 4  100 days until Christmas! 🌲 #too soon #not rea...      0,
 (41755, 2))

In [82]:
#Check distribution of labels
df_add1['label'].value_counts()

label
0    22267
1    19488
Name: count, dtype: int64

In [84]:
#Dataset by Riloff

#Specify the file path to read in next dataset
file_path = 'train.txt' 

#Read the data into a df
df_add2 = pd.read_csv(file_path, delimiter='\t', header=None, names=['text'])

#All tweets are sarcastic- relabel them all as 1
df_add2['label'] = 1

#Check the df
df_add2.head(), df_add2.shape

(                                                text  label
 0  Nih min buat fans arsenal @my_supersoccer :D h...      1
 1  Give a person power that will be a true test o...      1
 2  @LordWilsonVILLA At 21 he looks to have a lot ...      1
 3  I'm about to fall asleep and I still have to b...      1
 4  I love hearing the shots from the shooting ran...      1,
 (1368, 2))

In [85]:
#Repeat for the train data

#Specify the file path to read in next dataset
file_path = 'test.txt' 

#Read the data into a df
df_add3 = pd.read_csv(file_path, delimiter='\t', header=None, names=['text'])

#All tweets are sarcastic- relabel them all as 1
df_add3['label'] = 1

#Check the df
df_add3.head(), df_add3.shape

(                                                text  label
 0  Absolutely love when water is spilt on my phon...      1
 1  I was hoping just a LITTLE more shit could hit...      1
 2  @pdomo Don't forget that Nick Foles is also th...      1
 3  I constantly see tweets about Arsenal on twitt...      1
 4  Can feel the feet pulsating...slow one...becau...      1,
 (588, 2))

In [86]:
#Combine the dfs using the same collection strategy
df_add2 = pd.concat([df_add2, df_add3], ignore_index=True)

#Check df size and form
df_add2.head(), df_add2.shape

(                                                text  label
 0  Nih min buat fans arsenal @my_supersoccer :D h...      1
 1  Give a person power that will be a true test o...      1
 2  @LordWilsonVILLA At 21 he looks to have a lot ...      1
 3  I'm about to fall asleep and I still have to b...      1
 4  I love hearing the shots from the shooting ran...      1,
 (1956, 2))

In [87]:
#Check distribution of labels
df_add2['label'].value_counts()

label
1    1956
Name: count, dtype: int64

In [88]:
#Dataset by Ghosh- uses weak labelling with #sarcasm

#Specify the file path to read in next dataset
file_path = 'train (1).txt' 

#Read the data into a df
df_add3 = pd.read_csv(file_path, delimiter='\t', header=None, names=['Column1', 'Column2', 'Column3'])

#Check the df
df_add3.head()

Unnamed: 0,Column1,Column2,Column3
0,TrainSen,1,It feels like just a few days ago it was the l...
1,TrainSen,1,"I love my mom . No matter what we go through ,..."
2,TrainSen,1,Bump that music ... #imtryingtosleep #sarcasm
3,TrainSen,0,Mexican and black jokes are pretty much the sa...
4,TrainSen,0,How to find work you love :


In [90]:
#Rename cols based on the contents
df_add3 = df_add3.rename(columns={'Column2': 'label'})
df_add3 = df_add3.rename(columns={'Column3': 'text'})

#Drop unnecessary columns
classification_cols = ['text', 'label']
df_add3 = df_add3[classification_cols]

#Remove weak label
df_add3['text'] = df_add3['text'].str.replace(r'#sarcasm\b', '', regex=True)

#Check df form and size
df_add3.head(), df_add3.shape

(                                                text  label
 0  It feels like just a few days ago it was the l...      1
 1  I love my mom . No matter what we go through ,...      1
 2              Bump that music ... #imtryingtosleep       1
 3  Mexican and black jokes are pretty much the sa...      0
 4                        How to find work you love :      0,
 (51189, 2))

In [91]:
#Repeat for test data

#Specify the file path to read in next dataset
file_path = 'test (1).txt' 

#Read the data into a df
df_add4 = pd.read_csv(file_path, delimiter='\t', header=None, names=['Column1', 'label', 'text'])

#Drop unnecessary columns
classification_cols = ['text', 'label']
df_add4 = df_add4[classification_cols]

#Remove weak label
df_add4['text'] = df_add4['text'].str.replace(r'#sarcasm\b', '', regex=True)

#Combine the dfs using the same collection strategy
df_add3 = pd.concat([df_add3, df_add4], ignore_index=True)

#Check df form and size
df_add3.head(), df_add3.shape

(                                                text  label
 0  It feels like just a few days ago it was the l...      1
 1  I love my mom . No matter what we go through ,...      1
 2              Bump that music ... #imtryingtosleep       1
 3  Mexican and black jokes are pretty much the sa...      0
 4                        How to find work you love :      0,
 (54877, 2))

# Filter for sarcastic tweets with similar important features to the sarcastic features identified before