# Preprocessing iSarcasm dataset to extract useful data from survey

The purpose of this section is to preprocess the iSarcasm dataset to determine data suitable for use in the primary research survey. The goal is to extract subsets of sarcastic and non-sarcastic tweets from the d

In [16]:
#Import libraries
import pandas as pd
import re

In [14]:
#Read data in as CSV
df = pd.read_csv('isarcasm2022.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0


In [15]:
#Drop extraneous column
df.drop("Unnamed: 0", axis=1, inplace=True)

# Emoji containing subset

In [18]:
#Create subset of tweets which already contain emojis

#Convert non-string values to string format
df["tweet"] = df["tweet"].astype(str)

#Create a regex pattern to match emojis
emoji_pattern = re.compile("[\U0001F600-\U0001F64F" #Emoticons
                           "\U0001F300-\U0001F5FF"  #Symbols & pictographs
                           "\U0001F680-\U0001F6FF"  #Transport & map symbols
                           "\U0001F1E0-\U0001F1FF"  #Flags
                           "\U00002702-\U000027B0"  #Dingbats
                           "\U000024C2-\U0001F251"  #Enclosed characters
                           "]+", flags=re.UNICODE)

#Use the regex pattern to check if the tweet column contains emojis
df["has_emoji"] = df["tweet"].apply(lambda x: bool(re.search(emoji_pattern, x)))

#Extract the rows where the tweet column contains emojis
emoji_tweets = df[df["has_emoji"]]

#Display subset
emoji_tweets.head()

Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question,has_emoji
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0,True
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0,True
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0,True
16,My eldest is having a wild Friday night out. S...,1,My eldest is going to play bingo tonight.,1.0,1.0,0.0,0.0,0.0,0.0,True
24,@AsdaServiceTeam imagine your delivery being 2...,1,It's not acceptable for you to just refund my ...,1.0,0.0,0.0,0.0,0.0,0.0,True


In [13]:
#Check some of the tweets for emojis

#Select a random row from the DataFrame
random_row = emoji_tweets.sample(n=1)

#Retrieve the contents of the tweets column
random_tweet = random_row["tweet"].values[0]

#Print the tweet
print(random_tweet)

Did you know? Abuse and alcoholism and depression will just ✨disappear✨ if the country is opened completely during a pandemic? #trumpisanidiot #bidenharris2020


In [20]:
#Check if all rows in the tweet column contain an emoji
all_rows_have_emoji = emoji_tweets["tweet"].str.contains(emoji_pattern).all()

#Display result
print(all_rows_have_emoji)

True


In [21]:
#Determine how many tweets already contain emojis
emoji_counts = df["has_emoji"].value_counts()

#Print the count
print(emoji_counts)

#I have 566 data points already containing emojis for my work

False    2902
True      566
Name: has_emoji, dtype: int64


In [22]:
#Determine how many tweets containing emojis are sarcastic/non-sarcastic
sarcasm_counts = emoji_tweets["sarcastic"].value_counts()

#Print the count
print(sarcasm_counts)

#164 of the emoji containing tweets are sarcastic at present

0    402
1    164
Name: sarcastic, dtype: int64


# No emoji subset

In [23]:
# Create a subset of rows where "has_emoji" column is False
no_emoji_tweets = df[df["has_emoji"] == False].copy()

# Print the subset DataFrame
no_emoji_tweets.head()

Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question,has_emoji
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0,False
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0,False
5,"@jimrossignol I choose to interpret it as ""XD""...",1,It's a terrible name and the product sounds aw...,0.0,1.0,0.0,1.0,0.0,0.0,False
6,Why would Alexa's recipe for Yorkshire pudding...,1,Great recipe from Alexa,0.0,1.0,0.0,0.0,0.0,1.0,False
7,someone hit me w a horse tranquilizer istg ive...,1,Simply “I’m miserable.”,1.0,0.0,0.0,0.0,0.0,0.0,False


In [24]:
#Check some of the tweets for emojis

#Select a random row from the DataFrame
random_row = no_emoji_tweets.sample(n=1)

#Retrieve the contents of the tweets column
random_tweet = random_row["tweet"].values[0]

#Print the tweet
print(random_tweet)

I'm enjoying this @Jeopardy tournament WAY more than I expected. And I already came in at an 11. Loud claps. Jumping around my living room. #JeopardyGOAT


In [25]:
#Verify no tweets in the subset contain emojis
none_of_the_rows_have_emoji = not no_emoji_tweets["tweet"].str.contains(emoji_pattern).any()

#Display the results
print(none_of_the_rows_have_emoji)

True


In [26]:
#Determine how many tweets not containing emojis are sarcastic/non-sarcastic
sarcasm_counts = no_emoji_tweets["sarcastic"].value_counts()

#Print the count
print(sarcasm_counts)

0    2199
1     703
Name: sarcastic, dtype: int64


In [33]:
#Create the subsets for the survey

#Create sarcastic/ non-sarcastic tweets subsets
sarc = df[df["sarcastic"] == 1].copy()
nonsarc = df[df["sarcastic"] == 0].copy()

#Check content
sarc.shape, nonsarc.shape

((867, 10), (2601, 10))

In [34]:
#Create subsets containing tweets w/o emojis for both
sarc_noemo = sarc[~sarc["tweet"].str.contains(emoji_pattern)].copy()
nonsarc_noemo = nonsarc[~nonsarc["tweet"].str.contains(emoji_pattern)].copy()

#Check content
sarc_noemo.shape, nonsarc_noemo.shape

((703, 10), (2199, 10))

In [35]:
#Not all of the non-sarcatic tweets without emojis will be used for the survey- randomly select the necessary amount

#Randomly sample 465 rows
nonsarc_noemo_ss = nonsarc_noemo.sample(n=465, random_state=42)

#Check content
nonsarc_noemo_ss.shape

(465, 10)

In [42]:
#Export to CSV to input into survey
nonsarc_noemo.to_csv('nonsarc_noemoji_surveyqs.csv', index=False)

In [43]:
#Repeat for sarcastic emojis
sarc_noemo.to_csv('sarc_noemoji_surveyqs.csv', index=False)