# Preprocessing iSarcasm dataset to extract useful data from survey

The purpose of this section is to preprocess the iSarcasm dataset to determine data suitable for use in the primary research survey. The goal is to extract subsets of sarcastic and non-sarcastic tweets from the d

In [2]:
#Import libraries
import pandas as pd
import re

In [14]:
#Read data in as CSV
df = pd.read_csv('isarcasm2022.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0


In [15]:
#Drop extraneous column
df.drop("Unnamed: 0", axis=1, inplace=True)

# Emoji containing subset

In [18]:
#Create subset of tweets which already contain emojis

#Convert non-string values to string format
df["tweet"] = df["tweet"].astype(str)

#Create a regex pattern to match emojis
emoji_pattern = re.compile("[\U0001F600-\U0001F64F" #Emoticons
                           "\U0001F300-\U0001F5FF"  #Symbols & pictographs
                           "\U0001F680-\U0001F6FF"  #Transport & map symbols
                           "\U0001F1E0-\U0001F1FF"  #Flags
                           "\U00002702-\U000027B0"  #Dingbats
                           "\U000024C2-\U0001F251"  #Enclosed characters
                           "]+", flags=re.UNICODE)

#Use the regex pattern to check if the tweet column contains emojis
df["has_emoji"] = df["tweet"].apply(lambda x: bool(re.search(emoji_pattern, x)))

#Extract the rows where the tweet column contains emojis
emoji_tweets = df[df["has_emoji"]]

#Display subset
emoji_tweets.head()

Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question,has_emoji
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0,True
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0,True
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0,True
16,My eldest is having a wild Friday night out. S...,1,My eldest is going to play bingo tonight.,1.0,1.0,0.0,0.0,0.0,0.0,True
24,@AsdaServiceTeam imagine your delivery being 2...,1,It's not acceptable for you to just refund my ...,1.0,0.0,0.0,0.0,0.0,0.0,True


In [13]:
#Check some of the tweets for emojis

#Select a random row from the DataFrame
random_row = emoji_tweets.sample(n=1)

#Retrieve the contents of the tweets column
random_tweet = random_row["tweet"].values[0]

#Print the tweet
print(random_tweet)

Did you know? Abuse and alcoholism and depression will just ✨disappear✨ if the country is opened completely during a pandemic? #trumpisanidiot #bidenharris2020


In [20]:
#Check if all rows in the tweet column contain an emoji
all_rows_have_emoji = emoji_tweets["tweet"].str.contains(emoji_pattern).all()

#Display result
print(all_rows_have_emoji)

True


In [21]:
#Determine how many tweets already contain emojis
emoji_counts = df["has_emoji"].value_counts()

#Print the count
print(emoji_counts)

#I have 566 data points already containing emojis for my work

False    2902
True      566
Name: has_emoji, dtype: int64


In [22]:
#Determine how many tweets containing emojis are sarcastic/non-sarcastic
sarcasm_counts = emoji_tweets["sarcastic"].value_counts()

#Print the count
print(sarcasm_counts)

#164 of the emoji containing tweets are sarcastic at present

0    402
1    164
Name: sarcastic, dtype: int64


# No emoji subset

In [23]:
# Create a subset of rows where "has_emoji" column is False
no_emoji_tweets = df[df["has_emoji"] == False].copy()

# Print the subset DataFrame
no_emoji_tweets.head()

Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question,has_emoji
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0,False
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0,False
5,"@jimrossignol I choose to interpret it as ""XD""...",1,It's a terrible name and the product sounds aw...,0.0,1.0,0.0,1.0,0.0,0.0,False
6,Why would Alexa's recipe for Yorkshire pudding...,1,Great recipe from Alexa,0.0,1.0,0.0,0.0,0.0,1.0,False
7,someone hit me w a horse tranquilizer istg ive...,1,Simply “I’m miserable.”,1.0,0.0,0.0,0.0,0.0,0.0,False


In [24]:
#Check some of the tweets for emojis

#Select a random row from the DataFrame
random_row = no_emoji_tweets.sample(n=1)

#Retrieve the contents of the tweets column
random_tweet = random_row["tweet"].values[0]

#Print the tweet
print(random_tweet)

I'm enjoying this @Jeopardy tournament WAY more than I expected. And I already came in at an 11. Loud claps. Jumping around my living room. #JeopardyGOAT


In [25]:
#Verify no tweets in the subset contain emojis
none_of_the_rows_have_emoji = not no_emoji_tweets["tweet"].str.contains(emoji_pattern).any()

#Display the results
print(none_of_the_rows_have_emoji)

True


In [26]:
#Determine how many tweets not containing emojis are sarcastic/non-sarcastic
sarcasm_counts = no_emoji_tweets["sarcastic"].value_counts()

#Print the count
print(sarcasm_counts)

0    2199
1     703
Name: sarcastic, dtype: int64


In [33]:
#Create the subsets for the survey

#Create sarcastic/ non-sarcastic tweets subsets
sarc = df[df["sarcastic"] == 1].copy()
nonsarc = df[df["sarcastic"] == 0].copy()

#Check content
sarc.shape, nonsarc.shape

((867, 10), (2601, 10))

In [34]:
#Create subsets containing tweets w/o emojis for both
sarc_noemo = sarc[~sarc["tweet"].str.contains(emoji_pattern)].copy()
nonsarc_noemo = nonsarc[~nonsarc["tweet"].str.contains(emoji_pattern)].copy()

#Check content
sarc_noemo.shape, nonsarc_noemo.shape

((703, 10), (2199, 10))

In [35]:
#Not all of the non-sarcatic tweets without emojis will be used for the survey- randomly select the necessary amount

#Randomly sample 465 rows
nonsarc_noemo_ss = nonsarc_noemo.sample(n=465, random_state=42)

#Check content
nonsarc_noemo_ss.shape

(465, 10)

In [42]:
#Export to CSV to input into survey
nonsarc_noemo.to_csv('nonsarc_noemoji_surveyqs.csv', index=False)

In [43]:
#Repeat for sarcastic emojis
sarc_noemo.to_csv('sarc_noemoji_surveyqs.csv', index=False)

# Processing results

In [3]:
#Importing isarcasm dataset
df_isarcasm = pd.read_csv('isarcasm2022.csv')
df_isarcasm.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0


In [7]:
#Importing survey results
df_survey = pd.read_excel('survey_results_data.xlsx')
df_survey.head()

Unnamed: 0,participant_no,over_18,use_emojis,gender,age,original_text,class_survey,edited_text
0,1,Yes,Yes,Female,18-24,The only thing I got from college was a caffei...,It is sarcastic,The only thing I got from college was a caffei...
1,1,Yes,Yes,Female,18-24,@WalesOnline Riveting news.,I don't know,@WalesOnline Riveting news.
2,1,Yes,Yes,Female,18-24,"""My (extended) fam was discussing going on a t...",It is sarcastic,"""My (extended) fam was discussing going on a t..."
3,1,Yes,Yes,Female,18-24,i love shoegaze sm,It is sarcastic,i love shoegaze sm 😁
4,1,Yes,Yes,Female,18-24,men are so grimey,It is sarcastic,men are so grimey 😢


# Stats for survey results

In [9]:
#How many responses 
df_survey['participant_no'].nunique()

87

In [10]:
#All over 18?
df_survey['over_18'].nunique()

1

In [11]:
#All emoji users?
df_survey['use_emojis'].nunique()

1

In [14]:
#Gender response breakdown

#Remove all but one response from each participant
df_unique_participants = df_survey.drop_duplicates(subset='participant_no')

#Check responses by gender
df_unique_participants['gender'].value_counts()

gender
Female                 49
Male                   35
I don't want to say     1
Other                   1
Queer                   1
Name: count, dtype: int64

In [18]:
#Age response breakdown
print('Overall age breakdown:')
print(df_unique_participants['age'].value_counts())

#Break down by gender
male_subset = df_unique_participants[df_unique_participants['gender'] == 'Male']
print('Breakdown for male respondents:')
print(male_subset['age'].value_counts())
    
female_subset = df_unique_participants[df_unique_participants['gender'] == 'Female']
print('Breakdown for female respondents:')
print(female_subset['age'].value_counts())

Overall age breakdown:
age
25-34    29
18-24    27
35-44    24
45-64     7
Name: count, dtype: int64
Breakdown for male respondents:
age
18-24    11
35-44    11
25-34     8
45-64     5
Name: count, dtype: int64
Breakdown for female respondents:
age
25-34    19
18-24    15
35-44    13
45-64     2
Name: count, dtype: int64


In [19]:
#Check distribution of classifications
df_survey['class_survey'].value_counts()

class_survey
It is not sarcastic    359
It is sarcastic        345
I don't know           128
Name: count, dtype: int64

In [22]:
#Break this down by gender

#Remake the gender subsets to include all responses
male_subset = df_survey[df_survey['gender'] == 'Male']
female_subset = df_survey[df_survey['gender'] == 'Female']

print('Male response breakdown:')
print(male_subset['class_survey'].value_counts())

print('Female response breakdown:')
print(female_subset['class_survey'].value_counts())

Male response breakdown:
class_survey
It is sarcastic        141
It is not sarcastic    135
I don't know            58
Name: count, dtype: int64
Female response breakdown:
class_survey
It is not sarcastic    209
It is sarcastic        196
I don't know            63
Name: count, dtype: int64


In [24]:
#Break this down by ages
a_subset = df_survey[df_survey['age'] == '18-24']
b_subset = df_survey[df_survey['age'] == '25-34']
c_subset = df_survey[df_survey['age'] == '35-44']
d_subset = df_survey[df_survey['age'] == '45-64']

print('18-24 age group response breakdown:')
print(a_subset['class_survey'].value_counts())

print('25-34 age group response breakdown:')
print(b_subset['class_survey'].value_counts())

print('35-44 age group response breakdown:')
print(c_subset['class_survey'].value_counts())

print('45-64 age group response breakdown:')
print(d_subset['class_survey'].value_counts())

18-24 age group response breakdown:
class_survey
It is not sarcastic    114
It is sarcastic        111
I don't know            35
Name: count, dtype: int64
25-34 age group response breakdown:
class_survey
It is not sarcastic    130
It is sarcastic        114
I don't know            50
Name: count, dtype: int64
35-44 age group response breakdown:
class_survey
It is sarcastic        95
It is not sarcastic    83
I don't know           35
Name: count, dtype: int64
45-64 age group response breakdown:
class_survey
It is not sarcastic    32
It is sarcastic        25
I don't know            8
Name: count, dtype: int64


# Quality control question evaluation

In [41]:
#Search for columns that contain the first QC question
phrase = 'caffeine addiction'
qc = df_survey[df_survey['original_text'].str.contains(phrase, case=False)]

# Print the rows where the phrase was found
qc.head(10)

Unnamed: 0,participant_no,over_18,use_emojis,gender,age,original_text,class_survey,edited_text
0,1,Yes,Yes,Female,18-24,The only thing I got from college was a caffei...,It is sarcastic,The only thing I got from college was a caffei...
10,2,Yes,Yes,Male,18-24,The only thing I got from college was a caffei...,It is sarcastic,The only thing I got from college is a caffein...
20,3,Yes,Yes,I don't want to say,25-34,The only thing I got from college is a caffein...,It is sarcastic,The only thing I got from college is a caffein...
30,4,Yes,Yes,Female,18-24,The only thing I got from college is a caffein...,It is sarcastic,The only thing I got from college is a caffein...
40,5,Yes,Yes,Female,18-24,The only thing I got from college is a caffein...,It is sarcastic,The only thing I got from college is a caffein...
50,6,Yes,Yes,Male,18-24,The only thing I got from college is a caffein...,It is sarcastic,The only thing I got from college is a caffein...
143,15,Yes,Yes,Female,18-24,The only thing I got from college is a caffein...,It is sarcastic,The only thing I got from college is a caffein...


In [36]:
#Breakdown of responses for this question
qc['class_survey'].value_counts()

class_survey
It is sarcastic    7
Name: count, dtype: int64

In [42]:
#Print the responses
print('Response 1:')
print(qc['edited_text'].iloc[0])
print('Age:')
print(qc['age'].iloc[0])
print('Gender:')
print(qc['gender'].iloc[0])
print('')
print('Response 2:')
print(qc['edited_text'].iloc[1])
print('Age:')
print(qc['age'].iloc[1])
print('Gender:')
print(qc['gender'].iloc[1])
print('')
print('Response 3:')
print(qc['edited_text'].iloc[2])
print('Age:')
print(qc['age'].iloc[2])
print('Gender:')
print(qc['gender'].iloc[2])
print('')
print('Response 4:')
print(qc['edited_text'].iloc[3])
print('Age:')
print(qc['age'].iloc[3])
print('Gender:')
print(qc['gender'].iloc[3])
print('')
print('Response 5:')
print(qc['edited_text'].iloc[4])
print('Age:')
print(qc['age'].iloc[4])
print('Gender:')
print(qc['gender'].iloc[4])
print('')
print('Response 6:')
print(qc['edited_text'].iloc[5])
print('Age:')
print(qc['age'].iloc[5])
print('Gender:')
print(qc['gender'].iloc[5])
print('')
print('Response 7:')
print(qc['edited_text'].iloc[6])
print('Age:')
print(qc['age'].iloc[6])
print('Gender:')
print(qc['gender'].iloc[6])

Response 1:
The only thing I got from college was a caffeine addiction 😏
Age:
18-24
Gender:
Female

Response 2:
The only thing I got from college is a caffeine addiction 😭
Age:
18-24
Gender:
Male

Response 3:
The only thing I got from college is a caffeine addiction 😂
Age:
25-34
Gender:
I don't want to say

Response 4:
The only thing I got from college is a caffeine addiction 🤠
Age:
18-24
Gender:
Female

Response 5:
The only thing I got from college is a caffeine addiction😭
Age:
18-24
Gender:
Female

Response 6:
The only thing I got from college is a caffeine addiction 😵‍💫😭😞🫠
Age:
18-24
Gender:
Male

Response 7:
The only thing I got from college is a caffeine addiction 😂
Age:
18-24
Gender:
Female


In [43]:
#Search for columns that contain the first QC question
phrase = 'business casual attire'
qc = df_survey[df_survey['original_text'].str.contains(phrase, case=False)]

# Print the rows where the phrase was found
qc.head()

Unnamed: 0,participant_no,over_18,use_emojis,gender,age,original_text,class_survey,edited_text
555,59,Yes,Yes,Female,25-34,was not aware that Crocs were appropriate busi...,It is sarcastic,was not aware that Crocs were appropriate busi...
754,80,Yes,Yes,Female,35-44,was not aware that Crocs were appropriate busi...,It is sarcastic,was not aware that Crocs were appropriate busi...
764,81,Yes,Yes,Female,35-44,was not aware that Crocs were appropriate busi...,It is sarcastic,was not aware that Crocs were appropriate busi...
774,82,Yes,Yes,Female,25-34,was not aware that Crocs were appropriate busi...,It is sarcastic,was not aware that Crocs were appropriate busi...
784,83,Yes,Yes,Female,45-64,was not aware that Crocs were appropriate busi...,It is sarcastic,was not aware that Crocs were appropriate busi...


In [44]:
#Breakdown of responses for this question
qc['class_survey'].value_counts()

class_survey
It is sarcastic    7
Name: count, dtype: int64

In [45]:
#Print the responses
print('Response 1:')
print(qc['edited_text'].iloc[0])
print('Age:')
print(qc['age'].iloc[0])
print('Gender:')
print(qc['gender'].iloc[0])
print('')
print('Response 2:')
print(qc['edited_text'].iloc[1])
print('Age:')
print(qc['age'].iloc[1])
print('Gender:')
print(qc['gender'].iloc[1])
print('')
print('Response 3:')
print(qc['edited_text'].iloc[2])
print('Age:')
print(qc['age'].iloc[2])
print('Gender:')
print(qc['gender'].iloc[2])
print('')
print('Response 4:')
print(qc['edited_text'].iloc[3])
print('Age:')
print(qc['age'].iloc[3])
print('Gender:')
print(qc['gender'].iloc[3])
print('')
print('Response 5:')
print(qc['edited_text'].iloc[4])
print('Age:')
print(qc['age'].iloc[4])
print('Gender:')
print(qc['gender'].iloc[4])
print('')
print('Response 6:')
print(qc['edited_text'].iloc[5])
print('Age:')
print(qc['age'].iloc[5])
print('Gender:')
print(qc['gender'].iloc[5])
print('')
print('Response 7:')
print(qc['edited_text'].iloc[6])
print('Age:')
print(qc['age'].iloc[6])
print('Gender:')
print(qc['gender'].iloc[6])

Response 1:
was not aware that Crocs were appropriate business casual attire.
Age:
25-34
Gender:
Female

Response 2:
was not aware that Crocs were appropriate business casual attire. 😒
Age:
35-44
Gender:
Female

Response 3:
was not aware that Crocs were appropriate business casual attire. 😅
Age:
35-44
Gender:
Female

Response 4:
was not aware that Crocs were appropriate business casual attire.😂
Age:
25-34
Gender:
Female

Response 5:
was not aware that Crocs were appropriate business casual attire. 🙄
Age:
45-64
Gender:
Female

Response 6:
was not aware that Crocs were appropriate business casual attire. 🙄
Age:
25-34
Gender:
Female

Response 7:
was not aware that Crocs were appropriate business casual attire.🤔
Age:
25-34
Gender:
Female


In [46]:
#Search for columns that contain the first QC question
phrase = 'villains in super hero films'
qc = df_survey[df_survey['original_text'].str.contains(phrase, case=False)]

# Print the rows where the phrase was found
qc.head()

Unnamed: 0,participant_no,over_18,use_emojis,gender,age,original_text,class_survey,edited_text
60,7,Yes,Yes,Female,35-44,The villains in super hero films are awfully p...,It is sarcastic,The villains in super hero films are awfully p...
71,8,Yes,Yes,Female,25-34,The villains in super hero films are awfully p...,It is sarcastic,The villains in super hero films are awfully p...
81,9,Yes,Yes,Female,25-34,The villains in super hero films are awfully p...,It is sarcastic,The villains in super hero films are awfully p...
91,10,Yes,Yes,Other,25-34,The villains in super hero films are awfully p...,It is not sarcastic,The villains in super hero films are awfully p...
101,11,Yes,Yes,Female,25-34,The villains in super hero films are awfully p...,It is sarcastic,The villains in super hero films are awfully p...


In [47]:
#Breakdown of responses for this question
qc['class_survey'].value_counts()

class_survey
It is sarcastic        16
It is not sarcastic     2
I don't know            1
Name: count, dtype: int64

In [48]:
#Print the sarcastic responses
subset_qc = qc[qc['class_survey'] == 'It is sarcastic']

print('Response 1:')
print(subset_qc['edited_text'].iloc[0])
print('Age:')
print(subset_qc['age'].iloc[0])
print('Gender:')
print(subset_qc['gender'].iloc[0])
print('')
print('Response 2:')
print(subset_qc['edited_text'].iloc[1])
print('Age:')
print(subset_qc['age'].iloc[1])
print('Gender:')
print(subset_qc['gender'].iloc[1])
print('')
print('Response 3:')
print(subset_qc['edited_text'].iloc[2])
print('Age:')
print(subset_qc['age'].iloc[2])
print('Gender:')
print(subset_qc['gender'].iloc[2])
print('')
print('Response 4:')
print(subset_qc['edited_text'].iloc[3])
print('Age:')
print(subset_qc['age'].iloc[3])
print('Gender:')
print(subset_qc['gender'].iloc[3])
print('')
print('Response 5:')
print(subset_qc['edited_text'].iloc[4])
print('Age:')
print(subset_qc['age'].iloc[4])
print('Gender:')
print(subset_qc['gender'].iloc[4])
print('')
print('Response 6:')
print(subset_qc['edited_text'].iloc[5])
print('Age:')
print(subset_qc['age'].iloc[5])
print('Gender:')
print(subset_qc['gender'].iloc[5])
print('')
print('Response 7:')
print(subset_qc['edited_text'].iloc[6])
print('Age:')
print(subset_qc['age'].iloc[6])
print('Gender:')
print(subset_qc['gender'].iloc[6])
print('Response 8:')
print(subset_qc['edited_text'].iloc[7])
print('Age:')
print(subset_qc['age'].iloc[7])
print('Gender:')
print(subset_qc['gender'].iloc[7])
print('')
print('Response 9:')
print(subset_qc['edited_text'].iloc[8])
print('Age:')
print(subset_qc['age'].iloc[8])
print('Gender:')
print(subset_qc['gender'].iloc[8])
print('')
print('Response 10:')
print(subset_qc['edited_text'].iloc[9])
print('Age:')
print(subset_qc['age'].iloc[9])
print('Gender:')
print(subset_qc['gender'].iloc[9])
print('')
print('Response 11:')
print(subset_qc['edited_text'].iloc[10])
print('Age:')
print(subset_qc['age'].iloc[10])
print('Gender:')
print(subset_qc['gender'].iloc[10])
print('')
print('Response 12:')
print(subset_qc['edited_text'].iloc[11])
print('Age:')
print(subset_qc['age'].iloc[11])
print('Gender:')
print(subset_qc['gender'].iloc[11])
print('')
print('Response 13:')
print(subset_qc['edited_text'].iloc[12])
print('Age:')
print(subset_qc['age'].iloc[12])
print('Gender:')
print(subset_qc['gender'].iloc[12])
print('')
print('Response 14:')
print(subset_qc['edited_text'].iloc[13])
print('Age:')
print(subset_qc['age'].iloc[13])
print('Gender:')
print(subset_qc['gender'].iloc[13])
print('Response 15:')
print(subset_qc['edited_text'].iloc[14])
print('Age:')
print(subset_qc['age'].iloc[14])
print('Gender:')
print(subset_qc['gender'].iloc[14])
print('')
print('Response 16:')
print(subset_qc['edited_text'].iloc[15])
print('Age:')
print(subset_qc['age'].iloc[15])
print('Gender:')
print(subset_qc['gender'].iloc[15])

Response 1:
The villains in super hero films are awfully polite. They always confine their invasions to being above one particular city
Age:
35-44
Gender:
Female

Response 2:
The villains in super hero films are awfully polite. They always confine their invasions to being above one particular city 😌
Age:
25-34
Gender:
Female

Response 3:
The villains in super hero films are awfully polite. They always confine their invasions to being above one particular city 😉
Age:
25-34
Gender:
Female

Response 4:
The villains in super hero films are awfully polite. They always confine their invasions to being above one particular city 😂
Age:
25-34
Gender:
Female

Response 5:
The villains in super hero films are awfully polite. They always confine their invasions to being above one particular city💀
Age:
35-44
Gender:
Male

Response 6:
The villains in super hero films are awfully polite. They always confine their invasions to being above one particular city 😉
Age:
25-34
Gender:
Female

Response 7:
The

In [49]:
#Print the non sarcastic responses
subset_qc = qc[qc['class_survey'] == 'It is not sarcastic']

print('Response 1:')
print(subset_qc['edited_text'].iloc[0])
print('Age:')
print(subset_qc['age'].iloc[0])
print('Gender:')
print(subset_qc['gender'].iloc[0])
print('')
print('Response 2:')
print(subset_qc['edited_text'].iloc[1])
print('Age:')
print(subset_qc['age'].iloc[1])
print('Gender:')
print(subset_qc['gender'].iloc[1])

Response 1:
The villains in super hero films are awfully polite. They always confine their invasions to being above one particular city 🤔
Age:
25-34
Gender:
Other

Response 2:
The villains in super hero films are awfully polite. They always confine their invasions to being above one particular city😡
Age:
25-34
Gender:
Female


In [50]:
#Print the unknown responses
subset_qc = qc[qc['class_survey'] == "I don't know"]

print('Response 1:')
print(subset_qc['edited_text'].iloc[0])
print('Age:')
print(subset_qc['age'].iloc[0])
print('Gender:')
print(subset_qc['gender'].iloc[0])
print('')

Response 1:
The villains in super hero films are awfully polite. They always confine their invasions to being above one particular city
Age:
18-24
Gender:
Female



In [51]:
#Search for columns that contain the first QC question
phrase = 'season 4 of trump does america'
qc = df_survey[df_survey['original_text'].str.contains(phrase, case=False)]

# Print the rows where the phrase was found
qc.head()

Unnamed: 0,participant_no,over_18,use_emojis,gender,age,original_text,class_survey,edited_text
410,44,Yes,Yes,Male,18-24,Loving season 4 of trump does America. Funnies...,It is sarcastic,Loving season 4 of trump does America. Funnies...
420,45,Yes,Yes,Male,35-44,Loving season 4 of trump does America. Funnies...,It is not sarcastic,Loving season 4 of trump does America. 😜Funnie...
430,46,Yes,Yes,Male,45-64,Loving season 4 of trump does America. Funnies...,It is sarcastic,Loving season 4 of trump does America. Funnies...
440,47,Yes,Yes,Male,25-34,Loving season 4 of trump does America. Funnies...,I don't know,Loving season 4 of trump does America. Funnies...
442,48,Yes,Yes,Female,25-34,Loving season 4 of trump does America. Funnies...,I don't know,Loving season 4 of trump does America. Funnies...


In [52]:
#Breakdown of responses for this question
qc['class_survey'].value_counts()

class_survey
It is sarcastic        2
It is not sarcastic    2
I don't know           2
Name: count, dtype: int64

In [53]:
#Print the sarcastic responses
subset_qc = qc[qc['class_survey'] == 'It is sarcastic']
print('Sarcastic Responses:')
print('Response 1:')
print(subset_qc['edited_text'].iloc[0])
print('Age:')
print(subset_qc['age'].iloc[0])
print('Gender:')
print(subset_qc['gender'].iloc[0])
print('')
print('Response 2:')
print(subset_qc['edited_text'].iloc[1])
print('Age:')
print(subset_qc['age'].iloc[1])
print('Gender:')
print(subset_qc['gender'].iloc[1])

#Print non sarcastic responses
subset_qc = qc[qc['class_survey'] == 'It is not sarcastic']
print('Non-Sarcastic Responses:')
print('Response 1:')
print(subset_qc['edited_text'].iloc[0])
print('Age:')
print(subset_qc['age'].iloc[0])
print('Gender:')
print(subset_qc['gender'].iloc[0])
print('')
print('Response 2:')
print(subset_qc['edited_text'].iloc[1])
print('Age:')
print(subset_qc['age'].iloc[1])
print('Gender:')
print(subset_qc['gender'].iloc[1])

#Print unknown sarcastic responses
subset_qc = qc[qc['class_survey'] == "I don't know"]
print('Unknown Classification Responses:')
print('Response 1:')
print(subset_qc['edited_text'].iloc[0])
print('Age:')
print(subset_qc['age'].iloc[0])
print('Gender:')
print(subset_qc['gender'].iloc[0])
print('')
print('Response 2:')
print(subset_qc['edited_text'].iloc[1])
print('Age:')
print(subset_qc['age'].iloc[1])
print('Gender:')
print(subset_qc['gender'].iloc[1])

Sarcastic Responses:
Response 1:
Loving season 4 of trump does America. Funniest season yet #DonaldTrump #Trump #MAGA #MAGA2020 🤡🤡
Age:
18-24
Gender:
Male

Response 2:
Loving season 4 of trump does America. Funniest season yet #DonaldTrump #Trump #MAGA #MAGA2020 😉
Age:
45-64
Gender:
Male
Non-Sarcastic Responses:
Response 1:
Loving season 4 of trump does America. 😜Funniest season yet 😂 #DonaldTrump #Trump #MAGA #MAGA2020
Age:
35-44
Gender:
Male

Response 2:
Loving season 4 of trump does America. Funniest season yet 🤣🙏🇺🇸🇺🇸🇺🇸#DonaldTrump #Trump #MAGA #MAGA2020
Age:
18-24
Gender:
Female
Unknown Classification Responses:
Response 1:
Loving season 4 of trump does America. Funniest season yet 🤣 #DonaldTrump #Trump #MAGA #MAGA2020
Age:
25-34
Gender:
Male

Response 2:
Loving season 4 of trump does America. Funniest season yet #DonaldTrump #Trump #MAGA #MAGA2020
Age:
25-34
Gender:
Female


In [54]:
#Search for columns that contain the first QC question
phrase = 'thank you, science'
qc = df_survey[df_survey['original_text'].str.contains(phrase, case=False)]

# Print the rows where the phrase was found
qc.head()

Unnamed: 0,participant_no,over_18,use_emojis,gender,age,original_text,class_survey,edited_text
255,27,Yes,Yes,Male,25-34,"Vaccine dose 1. Thank you, science.",I don't know,"Vaccine dose 1. Thank you, science."
283,30,Yes,Yes,Male,18-24,"Vaccine dose 1. Thank you, science.",It is not sarcastic,"Vaccine dose 1. Thank you, science.👍"
311,33,Yes,Yes,Female,35-44,"Vaccine dose 1. Thank you, science.",It is sarcastic,"Vaccine dose 1. Thank you, science. 💀"
339,36,Yes,Yes,Female,25-34,"Vaccine dose 1. Thank you, science.",It is sarcastic,"Vaccine dose 1. Thank you, science. 😜"
367,39,Yes,Yes,Male,18-24,"Vaccine dose 1. Thank you, science.",It is not sarcastic,"Vaccine dose 1. Thank you, science.👍"


In [55]:
#Breakdown of responses for this question
qc['class_survey'].value_counts()

class_survey
It is not sarcastic    3
It is sarcastic        2
I don't know           1
Name: count, dtype: int64

In [56]:
#Print the sarcastic responses
subset_qc = qc[qc['class_survey'] == 'It is sarcastic']
print('Sarcastic Responses:')
print('Response 1:')
print(subset_qc['edited_text'].iloc[0])
print('Age:')
print(subset_qc['age'].iloc[0])
print('Gender:')
print(subset_qc['gender'].iloc[0])
print('')
print('Response 2:')
print(subset_qc['edited_text'].iloc[1])
print('Age:')
print(subset_qc['age'].iloc[1])
print('Gender:')
print(subset_qc['gender'].iloc[1])

#Print non sarcastic responses
subset_qc = qc[qc['class_survey'] == 'It is not sarcastic']
print('Non-Sarcastic Responses:')
print('Response 1:')
print(subset_qc['edited_text'].iloc[0])
print('Age:')
print(subset_qc['age'].iloc[0])
print('Gender:')
print(subset_qc['gender'].iloc[0])
print('')
print('Response 2:')
print(subset_qc['edited_text'].iloc[1])
print('Age:')
print(subset_qc['age'].iloc[1])
print('Gender:')
print(subset_qc['gender'].iloc[1])
print('Response 3:')
print(subset_qc['edited_text'].iloc[2])
print('Age:')
print(subset_qc['age'].iloc[2])
print('Gender:')
print(subset_qc['gender'].iloc[2])

#Print unknown sarcastic responses
subset_qc = qc[qc['class_survey'] == "I don't know"]
print('Unknown Classification Responses:')
print('Response 1:')
print(subset_qc['edited_text'].iloc[0])
print('Age:')
print(subset_qc['age'].iloc[0])
print('Gender:')
print(subset_qc['gender'].iloc[0])

Sarcastic Responses:
Response 1:
Vaccine dose 1. Thank you, science. 💀
Age:
35-44
Gender:
Female

Response 2:
Vaccine dose 1. Thank you, science. 😜
Age:
25-34
Gender:
Female
Non-Sarcastic Responses:
Response 1:
Vaccine dose 1. Thank you, science.👍
Age:
18-24
Gender:
Male

Response 2:
Vaccine dose 1. Thank you, science.👍
Age:
18-24
Gender:
Male
Response 3:
Vaccine dose 1. Thank you, science. 💛
Age:
18-24
Gender:
Male
Unknown Classification Responses:
Response 1:
Vaccine dose 1. Thank you, science.
Age:
25-34
Gender:
Male


# Emoji use frequency

In [63]:
#Importing libraries
import emoji
from statistics import mean
import demoji

In [57]:
#Make a subsets based on classifications
sarc = df_survey[df_survey['class_survey'] == 'It is sarcastic']
nonsarc = df_survey[df_survey['class_survey'] == 'It is not sarcastic']
idk = df_survey[df_survey['class_survey'] == "I don't know"]

In [64]:
#Determine the average number of emojis used in a sarcastic tweet

#Make a list to hold number of emojis per tweet after editing
emojis_per_string = []

#Determin the number of emojis per tweet
for text in sarc['edited_text']:
    emoji_count = len(demoji.findall(text))
    emojis_per_string.append(emoji_count)

#Average over the df
average_emojis_per_string = mean(emojis_per_string)

#Print result
print("Average emojis per string:", average_emojis_per_string)

Average emojis per string: 0.9797101449275363


In [65]:
#Determine the average number of emojis used in a non sarcastic tweet

#Make a list to hold number of emojis per tweet after editing
emojis_per_string = []

#Determin the number of emojis per tweet
for text in nonsarc['edited_text']:
    emoji_count = len(demoji.findall(text))
    emojis_per_string.append(emoji_count)

#Average over the df
average_emojis_per_string = mean(emojis_per_string)

#Print result
print("Average emojis per string:", average_emojis_per_string)

Average emojis per string: 0.6072423398328691


In [66]:
#Determine the average number of emojis used in a tweet of unknown classification

#Make a list to hold number of emojis per tweet after editing
emojis_per_string = []

#Determin the number of emojis per tweet
for text in idk['edited_text']:
    emoji_count = len(demoji.findall(text))
    emojis_per_string.append(emoji_count)

#Average over the df
average_emojis_per_string = mean(emojis_per_string)

#Print result
print("Average emojis per string:", average_emojis_per_string)

Average emojis per string: 0.296875


# Emoji position in text

In [70]:
#Determine the average position of an emoji in a sarcastic tweet

emoji_positions = []

for text in sarc['edited_text']:
    emojis = demoji.findall(text)
    if emojis:
        emoji = list(emojis.keys())[0]
        emoji_position = text.find(emoji) / len(text)  # Normalize position to a range between 0 and 1
        emoji_positions.append(emoji_position)
    else:
        emoji_positions.append(1.0)  # Append 1.0 if no emoji is found

average_emoji_position = mean(emoji_positions)

print("Average position of emoji:", average_emoji_position)

Average position of emoji: 0.949034045067618


In [71]:
#Determine the average position of an emoji in a non sarcastic tweet

emoji_positions = []

for text in nonsarc['edited_text']:
    emojis = demoji.findall(text)
    if emojis:
        emoji = list(emojis.keys())[0]
        emoji_position = text.find(emoji) / len(text)  # Normalize position to a range between 0 and 1
        emoji_positions.append(emoji_position)
    else:
        emoji_positions.append(1.0)  # Append 1.0 if no emoji is found

average_emoji_position = mean(emoji_positions)

print("Average position of emoji:", average_emoji_position)

Average position of emoji: 0.9708414894280922


In [72]:
#Determine the average position of an emoji in a tweet of unknown classification

emoji_positions = []

for text in idk['edited_text']:
    emojis = demoji.findall(text)
    if emojis:
        emoji = list(emojis.keys())[0]
        emoji_position = text.find(emoji) / len(text)  # Normalize position to a range between 0 and 1
        emoji_positions.append(emoji_position)
    else:
        emoji_positions.append(1.0)  # Append 1.0 if no emoji is found

average_emoji_position = mean(emoji_positions)

print("Average position of emoji:", average_emoji_position)

Average position of emoji: 0.9757688171762811


# Sentiment polarity