# Basic initial cleaning

In [60]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import emoji
import scipy.stats as stats

In [3]:
#Importing data
df = pd.read_csv("isarcasm2022.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0


In [4]:
#Remove columns that are no longer needed
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df.shape

(3468, 9)

In [9]:
#Check for null values
df.isnull().sum()

tweet                     1
sarcastic                 0
rephrase               2601
sarcasm                2601
irony                  2601
satire                 2601
understatement         2601
overstatement          2601
rhetorical_question    2601
dtype: int64

In [10]:
#Remove rows where tweet column is empty
df.dropna(subset=['tweet'], inplace=True)

In [11]:
#Check number of words in the tweets
data_len = df['tweet'].apply(lambda x: len(x.split(' '))).sum()
print(f'{data_len} words')

65266 words


In [14]:
#Remove rows containing sarcasm type for now
df = df.drop(columns=['sarcasm', 'irony', 'satire', 'overstatement', 'understatement', 'rhetorical_question'])
df.head()

Unnamed: 0,tweet,sarcastic,rephrase
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring..."
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall..."
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to..."
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...


In [17]:
#Check the unique values in label column
df.sarcastic.unique()

array([1, 0], dtype=int64)

In [19]:
#Check balance of dataset
df['sarcastic'].value_counts()

0    2600
1     867
Name: sarcastic, dtype: int64

# Extraction Statistical Features - Pragmatic Features

The following work aims to extract features from the tweets which may be relevant to determine the presence of sarcasm in text. Features in the text which will be extracted include the frequencies of structural features of the text such as capitalized words, capitalisation within words, user mentions, hashtags, markers of laughter, punctuation, emoji frequency, and linguistic features including the presence of vocabulary classed as an intensifiers, interjectors, negators or affirmators. These features are each postulated to provide indicators of pragmatics within the text and thus may be able to provide insight into how sarcasm is detected within text.

In [22]:
#Capitalised words

#Define function to count capitalised words
def count_capital_words(text):
    words = text.split()
    capital_words_count = sum(1 for word in words if word[0].isupper())
    return capital_words_count

#Apply to tweets column
df['capitalised_words'] = df['tweet'].apply(count_capital_words)
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,capitalised_words
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",2
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,3
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",4
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",2
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,5


In [23]:
#User mentions

#Define function to count user mentions
def count_user_mentions(text):
    words = text.split()
    user_mentions_count = sum(1 for word in words if word.startswith('@'))
    return user_mentions_count

#Apply to tweets column
df['user_mentions'] = df['tweet'].apply(count_user_mentions)
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,capitalised_words,user_mentions
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",2,0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,3,0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",4,1
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",2,0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,5,3


In [24]:
#Hashtags

#Define function to count hashtags
def count_hashtags(text):
    words = text.split()
    hashtag_count = sum(1 for word in words if word.startswith('#'))
    return hashtag_count

#Apply to tweets column
df['hashtags'] = df['tweet'].apply(count_hashtags)
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,capitalised_words,user_mentions,hashtags
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",2,0,0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,3,0,0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",4,1,0
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",2,0,0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,5,3,0


In [27]:
#Laughter markers

#Define the function to count laughter occurrences
def count_laughter(text):
    words = text.split()
    laugh_count = sum(1 for word in words if word.startswith("haha") or re.match('l(o)+l$', word, re.IGNORECASE))
    return laugh_count

#Apply to tweets column
df['laughter'] = df['tweet'].apply(count_laughter)
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,capitalised_words,user_mentions,hashtags,laughter
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",2,0,0,0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,3,0,0,0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",4,1,0,0
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",2,0,0,0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,5,3,0,0


In [29]:
#Punctuation

#Make a list of potentially relevant punctuation 
#Commas, quotation marks single colons etc as these are unlikely to be relevant for the problem set
#Only what may be relevant is added to the list
punctuation = ["?", "!", "..."]

#Define the function to count relevant punctuation
def count_punctuation(text, punctuation):
    words = text.split()
    punctuation_count = sum(text.count(p) for p in punctuation)
    return punctuation_count
    
#Apply to tweets column
df['punctuation'] = df['tweet'].apply(lambda x: count_punctuation(x, punctuation))
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,capitalised_words,user_mentions,hashtags,laughter,punctuation
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",2,0,0,0,0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,3,0,0,0,0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",4,1,0,0,2
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",2,0,0,0,0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,5,3,0,0,0


In [50]:
#Affermatives

#Make list of strong affermatives
strong_affermatives = ["yes", "yeah", "always", "all", "any", "every", "everybody", "everywhere", "ever"]

#Define the function to count strong affermatives
def count_affermatives(text, strong_affermatives):
    words = text.split()
    affermatives_count = sum(text.count(n) for n in strong_affermatives)
    return affermatives_count
    
#Apply to tweets column
df['affermatives'] = df['tweet'].apply(lambda x: count_affermatives(x, strong_affermatives))
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,capitalised_words,user_mentions,hashtags,laughter,punctuation,negations,intensifiers,interjections,affermatives
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",2,0,0,0,0,0,0,0,0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,3,0,0,0,0,0,0,2,2
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",4,1,0,0,2,0,0,0,1
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",2,0,0,0,0,2,0,0,0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,5,3,0,0,0,2,1,0,0


In [47]:
#Negations

#Make list of strong negations
strong_negations = ["no", "not", "never", "none", "n't", "nothing", "neither", "nobody", "nowhere"]

#Define the function to count strong negations
def count_negation(text, strong_negations):
    words = text.split()
    negations_count = sum(text.count(n) for n in strong_negations)
    return negations_count
    
#Apply to tweets column
df['negations'] = df['tweet'].apply(lambda x: count_negation(x, strong_negations))
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,capitalised_words,user_mentions,hashtags,laughter,punctuation,negations
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",2,0,0,0,0,0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,3,0,0,0,0,0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",4,1,0,0,2,0
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",2,0,0,0,0,2
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,5,3,0,0,0,2


In [48]:
#Intensifiers

#Make a list of intensifiers
intensifiers = ["amazingly", "astoundingly", "awful", "bare", "bloody", "crazy", "dreadfully",
                "colossally", "especially", "exceptionally", "excessively", "extremely",
                "extraordinarily", "fantastically", "frightfully", "fucking", "fully", "hella",
                "holy", "incredibly", "insanely", "literally", "mightily", "moderately", "most",
                "outrageously", "phenomenally", "precious", "quite", "radically", "rather",
                "really", "remarkably", "right", "sick", "strikingly", "super", "supremely",
                "surprisingly", "terribly", "terrifically", "too", "totally", "uncommonly",
                "unusually", "veritable", "very", "wicked"]

#Define the function to count intensifiers
def count_intensifiers(text, intensifiers):
    words = text.split()
    intensifier_count = sum(text.count(n) for n in intensifiers)
    return intensifier_count
    
#Apply to tweets column
df['intensifiers'] = df['tweet'].apply(lambda x: count_intensifiers(x, intensifiers))
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,capitalised_words,user_mentions,hashtags,laughter,punctuation,negations,intensifiers
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",2,0,0,0,0,0,0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,3,0,0,0,0,0,0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",4,1,0,0,2,0,0
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",2,0,0,0,0,2,0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,5,3,0,0,0,2,1


In [49]:
#Interjections

#Make a list of interjections
interjections = ["oh", "hey", "wow", "aha", "aham", "aw", "bam", "blah", "bingo", "boo", "bravo",
                 "cheers", "congratulations", "congrats", "duh", "eh", "gee", "gosh", "hey", "hmm",
                 "huh", "hurray", "oh", "oh dear", "oh my", "oh well", "oops", "ouch", "ow", "phew",
                 "shh", "uh", "uh-huh", "mhm", "ugh", "well", "wow", "woah", "yeah", "yep", "yikes", "yo"]

#Define the function to count interjections
def count_interjections(text, interjections):
    words = text.split()
    interjection_count = sum(text.count(n) for n in interjections)
    return interjection_count
    
#Apply to tweets column
df['interjections'] = df['tweet'].apply(lambda x: count_interjections(x, interjections))
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,capitalised_words,user_mentions,hashtags,laughter,punctuation,negations,intensifiers,interjections
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",2,0,0,0,0,0,0,0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,3,0,0,0,0,0,0,2
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",4,1,0,0,2,0,0,0
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",2,0,0,0,0,2,0,0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,5,3,0,0,0,2,1,0


In [56]:
#Emojis

#Define the function to count emojis
def count_emojis(text):
    text_without_emojis = emoji.demojize(text)
    emojis_count = text.count(":")
    return emojis_count

#Apply to tweets column
df['emoji'] = df['tweet'].apply(count_emojis)
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,capitalised_words,user_mentions,hashtags,laughter,punctuation,negations,intensifiers,interjections,affermatives,midword_capitalisation,emoji
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",2,0,0,0,0,0,0,0,0,0,0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,3,0,0,0,0,0,0,2,2,0,0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",4,1,0,0,2,0,0,0,1,1,0
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",2,0,0,0,0,2,0,0,0,0,0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,5,3,0,0,0,2,1,0,0,1,0


In [55]:
#Capitalisations in the middle of words

#Define the function to count capitalisation in the middle of words
def count_midword_capitalisation(text):
    words = text.split()
    midword_capitalisation_count = sum(1 for word in words if any(char.isupper() for char in word[1:]))
    return midword_capitalisation_count

#Apply to tweets column
df['midword_capitalisation'] = df['tweet'].apply(count_midword_capitalisation)
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,capitalised_words,user_mentions,hashtags,laughter,punctuation,negations,intensifiers,interjections,affermatives,midword_capitalisation
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",2,0,0,0,0,0,0,0,0,0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,3,0,0,0,0,0,0,2,2,0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",4,1,0,0,2,0,0,0,1,1
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",2,0,0,0,0,2,0,0,0,0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,5,3,0,0,0,2,1,0,0,1


In [58]:
#Length of tweets
df['tweet_length'] = df['tweet'].apply(lambda x: len(x.split()))
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,capitalised_words,user_mentions,hashtags,laughter,punctuation,negations,intensifiers,interjections,affermatives,midword_capitalisation,emoji,tweet_length
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",2,0,0,0,0,0,0,0,0,0,0,11
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,3,0,0,0,0,0,0,2,2,0,0,26
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",4,1,0,0,2,0,0,0,1,1,0,36
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",2,0,0,0,0,2,0,0,0,0,0,17
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,5,3,0,0,0,2,1,0,0,1,0,29


# Statistical analysis of pragmatic features

Following the extraction of a wide range of features from the tweets in the dataset, work will be carried out to further understand which of these features have statistically significant differences between sarcastic and non-sarcastic tweets. 

In [62]:
#Decide if I should do parametric/non-parametric test
#Check if data is normally distributed via Shapiro Wilk test

#Create subsets
sarcastic_tweets = df[df['sarcastic'] == 1]
nonsarcastic_tweets = df[df['sarcastic'] == 0]

#Make a list of columns to iterate through
columns = ['capitalised_words', 'user_mentions', 'hashtags', 'laughter', 'punctuation', 'negations', 'intensifiers',
           'interjections', 'affermatives', 'midword_capitalisation', 'emoji', 'tweet_length']

#Set test to perform at 95% confidence
alpha = 0.05

# Perform the t-test for each column
for column in columns:
    t_stat, p_value = stats.ttest_ind(sarcastic_tweets[column], nonsarcastic_tweets[column], equal_var=False)
    print(f"Column: {column}")
    print("T-statistic:", t_stat)
    print("P-value:", p_value)
    if p_value > alpha:
        print("Fail to reject the null hypothesis. The data is probably normally distributed.")
    else:
        print("Reject the null hypothesis. The data is probably not noramlly distributed.")

Column: capitalised_words
T-statistic: -1.9971356921076624
P-value: 0.045986307172021255
Reject the null hypothesis. The data is probably not noramlly distributed.
Column: user_mentions
T-statistic: 1.3040958369470357
P-value: 0.19241657171667584
Fail to reject the null hypothesis. The data is probably normally distributed.
Column: hashtags
T-statistic: -1.8136318643062843
P-value: 0.06990234608595593
Fail to reject the null hypothesis. The data is probably normally distributed.
Column: laughter
T-statistic: 2.947156998427032
P-value: 0.003285644147078121
Reject the null hypothesis. The data is probably not noramlly distributed.
Column: punctuation
T-statistic: 2.3829115375149326
P-value: 0.017318796050705982
Reject the null hypothesis. The data is probably not noramlly distributed.
Column: negations
T-statistic: -0.15948991549326813
P-value: 0.8733038975428206
Fail to reject the null hypothesis. The data is probably normally distributed.
Column: intensifiers
T-statistic: -0.8324921258