###This notebook:
+ feature analysis

###Check requirements

In [None]:
! python --version

In [None]:
!pip install --upgrade pip

In [None]:
!pip install tensorflow_hub
!pip install keras tf-models-official pydot graphviz

In [None]:
pip install contractions

In [None]:
pip install emojis

In [None]:
import nltk
nltk.download('punkt')

In [None]:
nltk.download('averaged_perceptron_tagger')

In [None]:
import os

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub

from keras.utils import np_utils

import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization as tokenization

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

1 Physical GPUs, 1 Logical GPUs
Version:  2.5.0
Eager mode:  True
Hub version:  0.12.0
GPU is available


Correlation matrix


+ -1 indicates a perfectly negative linear correlation between two variables
+ 0 indicates no linear correlation between two variables
+ 1 indicates a perfectly positive linear correlation between two variables

###Load data

In [None]:
# Load train data
train_path = '/content/drive/MyDrive/TeamLab/data/semeval_taskA_corrected.csv'

df_train = pd.read_csv(train_path, header=0, names=['index',
                                                    'irony_label',
                                                    'tweet'])
                                                

In [None]:
df_train.head()

Unnamed: 0,index,irony_label,tweet
0,1,1,Sweet United Nations video. Just in time for C...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...
3,4,0,3 episodes left I'm dying over here
4,5,1,I can't breathe! was chosen as the most notabl...


In [None]:
# Check if dataset is balanced

# Classes are 1 and 0. Tweet can either be ironic or non-ironic -> binary classification
classes = df_train.irony_label.unique()

print((df_train.irony_label == 0).sum())
print((df_train.irony_label == 1).sum())

# => Balanced

1923
1911


In [None]:
# Load test data
test_path = '/content/drive/MyDrive/TeamLab/data/semeval_taskA_test.csv'

df_test = pd.read_csv(test_path, sep='\t', header=0, names=['index',
                                                            'irony_label',
                                                            'tweet'])

print((df_test.irony_label == 0).sum())
print((df_test.irony_label == 1).sum())

df_test.head()

473
311


Unnamed: 0,index,irony_label,tweet
0,1,0,@Callisto1947 Can U Help?||More conservatives ...
1,2,1,"Just walked in to #Starbucks and asked for a ""..."
2,3,0,#NOT GONNA WIN http://t.co/Mc9ebqjAqj
3,4,0,@mickymantell He is exactly that sort of perso...
4,5,1,So much #sarcasm at work mate 10/10 #boring 10...


In [None]:
x_train = df_train['tweet'].to_numpy()
y_train = df_train['irony_label'].to_numpy()

x_test = df_test['tweet'].to_numpy()
y_test = df_test['irony_label'].to_numpy()

In [None]:
x_train

array(['Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion  http://t.co/fej2v3OUBR',
       "@mrdahl87 We are rumored to have talked to Erv's agent... and the Angels asked about Ed Escobar... that's hardly nothing    ;)",
       'Hey there! Nice to see you Minnesota/ND Winter Weather', ...,
       "well now that i've listened to all of into the woods, i'm going to listen to some FOB #nosurprisethere",
       'Hummingbirds #Are  #Experts #at #Hovering #After #All: #Background #Motion ...: If the hovering ability of the... http://t.co/E189iHBpZr',
       'Only thing missing now is a session at the gym... Want to do every body part though and CV!  #possible'],
      dtype=object)

###Normalisation functions

In [None]:
import emojis
import contractions
import re

def normalise_for_allcaps(tweet):

    norm_tweet = re.sub("http:.*", "", tweet)
    norm_tweet = re.sub("@[a-zA-Z0-9]+", "", norm_tweet)
    norm_tweet = re.sub(r"[#@!()/_;:{}=~|,\-\.\?\[\]]", "", norm_tweet)
    norm_tweet = re.sub("'s", "", norm_tweet)

    norm_tweet = emojis.decode(norm_tweet)
    norm_tweet = re.sub(":\s?\w{3,}\s?:", "", norm_tweet)
    norm_tweet = re.sub("\d+", "", norm_tweet)
    norm_tweet = contractions.fix(norm_tweet)
        
    return norm_tweet.strip()


def normalise_for_pol(tweet):

    norm_tweet = re.sub("http:.*", "", tweet) 
    norm_tweet = re.sub("'s", "", norm_tweet)
    norm_tweet = re.sub("@[a-zA-Z0-9]+", "", norm_tweet)

    norm_tweet = emojis.decode(norm_tweet)
    norm_tweet = re.sub(r"[#@!()/_;:{}=~|,\-\.\?\[\]]", "", norm_tweet)
    norm_tweet = re.sub("\d+", "", norm_tweet)
        
    return norm_tweet.strip()

def normalise_for_pos(tweet):

    norm_tweet = re.sub("http:.*", "", tweet)
    norm_tweet = re.sub("@[a-zA-Z0-9]+", "", norm_tweet)
    norm_tweet = re.sub(r"[#@!()/_;:{}=~|,\-\.\?\[\]]", "", norm_tweet)
    norm_tweet = re.sub("'s", "", norm_tweet)

    norm_tweet = emojis.decode(norm_tweet)
    norm_tweet = re.sub(":\s?\w{3,}\s?:", "", norm_tweet)
    norm_tweet = re.sub("\d+", "", norm_tweet)
    norm_tweet = contractions.fix(norm_tweet)
    norm_tweet = norm_tweet.lower()

    return norm_tweet.strip()

def normalise_for_punct(tweet):

    norm_tweet = re.sub("http:.*", "", tweet)
    norm_tweet = re.sub("@[a-zA-Z0-9]+", "", norm_tweet)
    norm_tweet = re.sub(r"[#@()/_;:{}=~|,\-\.\[\]]", "", norm_tweet)
    norm_tweet = re.sub("'s", "", norm_tweet)

    norm_tweet = emojis.decode(norm_tweet)
    norm_tweet = re.sub(":\s?\w{3,}\s?:", "", norm_tweet)
    norm_tweet = re.sub("\d+", "", norm_tweet)
    norm_tweet = contractions.fix(norm_tweet)
    norm_tweet = norm_tweet.lower()

    return norm_tweet.strip()


##Statistics

###All caps

In [None]:
# create new x_train with normalised tweets
x_train_mod1 = []
for tweet in x_train:
    x_train_mod1.append(normalise_for_allcaps(tweet))

In [None]:
# Takes a dataset as input and returns a list of 1s and 0s,
# indicating which index correspond to a tweet that contains the feature
def get_all_caps(dataset):
    all_caps = []

    for tweet in dataset:
        upper_w = 0
        n_words = len(tweet.split())
        for word in tweet.split():
            if word == 'I':
                n_words +- 1
            if word != 'I' and word.isupper():
                upper_w += 1

        if upper_w >= round(n_words*80/100):
            all_caps.append(1)
        else:
            all_caps.append(0)
            
    return all_caps

In [None]:
all_caps = get_all_caps(x_train_mod1)

# Number of tweets with feature all_caps
all_caps.count(1)

52

In [None]:
len(all_caps) == len(y_train)

True

In [None]:
# create new feature dataset
df_feat = df_train.copy()

# add column to feature dataset
df_feat['all_caps'] = all_caps
df_feat.head()

Unnamed: 0,index,irony_label,tweet,all_caps
0,1,1,Sweet United Nations video. Just in time for C...,0
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0
3,4,0,3 episodes left I'm dying over here,0
4,5,1,I can't breathe! was chosen as the most notabl...,0


In [None]:
# calculate correlation between the irony labels and the feature
# -1 indicates a perfectly negative linear correlation between two variables
# 0 indicates no linear correlation between two variables
# 1 indicates a perfectly positive linear correlation between two variables

df_feat[['irony_label', 'all_caps']].corr()

Unnamed: 0,irony_label,all_caps
irony_label,1.0,-0.062772
all_caps,-0.062772,1.0


In [None]:
# list of 1s and 0s
# irony labels of the tweets that DO contain the feature
corr_list_allcaps = []
for i in range(len(y_train)):
    if all_caps[i] == 1:
        corr_list_allcaps.append(y_train[i])

print(corr_list_allcaps.count(0))
print(corr_list_allcaps.count(1))

40
12


In [None]:
p_ironic_all_caps = round(corr_list_allcaps.count(1)*100/(corr_list_allcaps.count(0)+corr_list_allcaps.count(1)),2)

print("Out of the tweets that contain the feature <all_caps>,", p_ironic_all_caps, "% are ironic")

Out of the tweets that contain the feature <all_caps>, 23.08 % are ironic


In [None]:
# Tweets where both irony label and feature are 1
# (ironic tweets that contain feature)
df_feat.loc[(df_feat['all_caps'] == 1) & (df_feat['irony_label'] == 1)]

Unnamed: 0,index,irony_label,tweet,all_caps
557,558,1,http://t.co/xMzzKi6kn6 << This should be Chris...,1
750,751,1,OMG YES. 7:30 FINALS ON A FRIDAY ARE MY FAVORI...,1
1523,1524,1,@HikeTheGamer: .@PlayStation @Xbox EVERYTHING ...,1
1562,1563,1,@bophiesurch SHE SHOWED YOU!,1
1569,1570,1,MLS Transactions 2015 #MLS http://t.co/hGIAVdL...,1
2500,2501,1,@SocialxClub HOW DARE YOU HAVE FUN AS CHRISTIA...,1
2761,2762,1,http://t.co/N1eVKeTMZ0 celebs takes drastic me...,1
2767,2768,1,"http://t.co/QdYdOaQ48B Because, progress... #A...",1
2885,2886,1,@AdamRubinESPN # SHOCKING,1
3066,3067,1,http://t.co/nzK3dNmOAD Having sooo much fun in...,1


In [None]:
tot_ironic_tweets = len(df_feat.loc[(df_feat['irony_label'] == 1)])
tot_ironic_tweets

1911

In [None]:
num_ironic_tweets_all_caps = len(df_feat.loc[(df_feat['all_caps'] == 1) & 
                                         (df_feat['irony_label'] == 1)])

p_all_caps = round((num_ironic_tweets_all_caps / tot_ironic_tweets * 100), 2)

print("Ironic tweets contain the feature <all_caps>", p_all_caps, "% of the times")

Ironic tweets contain the feature <all_caps> 0.63 % of the times


###Ellipsis

In [None]:
# Takes a dataset as input and returns a list of 1s and 0s,
# indicating which index correspond to a tweet that contains the feature
def get_ellipsis(dataset):
    ellipsis = []

    for tweet in dataset:
        el = int(bool(re.search("\.\.\.+", tweet)))
        if el == 1:
            ellipsis.append(1)
        else:
            ellipsis.append(0)
            
    return ellipsis

In [None]:
ellipsis = get_ellipsis(x_train)

# Number of tweets with feature ellipsis
ellipsis.count(1)

484

In [None]:
# add column to feature dataset
df_feat['ellipsis'] = ellipsis
df_feat.head()

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis
0,1,1,Sweet United Nations video. Just in time for C...,0,0
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,1
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,0
3,4,0,3 episodes left I'm dying over here,0,0
4,5,1,I can't breathe! was chosen as the most notabl...,0,0


In [None]:
# calculate correlation between the irony labels and the feature
# -1 indicates a perfectly negative linear correlation between two variables
# 0 indicates no linear correlation between two variables
# 1 indicates a perfectly positive linear correlation between two variables
df_feat[['irony_label', 'ellipsis']].corr()

Unnamed: 0,irony_label,ellipsis
irony_label,1.0,-0.02237
ellipsis,-0.02237,1.0


In [None]:
# list of 1s and 0s
# irony lables of the tweets that DO contain the feature
corr_list_elip = []
for i in range(len(y_train)):
    if ellipsis[i] == 1:
        corr_list_elip.append(y_train[i])

print(corr_list_elip.count(0))
print(corr_list_elip.count(1))

257
227


In [None]:
p_ironic_ellipsis = round(corr_list_elip.count(1)*100/(corr_list_elip.count(0)+corr_list_elip.count(1)),2)

print("Out of the tweets that contain the feature <ellipsis>,", p_ironic_ellipsis, "% are ironic")

Out of the tweets that contain the feature <ellipsis>, 46.9 % are ironic


In [None]:
# Tweets where both irony label and feature are 1
# (ironic tweets that contain feature)
df_feat.loc[(df_feat['ellipsis'] == 1) & (df_feat['irony_label'] == 1)]

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,1
10,11,1,"Oh, thank GOD - our entire office email system...",0,1
30,31,1,Smh never knew something could be 80% halal......,0,1
36,37,1,"Twig is now ""Sprig""—3 sec limit on new social ...",0,1
42,43,1,"HaHaHaHa!! #hypocrisy ""MT @Independent: Katie...",0,1
...,...,...,...,...,...
3731,3732,1,"@Dodgers Gonna miss @TheRealMattKemp, @FlashGJ...",0,1
3736,3737,1,@nytimes oh so someone got in trouble for maki...,0,1
3738,3739,1,If I carried on the way half the men do on her...,0,1
3769,3770,1,@megynkelly have fun at that...,0,1


In [None]:
num_ironic_tweets_ellipsis = len(df_feat.loc[(df_feat['ellipsis'] == 1) & 
                                             (df_feat['irony_label'] == 1)])

p_ellipsis = round((num_ironic_tweets_ellipsis / tot_ironic_tweets * 100), 2)

print("Ironic tweets contain the feature <ellipsis>", p_ellipsis, "% of the times")

Ironic tweets contain the feature <ellipsis> 11.88 % of the times


###Polarity change

describe how i define pol change

In [None]:
from textblob import TextBlob

In [None]:
# create new x_train with normalised tweets
x_train_mod2 = []
for tweet in x_train:
    x_train_mod2.append(normalise_for_pol(tweet))

In [None]:
# Takes a dataset as input and returns a list of 1s and 0s,
# indicating which index correspond to a tweet that contains the feature
def get_pol_change(dataset):
    polarities = []

    for tweet in dataset:
        pol_of_words = []
        for word in tweet.split():
            x = TextBlob(word)
            pol_of_words.append(x.sentiment.polarity)
        if all(p == 0.0 for p in pol_of_words):
            polarities.append([0.0])
        else:
            inner = []
            for p in pol_of_words:
                if p != 0.0:
                    inner.append(p)
            polarities.append(inner)

    pol_change = []
    idx = 0
    for lst in polarities:
        pos = float(sum(pol > 0 for pol in lst))
        neg = float(sum(pol < 0 for pol in lst))
        if pos != 0.0 and neg != 0.0:
            pol_change.append(1)
        else:
            pol_change.append(0)
            
    return pol_change

In [None]:
pol_change = get_pol_change(x_train_mod2)

In [None]:
print(pol_change.count(0))
print(pol_change.count(1))

3347
487


In [None]:
# add column to feature dataset
df_feat['pol_change'] = pol_change
df_feat.head()

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change
0,1,1,Sweet United Nations video. Just in time for C...,0,0,0
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,1,0
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,0,0
3,4,0,3 episodes left I'm dying over here,0,0,0
4,5,1,I can't breathe! was chosen as the most notabl...,0,0,0


In [None]:
# calculate correlation between the irony labels and the feature
# -1 indicates a perfectly negative linear correlation between two variables
# 0 indicates no linear correlation between two variables
# 1 indicates a perfectly positive linear correlation between two variables
df_feat[['irony_label', 'pol_change']].corr()

Unnamed: 0,irony_label,pol_change
irony_label,1.0,0.039574
pol_change,0.039574,1.0


In [None]:
# list of 1s and 0s
# irony lables of the tweets that DO contain the feature
corr_list_pol = []
for i in range(len(y_train)):
    if pol_change[i] == 1:
        corr_list_pol.append(y_train[i])

print(corr_list_pol.count(0))
print(corr_list_pol.count(1))

219
268


In [None]:
p_ironic_pol_change = round(corr_list_pol.count(1)*100/(corr_list_pol.count(0)+corr_list_pol.count(1)),2)

print("Out of the tweets that contain the feature <polarity_change>,", p_ironic_pol_change, "% are ironic")

Out of the tweets that contain the feature <polarity_change>, 55.03 % are ironic


In [None]:
# Tweets where both irony label and feature are 1
# (ironic tweets that contain feature)
df_feat.loc[(df_feat['pol_change'] == 1) & (df_feat['irony_label'] == 1)]

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change
24,25,1,Love these cold winter mornings 😬 best feeling...,0,0,1
34,35,1,@RedSox you guys sure screwed this up royally....,0,0,1
36,37,1,"Twig is now ""Sprig""—3 sec limit on new social ...",0,1,1
50,51,1,It will be impossible for me to be late if I s...,0,0,1
58,59,1,@TheoCorleone @david_maclellan Shit! I better ...,0,0,1
...,...,...,...,...,...,...
3736,3737,1,@nytimes oh so someone got in trouble for maki...,0,1,1
3748,3749,1,Students too busy protesting a grand jury deci...,0,0,1
3749,3750,1,Why can't heroes be happy in DC Comics? I mean...,0,0,1
3803,3804,1,Feeling like crap. And being treated horribly ...,0,0,1


In [None]:
num_ironic_tweets_pol_change = len(df_feat.loc[(df_feat['pol_change'] == 1) & 
                                               (df_feat['irony_label'] == 1)])

p_pol_change = round((num_ironic_tweets_pol_change / tot_ironic_tweets * 100), 2)

print("Ironic tweets contain the feature <pol_change>", p_pol_change, "% of the times")

Ironic tweets contain the feature <pol_change> 14.02 % of the times


###Laughter

In [None]:
# Takes a dataset as input and returns a list of 1s and 0s,
# indicating which index correspond to a tweet that contains the feature
def get_laugh(dataset):
    laughs = []

    for tweet in dataset:
        laugh = int(bool(re.search(r"\b(a*ha+h[ha]*|o?l+o+l+[ol]*)\b", tweet.lower())))
        if laugh == 1:
            laughs.append(1)
        else:
            laughs.append(0)
    return laughs

In [None]:
laughs = get_laugh(x_train)

In [None]:
print(laughs.count(0))
print(laughs.count(1))

3700
134


In [None]:
# add column to feature dataset
df_feat['laugh'] = laughs
df_feat.head()

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh
0,1,1,Sweet United Nations video. Just in time for C...,0,0,0,0
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,1,0,0
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,0,0,0
3,4,0,3 episodes left I'm dying over here,0,0,0,0
4,5,1,I can't breathe! was chosen as the most notabl...,0,0,0,0


In [None]:
# calculate correlation between the irony labels and the feature
# -1 indicates a perfectly negative linear correlation between two variables
# 0 indicates no linear correlation between two variables
# 1 indicates a perfectly positive linear correlation between two variables
df_feat[['irony_label', 'laugh']].corr()

Unnamed: 0,irony_label,laugh
irony_label,1.0,0.014798
laugh,0.014798,1.0


In [None]:
# list of 1s and 0s
# irony lables of the tweets that DO contain the feature
corr_list_laugh = []
for i in range(len(y_train)):
    if laughs[i] == 1:
        corr_list_laugh.append(y_train[i])

print(corr_list_laugh.count(0))
print(corr_list_laugh.count(1))

62
72


In [None]:
p_ironic_laugh = round(corr_list_laugh.count(1)*100/(corr_list_laugh.count(0)+corr_list_laugh.count(1)),2)

print("Out of the tweets that contain the feature <laugh>,", p_ironic_laugh, "% are ironic")

Out of the tweets that contain the feature <laugh>, 53.73 % are ironic


In [None]:
# Tweets where both irony label and feature are 1
# (ironic tweets that contain feature)
df_feat.loc[(df_feat['laugh'] == 1) & (df_feat['irony_label'] == 1)]

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh
42,43,1,"HaHaHaHa!! #hypocrisy ""MT @Independent: Katie...",0,1,0,1
101,102,1,"A day where ruled @twitter, I got followed by...",0,0,0,1
156,157,1,Haha who the fuck says that to someone#TrueGen...,0,0,1,1
185,186,1,LOL.. Micromax is promising that service? htt...,0,0,0,1
201,202,1,Did you know ? If you are reading this right n...,0,1,0,1
...,...,...,...,...,...,...,...
3547,3548,1,@Olivian_Forever oh yeah!!! 2nd :D #HoldTheApp...,0,0,0,1
3579,3580,1,Lol at docs who don't know what they're doing....,0,0,0,1
3597,3598,1,I'll watch #TakeMeOut like the rest of the cou...,0,1,0,1
3637,3638,1,@billdossett LOOL from the guy with multiple t...,0,0,0,1


In [None]:
num_ironic_tweets_laugh = len(df_feat.loc[(df_feat['laugh'] == 1) & 
                                          (df_feat['irony_label'] == 1)])

p_laugh = round((num_ironic_tweets_laugh / tot_ironic_tweets * 100), 2)

print("Ironic tweets contain the feature <laugh>", p_laugh, "% of the times")

Ironic tweets contain the feature <laugh> 3.77 % of the times


###Emojis

In [None]:
# Takes a dataset as input and returns a list of 1s and 0s,
# indicating which index correspond to a tweet that contains the feature
def get_emojis(dataset):
    hasemoji = []

    for tweet in dataset:
        emo = emojis.count(tweet)
        if emo != 0:
            hasemoji.append(1)
        else:
            hasemoji.append(0)
    return hasemoji

In [None]:
hasemoji = get_emojis(x_train)

print(hasemoji.count(0))
print(hasemoji.count(1))

3433
401


In [None]:
# add column to feature dataset
df_feat['emojis'] = hasemoji
df_feat.head()

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis
0,1,1,Sweet United Nations video. Just in time for C...,0,0,0,0,0
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,1,0,0,0
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,0,0,0,0
3,4,0,3 episodes left I'm dying over here,0,0,0,0,0
4,5,1,I can't breathe! was chosen as the most notabl...,0,0,0,0,0


In [None]:
# calculate correlation between the irony labels and the feature
# -1 indicates a perfectly negative linear correlation between two variables
# 0 indicates no linear correlation between two variables
# 1 indicates a perfectly positive linear correlation between two variables
df_feat[['irony_label', 'emojis']].corr()

Unnamed: 0,irony_label,emojis
irony_label,1.0,-0.047511
emojis,-0.047511,1.0


In [None]:
# list of 1s and 0s
# irony lables of the tweets that DO contain the feature
corr_list_emojis = []
for i in range(len(y_train)):
    if hasemoji[i] == 1:
        corr_list_emojis.append(y_train[i])

print(corr_list_emojis.count(0))
print(corr_list_emojis.count(1))

229
172


In [None]:
p_ironic_emojis = round(corr_list_emojis.count(1)*100/(corr_list_emojis.count(0)+corr_list_emojis.count(1)),2)

print("Out of the tweets that contain the feature <emojis>,", p_ironic_emojis, "% are ironic")

Out of the tweets that contain the feature <emojis>, 42.89 % are ironic


In [None]:
# Tweets where both irony label and feature are 1
# (ironic tweets that contain feature)
df_feat.loc[(df_feat['emojis'] == 1) & (df_feat['irony_label'] == 1)]

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis
24,25,1,Love these cold winter mornings 😬 best feeling...,0,0,1,0,1
38,39,1,Yay for another work at 4am day 😐,0,0,0,0,1
40,41,1,Omg @FloptimusCrime didn't tell @giantfootyguy...,0,0,0,0,1
68,69,1,I asked God to protect me from my enemies .. s...,0,0,0,0,1
99,100,1,i just loveee bein short and having my sweatpa...,0,1,1,0,1
...,...,...,...,...,...,...,...,...
3712,3713,1,Tryna learn to skate tonight! 😎#pennyboard #sk...,0,0,0,0,1
3738,3739,1,If I carried on the way half the men do on her...,0,1,0,0,1
3764,3765,1,And then today was 💩. What a surprise! #2of6 ...,0,0,0,0,1
3811,3812,1,The world is such a smiley place. 😳,0,0,0,0,1


In [None]:
num_ironic_tweets_emojis = len(df_feat.loc[(df_feat['emojis'] == 1) & 
                                           (df_feat['irony_label'] == 1)])

p_emojis = round((num_ironic_tweets_emojis / tot_ironic_tweets * 100), 2)

print("Ironic tweets contain the feature <emojis>", p_emojis, "% of the times")

Ironic tweets contain the feature <emojis> 9.0 % of the times


###Interjections

In [None]:
# create new x_train with normalised tweets
x_train_mod3 = []

for tweet in x_train:
    x_train_mod3.append(normalise_for_pos(tweet))

In [None]:
def convert_to_pos(dataset):
    # A list of lists, where each inner list represents a sentece.
    # Instead of words, it contains the POS tags of the words present in the original tweet
    x_pos = []

    for tweet in dataset:
        words = nltk.word_tokenize(tweet)
        words_pos = nltk.pos_tag(words)
        pos_tweet = []
        for i in range(len(words_pos)):
            tag = words_pos[i][1]
            pos_tweet.append(tag)
        x_pos.append(pos_tweet)

    return x_pos

In [None]:
x_train_pos = convert_to_pos(x_train_mod3)

In [None]:
x_train_pos[2]

['NN', 'EX', 'JJ', 'TO', 'VB', 'PRP', 'VBP', 'JJR', 'NN']

In [None]:
x_train_mod3[2]

'hey there nice to see you minnesotand winter weather'

In [None]:
def get_interjection(pos_data):
    interj = []

    for tweet in pos_data:
        if 'UH' in tweet:
            interj.append(1)
        else:
            interj.append(0)
            
    return interj

In [None]:
interj = get_interjection(x_train_pos)

In [None]:
print(interj.count(0))
print(interj.count(1))

3801
33


In [None]:
# add column to feature dataset
df_feat['interjection'] = interj
df_feat.head()

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis,interjection
0,1,1,Sweet United Nations video. Just in time for C...,0,0,0,0,0,0
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,1,0,0,0,0
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,0,0,0,0,0
3,4,0,3 episodes left I'm dying over here,0,0,0,0,0,0
4,5,1,I can't breathe! was chosen as the most notabl...,0,0,0,0,0,0


In [None]:
# calculate correlation between the irony labels and the feature
# -1 indicates a perfectly negative linear correlation between two variables
# 0 indicates no linear correlation between two variables
# 1 indicates a perfectly positive linear correlation between two variables
df_feat[['irony_label', 'interjection']].corr()

Unnamed: 0,irony_label,interjection
irony_label,1.0,0.025704
interjection,0.025704,1.0


In [None]:
# list of 1s and 0s
# irony lables of the tweets that DO contain the feature
corr_list_interj = []
for i in range(len(y_train)):
    if interj[i] == 1:
        corr_list_interj.append(y_train[i])

print(corr_list_interj.count(0))
print(corr_list_interj.count(1))

12
21


In [None]:
p_ironic_interj = round(corr_list_interj.count(1)*100/(corr_list_interj.count(0)+corr_list_interj.count(1)),2)

print("Out of the tweets that contain the feature <interjection>,", p_ironic_interj, "% are ironic")

Out of the tweets that contain the feature <interjection>, 63.64 % are ironic


In [None]:
# Tweets where both irony label and feature are 1
# (ironic tweets that contain feature)
df_feat.loc[(df_feat['interjection'] == 1) & (df_feat['irony_label'] == 1)]

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis,interjection
10,11,1,"Oh, thank GOD - our entire office email system...",0,1,0,0,0,1
419,420,1,@STLguy1 Oh that sounds like a great plan. @6...,0,0,0,0,0,1
582,583,1,Oh hello flu! Thank you for fooling me in thin...,0,0,0,0,0,1
698,699,1,oh lord! RT @popularmsem: RT @ShockingFactsz:...,0,0,0,0,0,1
750,751,1,OMG YES. 7:30 FINALS ON A FRIDAY ARE MY FAVORI...,1,0,0,0,1,1
754,755,1,"Oh, joy! 'Tis the season for ""Christmas came e...",0,0,0,0,0,1
834,835,1,Oh and it's a positively balmy 7c,0,0,0,0,0,1
1037,1038,1,"Watching the news being like; ""oh this totally...",0,0,0,0,0,1
1121,1122,1,Don't you just love when you say hi to someone...,0,0,0,0,1,1
1241,1242,1,"@Johnnyebs06 oh joy, now i get to have even mo...",0,0,0,0,0,1


In [None]:
num_ironic_tweets_interj = len(df_feat.loc[(df_feat['interjection'] == 1) & 
                                           (df_feat['irony_label'] == 1)])

p_interj = round((num_ironic_tweets_interj / tot_ironic_tweets * 100), 2)

print("Ironic tweets contain the feature <interjection>", p_interj, "% of the times")

Ironic tweets contain the feature <interjection> 1.1 % of the times


###Exclamation

In [None]:
# create new x_train with normalised tweets
x_train_mod4 = []

for tweet in x_train:
    x_train_mod4.append(normalise_for_punct(tweet))

In [None]:
# Takes a dataset as input and returns a list of 1s and 0s,
# indicating which index correspond to a tweet that contains the feature
def get_exclamation(dataset):
    exclamation = []

    for tweet in dataset:
        ex = int(bool(re.search("!+", tweet)))
        if ex == 1:
            exclamation.append(1)
        else:
            exclamation.append(0)
    
    return exclamation

In [None]:
exclamation = get_exclamation(x_train_mod4)

In [None]:
# Number of tweets with feature exclamation
print(exclamation.count(0))
print(exclamation.count(1))

3194
640


In [None]:
# add column to feature dataset
df_feat['exclamation'] = exclamation
df_feat.head()

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis,interjection,exclamation
0,1,1,Sweet United Nations video. Just in time for C...,0,0,0,0,0,0,0
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,1,0,0,0,0,0
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,0,0,0,0,0,1
3,4,0,3 episodes left I'm dying over here,0,0,0,0,0,0,0
4,5,1,I can't breathe! was chosen as the most notabl...,0,0,0,0,0,0,1


In [None]:
# calculate correlation between the irony labels and the feature
# -1 indicates a perfectly negative linear correlation between two variables
# 0 indicates no linear correlation between two variables
# 1 indicates a perfectly positive linear correlation between two variables
df_feat[['irony_label', 'exclamation']].corr()

Unnamed: 0,irony_label,exclamation
irony_label,1.0,0.01539
exclamation,0.01539,1.0


In [None]:
# list of 1s and 0s
# irony lables of the tweets that DO contain the feature
corr_list_excl = []
for i in range(len(y_train)):
    if exclamation[i] == 1:
        corr_list_excl.append(y_train[i])

print(corr_list_excl.count(0))
print(corr_list_excl.count(1))

310
330


In [None]:
p_ironic_excl = round(corr_list_excl.count(1)*100/(corr_list_excl.count(0)+corr_list_excl.count(1)),2)

print("Out of the tweets that contain the feature <exclamation>,", p_ironic_excl, "% are ironic")

Out of the tweets that contain the feature <exclamation>, 51.56 % are ironic


In [None]:
# Tweets where both irony label and feature are 1
# (ironic tweets that contain feature)
df_feat.loc[(df_feat['exclamation'] == 1) & (df_feat['irony_label'] == 1)]

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis,interjection,exclamation
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,0,0,0,0,0,1
4,5,1,I can't breathe! was chosen as the most notabl...,0,0,0,0,0,0,1
19,20,1,"But @DarklightDave was trying to find us, and ...",0,0,0,0,0,0,1
24,25,1,Love these cold winter mornings 😬 best feeling...,0,0,1,0,1,0,1
42,43,1,"HaHaHaHa!! #hypocrisy ""MT @Independent: Katie...",0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
3741,3742,1,Hit Them Angles And Duck Face To Hide Those Wr...,0,0,0,0,0,0,1
3764,3765,1,And then today was 💩. What a surprise! #2of6 ...,0,0,0,0,1,0,1
3768,3769,1,@VictoriasSecret @taylorswift13 really? My res...,0,0,0,0,0,0,1
3808,3809,1,@BBCRadMac @StuartMaconie years ago in M && S ...,0,0,0,0,0,0,1


In [None]:
num_ironic_tweets_exclamation = len(df_feat.loc[(df_feat['exclamation'] == 1) & 
                                                (df_feat['irony_label'] == 1)])

p_exclamation = round((num_ironic_tweets_exclamation / tot_ironic_tweets * 100), 2)

print("Ironic tweets contain the feature <exclamation>", p_exclamation, "% of the times")

Ironic tweets contain the feature <exclamation> 17.27 % of the times


###Hard exclamation

In [None]:
# Takes a dataset as input and returns a list of 1s and 0s,
# indicating which index correspond to a tweet that contains the feature
def get_hard_exclamation(dataset):
    hard_exclamation = []

    for tweet in dataset:
        #match 2 or more exclamation marks --> change to 3?
        hard_ex = int(bool(re.search("!{2,}", tweet)))
        if hard_ex == 1:
            hard_exclamation.append(1)
        else:
            hard_exclamation.append(0)
    
    return hard_exclamation

In [None]:
hard_exclamation = get_hard_exclamation(x_train_mod4)

In [None]:
# Number of tweets with feature ellipsis
print(hard_exclamation.count(0))
print(hard_exclamation.count(1))

3683
151


In [None]:
# add column to feature dataset
df_feat['hard_exclamation'] = hard_exclamation
df_feat.head()

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis,interjection,exclamation,hard_exclamation
0,1,1,Sweet United Nations video. Just in time for C...,0,0,0,0,0,0,0,0
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,1,0,0,0,0,0,0
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,0,0,0,0,0,1,0
3,4,0,3 episodes left I'm dying over here,0,0,0,0,0,0,0,0
4,5,1,I can't breathe! was chosen as the most notabl...,0,0,0,0,0,0,1,0


In [None]:
# calculate correlation between the irony labels and the feature
# -1 indicates a perfectly negative linear correlation between two variables
# 0 indicates no linear correlation between two variables
# 1 indicates a perfectly positive linear correlation between two variables
df_feat[['irony_label', 'hard_exclamation']].corr()

Unnamed: 0,irony_label,hard_exclamation
irony_label,1.0,0.020748
hard_exclamation,0.020748,1.0


In [None]:
# list of 1s and 0s
# irony lables of the tweets that DO contain the feature
corr_list_hard_excl = []
for i in range(len(y_train)):
    if hard_exclamation[i] == 1:
        corr_list_hard_excl.append(y_train[i])

print(corr_list_hard_excl.count(0))
print(corr_list_hard_excl.count(1))

68
83


In [None]:
p_ironic_hard_ex = round(corr_list_hard_excl.count(1)*100/(corr_list_hard_excl.count(0)+corr_list_hard_excl.count(1)),2)

print("Out of the tweets that contain the feature <hard_exclamation>,", p_ironic_hard_ex, "% are ironic")

Out of the tweets that contain the feature <hard_exclamation>, 54.97 % are ironic


In [None]:
# Tweets where both irony label and feature are 1
# (ironic tweets that contain feature)
df_feat.loc[(df_feat['hard_exclamation'] == 1) & (df_feat['irony_label'] == 1)]

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis,interjection,exclamation,hard_exclamation
19,20,1,"But @DarklightDave was trying to find us, and ...",0,0,0,0,0,0,1,1
42,43,1,"HaHaHaHa!! #hypocrisy ""MT @Independent: Katie...",0,1,0,1,0,0,1,1
69,70,1,Just delivered @DominiqueAnsel #cronuts to @Bo...,0,0,0,0,0,0,1,1
72,73,1,@MaggieLindemann: . @xSamSecrets123 awk moment...,0,0,0,0,0,0,1,1
79,80,1,.@nypost .@Cameron_Gray Oh that makes it all b...,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
3531,3532,1,"Got up early to get the earlier bus , & I miss...",0,1,0,0,0,0,1,1
3547,3548,1,@Olivian_Forever oh yeah!!! 2nd :D #HoldTheApp...,0,0,0,1,0,0,1,1
3638,3639,1,@AlWashingnutt Yeah...me too!!!!! !! x,0,1,0,0,0,0,1,1
3714,3715,1,@MicksTaxis my favourite day of the year !!!! ...,0,0,0,0,0,0,1,1


In [None]:
num_ironic_tweets_hard_ex = len(df_feat.loc[(df_feat['hard_exclamation'] == 1) &
                                            (df_feat['irony_label'] == 1)])

p_hard_ex = round((num_ironic_tweets_hard_ex / tot_ironic_tweets * 100), 2)

print("Ironic tweets contain the feature <exclamation>", p_hard_ex, "% of the times")

Ironic tweets contain the feature <exclamation> 4.34 % of the times


###Interrogative

In [None]:
# Takes a dataset as input and returns a list of 1s and 0s,
# indicating which index correspond to a tweet that contains the feature
def get_interrogative(dataset):
    interr = []

    for tweet in dataset:
        q = int(bool(re.search("\?+", tweet)))
        if q == 1:
            interr.append(1)
        else:
            interr.append(0)
    
    return interr

In [None]:
interr = get_interrogative(x_train_mod4)

In [None]:
# Number of tweets with feature interr
print(interr.count(0))
print(interr.count(1))

3422
412


In [None]:
# add column to feature dataset
df_feat['interrogative'] = interr
df_feat.head()

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis,interjection,exclamation,hard_exclamation,interrogative
0,1,1,Sweet United Nations video. Just in time for C...,0,0,0,0,0,0,0,0,0
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,1,0,0,0,0,0,0,0
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,0,0,0,0,0,1,0,0
3,4,0,3 episodes left I'm dying over here,0,0,0,0,0,0,0,0,0
4,5,1,I can't breathe! was chosen as the most notabl...,0,0,0,0,0,0,1,0,0


In [None]:
# calculate correlation between the irony labels and the feature
# -1 indicates a perfectly negative linear correlation between two variables
# 0 indicates no linear correlation between two variables
# 1 indicates a perfectly positive linear correlation between two variables
df_feat[['irony_label', 'interrogative']].corr()

Unnamed: 0,irony_label,interrogative
irony_label,1.0,-0.05113
interrogative,-0.05113,1.0


In [None]:
# list of 1s and 0s
# irony lables of the tweets that DO contain the feature
corr_list_interr = []
for i in range(len(y_train)):
    if interr[i] == 1:
        corr_list_interr.append(y_train[i])

print(corr_list_interr.count(0))
print(corr_list_interr.count(1))

237
175


In [None]:
p_ironic_interr = round(corr_list_interr.count(1)*100/(corr_list_interr.count(0)+corr_list_interr.count(1)),2)

print("Out of the tweets that contain the feature <interrogative>,", p_ironic_interr, "% are ironic")

Out of the tweets that contain the feature <interrogative>, 42.48 % are ironic


In [None]:
# Tweets where both irony label and feature are 1
# (ironic tweets that contain feature)
df_feat.loc[(df_feat['interrogative'] == 1) & (df_feat['irony_label'] == 1)]

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis,interjection,exclamation,hard_exclamation,interrogative
18,19,1,@YankeesWFAN @Ken_Rosenthal trading a SP for a...,0,0,0,0,0,0,0,0,1
19,20,1,"But @DarklightDave was trying to find us, and ...",0,0,0,0,0,0,1,1,1
41,42,1,"I THOUGHT WHAT WE HAD WAS REAL, HOW COULD YOU ...",0,0,0,0,0,0,0,0,1
64,65,1,Whatever happened to the Guano Apes? Did they ...,0,0,0,0,0,0,0,0,1
91,92,1,"I have an idea, how about 911 operators don't ...",0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3762,3763,1,@Slate can you please book one of these appoin...,0,0,0,0,0,0,0,0,1
3768,3769,1,@VictoriasSecret @taylorswift13 really? My res...,0,0,0,0,0,0,1,0,1
3789,3790,1,.@sunrise how far back can I search my google ...,0,0,0,0,0,0,0,0,1
3791,3792,1,China calls US hypocritical on #humanrights bu...,0,0,0,0,0,0,0,0,1


In [None]:
num_ironic_tweets_interr = len(df_feat.loc[(df_feat['interrogative'] == 1) & 
                                           (df_feat['irony_label'] == 1)])

p_interr = round((num_ironic_tweets_interr / tot_ironic_tweets * 100), 2)

print("Ironic tweets contain the feature <interrogative>", p_interr, "% of the times")

Ironic tweets contain the feature <interrogative> 9.16 % of the times


###Hard interrogative

In [None]:
# Takes a dataset as input and returns a list of 1s and 0s,
# indicating which index correspond to a tweet that contains the feature
def get_hard_interrogative(dataset):
    hard_interr = []

    for tweet in dataset:
        #match 2 or more exclamation marks --> change to 3?
        hard_q = int(bool(re.search("\?{3,}", tweet)))
        if hard_q == 1:
            hard_interr.append(1)
        else:
            hard_interr.append(0)

    return hard_interr

In [None]:
hard_interr = get_hard_interrogative(x_train_mod4)

In [None]:
# Number of tweets with feature hard_interr
print(hard_interr.count(0))
print(hard_interr.count(1))

3823
11


In [None]:
# add column to feature dataset
df_feat['hard_interr'] = hard_interr
df_feat.head()

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis,interjection,exclamation,hard_exclamation,interrogative,hard_interr
0,1,1,Sweet United Nations video. Just in time for C...,0,0,0,0,0,0,0,0,0,0
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,1,0,0,0,0,0,0,0,0
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,0,0,0,0,0,1,0,0,0
3,4,0,3 episodes left I'm dying over here,0,0,0,0,0,0,0,0,0,0
4,5,1,I can't breathe! was chosen as the most notabl...,0,0,0,0,0,0,1,0,0,0


In [None]:
# calculate correlation between the irony labels and the feature
# -1 indicates a perfectly negative linear correlation between two variables
# 0 indicates no linear correlation between two variables
# 1 indicates a perfectly positive linear correlation between two variables
df_feat[['irony_label', 'hard_interr']].corr()

Unnamed: 0,irony_label,hard_interr
irony_label,1.0,-0.004709
hard_interr,-0.004709,1.0


In [None]:
# list of 1s and 0s
# irony lables of the tweets that DO contain the feature
corr_list_hard_interr = []
for i in range(len(y_train)):
    if hard_interr[i] == 1:
        corr_list_hard_interr.append(y_train[i])

print(corr_list_hard_interr.count(0))
print(corr_list_hard_interr.count(1))

6
5


In [None]:
p_ironic_hard_interr = round(corr_list_hard_interr.count(1)*100/(corr_list_hard_interr.count(0)+corr_list_hard_interr.count(1)),2)

print("Out of the tweets that contain the feature <hard_interr>,", p_ironic_hard_interr, "% are ironic")

Out of the tweets that contain the feature <hard_interr>, 45.45 % are ironic


In [None]:
# Tweets where both irony label and feature are 1
# (ironic tweets that contain feature)
df_feat.loc[(df_feat['hard_interr'] == 1) & (df_feat['irony_label'] == 1)]

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis,interjection,exclamation,hard_exclamation,interrogative,hard_interr
747,748,1,@msmalcriada @ershannon Time to listen to ppl ...,0,0,0,0,0,0,1,1,1,1
2606,2607,1,WOW !!! Really ??? Homework in the weekends?! ...,0,0,0,0,1,0,1,1,1,1
3130,3131,1,@sunlorrie @ctv ??? #Liberal used in same sent...,0,0,0,0,0,0,1,1,1,1
3169,3170,1,Where's the Good Governance that we are ordere...,0,0,0,0,0,0,0,0,1,1
3696,3697,1,@muayguyblog @EFCworldwide what???? No verbal ...,0,0,0,0,0,0,0,0,1,1


In [None]:
num_ironic_tweets_hard_interr = len(df_feat.loc[(df_feat['hard_interr'] == 1) & 
                                                (df_feat['irony_label'] == 1)])

p_hard_interr = round((num_ironic_tweets_hard_interr / tot_ironic_tweets * 100), 2)

print("Ironic tweets contain the feature <hard_interr>", p_hard_interr, "% of the times")

Ironic tweets contain the feature <hard_interr> 0.26 % of the times


###Tagged users

In [None]:
# Takes a dataset as input and returns a list of 1s and 0s,
# indicating which index correspond to a tweet that contains the feature
def get_tagged_users(dataset):
    tags = []

    for tweet in dataset:
        # not how many tags are present, but if at least one tag is present
        tag = int(bool(re.search("@[a-zA-Z0-9]+", tweet)))
        if tag == 1:
            tags.append(1)
        else:
            tags.append(0)
            
    return tags

In [None]:
tags = get_tagged_users(x_train)

# Number of tweets with feature ellipsis
tags.count(1)

1521

In [None]:
# add column to feature dataset
df_feat['user_tag'] = tags
df_feat.head()

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis,interjection,exclamation,hard_exclamation,interrogative,hard_interr,user_tag
0,1,1,Sweet United Nations video. Just in time for C...,0,0,0,0,0,0,0,0,0,0,0
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,1,0,0,0,0,0,0,0,0,1
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,0,0,0,0,0,0,1,0,0,0,0
3,4,0,3 episodes left I'm dying over here,0,0,0,0,0,0,0,0,0,0,0
4,5,1,I can't breathe! was chosen as the most notabl...,0,0,0,0,0,0,1,0,0,0,0


In [None]:
# calculate correlation between the irony labels and the feature
# -1 indicates a perfectly negative linear correlation between two variables
# 0 indicates no linear correlation between two variables
# 1 indicates a perfectly positive linear correlation between two variables
df_feat[['irony_label', 'user_tag']].corr()

Unnamed: 0,irony_label,user_tag
irony_label,1.0,-0.122752
user_tag,-0.122752,1.0


In [None]:
# list of 1s and 0s
# irony lables of the tweets that DO contain the feature
corr_list_tags = []
for i in range(len(y_train)):
    if tags[i] == 1:
        corr_list_tags.append(y_train[i])

print(corr_list_tags.count(0))
print(corr_list_tags.count(1))

878
643


In [None]:
p_ironic_tag = round(corr_list_tags.count(1)*100/(corr_list_tags.count(0)+corr_list_tags.count(1)),2)

print("Out of the tweets that contain the feature <user_tag>,", p_ironic_tag, "% are ironic")

Out of the tweets that contain the feature <user_tag>, 42.27 % are ironic


In [None]:
# Tweets where both irony label and feature are 1
# (ironic tweets that contain feature)
df_feat.loc[(df_feat['user_tag'] == 1) & (df_feat['irony_label'] == 1)]

Unnamed: 0,index,irony_label,tweet,all_caps,ellipsis,pol_change,laugh,emojis,interjection,exclamation,hard_exclamation,interrogative,hard_interr,user_tag
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,0,1,0,0,0,0,0,0,0,0,1
18,19,1,@YankeesWFAN @Ken_Rosenthal trading a SP for a...,0,0,0,0,0,0,0,0,1,0,1
19,20,1,"But @DarklightDave was trying to find us, and ...",0,0,0,0,0,0,1,1,1,0,1
20,21,1,@deputymartinski please do..i need the second ...,0,0,0,0,0,0,0,0,0,0,1
22,23,1,@yWTorres9 time to hit the books then,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3791,3792,1,China calls US hypocritical on #humanrights bu...,0,0,0,0,0,0,0,0,1,0,1
3793,3794,1,"@SouthamptonFC So Monday it is ,tickets on gen...",0,0,0,0,0,0,0,0,0,0,1
3795,3796,1,@ClayTravisBGID One could hope.,0,0,0,0,0,0,0,0,0,0,1
3808,3809,1,@BBCRadMac @StuartMaconie years ago in M && S ...,0,0,0,0,0,0,1,0,0,0,1


In [None]:
num_ironic_tweets_user_tag = len(df_feat.loc[(df_feat['user_tag'] == 1) & 
                                             (df_feat['irony_label'] == 1)])

p_user_tag = round((num_ironic_tweets_user_tag / tot_ironic_tweets * 100), 2)

print("Ironic tweets contain the feature <user_tag>", p_user_tag, "% of the times")

Ironic tweets contain the feature <user_tag> 33.65 % of the times


##Visualise

In [None]:
features = ['all_caps', 'ellipsis', 'pol_change', 'laugh', 'emojis', 
            'interjection', 'exclamation', 'hard_exclamation', 'interrogative', 
            'hard_interr', 'user_tag']

In [None]:
# Out of the tweets that contain <feature>, <p_ironic_feature>% are ironic
p_ironic_feat = [p_ironic_all_caps, p_ironic_ellipsis, p_ironic_pol_change, 
                 p_ironic_laugh, p_ironic_emojis, p_ironic_interj, 
                 p_ironic_excl, p_ironic_hard_ex, p_ironic_interr, 
                 p_ironic_hard_interr, p_ironic_tag]

In [None]:
# Ironic tweets contain <feature> <p_feature>% of the times
p_feat = [p_all_caps, p_ellipsis, p_pol_change, p_laugh, p_emojis, p_interj, 
          p_exclamation, p_hard_ex, p_interr, p_hard_interr, p_user_tag]

In [None]:
len(p_ironic_feat) == len(p_feat)

True

In [None]:
my_dict = {'features': features, 
           '% tweets with feature (base ironic tweet)': p_feat, 
           '% of tweets with feature (base whole dataset)': p_ironic_feat}

In [None]:
pd.DataFrame(my_dict)

Unnamed: 0,features,% of ironic tweets with feature,% of tweets with feature that are ironic
0,all_caps,0.63,23.08
1,ellipsis,11.88,46.9
2,pol_change,14.02,55.03
3,laugh,3.77,53.73
4,emojis,9.0,42.89
5,interjection,1.1,63.64
6,exclamation,17.27,51.56
7,hard_exclamation,4.34,54.97
8,interrogative,9.16,42.48
9,hard_interr,0.26,45.45


In [None]:
df_feat.corr()

Unnamed: 0,index,irony_label,all_caps,ellipsis,pol_change,laugh,emojis,interjection,exclamation,hard_exclamation,interrogative,hard_interr,user_tag
index,1.0,-0.015885,0.007127,-0.024508,0.005332,-0.002799,-0.00596,-0.017272,-0.011223,-0.011823,-0.007604,0.02142,0.029168
irony_label,-0.015885,1.0,-0.062772,-0.02237,0.039574,0.014798,-0.047511,0.025704,0.01539,0.020748,-0.05113,-0.004709,-0.122752
all_caps,0.007127,-0.062772,1.0,0.057274,-0.031184,-0.010037,-0.025338,0.013485,-0.016207,-0.000556,-0.018843,0.078028,-0.039774
ellipsis,-0.024508,-0.02237,0.057274,1.0,-0.008203,0.017464,0.0215,-0.00141,-0.031153,0.011862,-0.002562,-0.005706,-0.05299
pol_change,0.005332,0.039574,-0.031184,-0.008203,1.0,0.046825,0.020641,-0.027063,0.016186,0.007328,0.016862,-0.005817,-0.059559
laugh,-0.002799,0.014798,-0.010037,0.017464,0.046825,1.0,0.041697,-0.002358,0.032873,0.063687,-0.001832,-0.010208,0.054694
emojis,-0.00596,-0.047511,-0.025338,0.0215,0.020641,0.041697,1.0,0.005061,0.000142,0.040343,-0.063548,-0.002398,-0.068088
interjection,-0.017272,0.025704,0.013485,-0.00141,-0.027063,-0.002358,0.005061,1.0,0.026435,-0.00435,-0.004979,-0.004998,0.028329
exclamation,-0.011223,0.01539,-0.016207,-0.031153,0.016186,0.032873,0.000142,0.026435,1.0,0.45234,0.005027,0.041372,0.073062
hard_exclamation,-0.011823,0.020748,-0.000556,0.011862,0.007328,0.063687,0.040343,-0.00435,0.45234,1.0,-0.00964,0.114492,0.013969
