In [1]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle
import nltk
from nltk.corpus import stopwords
import re,string,unicodedata
from bs4 import BeautifulSoup
import tensorflow_hub as hub
import tensorflow as tf
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score as acc
# Make sure the NLTK English stop-word corpus is available locally
# (the call reports "already up-to-date" when it has been downloaded before).
nltk.download('stopwords')
# Raise pandas' per-cell display width to 200 characters for DataFrame previews.
pd.set_option('display.max_colwidth', 200)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tryptophanv2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# read data
# Both splits are loaded from CSV files under ../Data; the relative paths are
# resolved against the notebook's working directory.
train_data = pd.read_csv("../Data/train.csv")
test_data = pd.read_csv("../Data/test.csv")

In [3]:
#Parse dataset -- In total : 28619 samples

# Tokens to drop during cleaning: NLTK's English stop words plus every
# single ASCII punctuation character from string.punctuation.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

def strip_html(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return only its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    """Return ``text`` with every ``[...]`` span (brackets included) deleted."""
    # Raw string: '\[' in a normal string literal is an invalid escape sequence.
    return re.sub(r'\[[^]]*\]', '', text)

# Removing URLs
def remove_urls(text):
    """Return ``text`` with every URL deleted.

    A URL is matched as the literal ``http`` followed by the run of
    non-whitespace characters after it (so ``https://...`` is covered too).
    """
    # This used to be a second definition of remove_between_square_brackets,
    # which shadowed the real one so bracketed spans were never removed.
    return re.sub(r'http\S+', '', text)

#Removing the stopwords from text
def remove_stopwords(text):
    """Return ``text`` without stop words, remaining tokens joined by single spaces.

    Tokens are the whitespace-separated pieces of ``text``; a token is dropped
    when its lower-cased form is in the module-level ``stop`` set. Surviving
    tokens keep their original casing.
    """
    # str.split() with no argument already yields whitespace-free tokens,
    # so no extra strip() is needed.
    return " ".join(token for token in text.split() if token.lower() not in stop)

#Removing the noisy text
def denoise_text(text):
    """Clean one raw headline.

    Steps, in order: strip HTML markup, delete ``[...]`` spans, delete URLs,
    drop stop words. Returns the cleaned string.
    """
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    return text

# Run the denoising pipeline over the headline column of both splits,
# overwriting the column in place.
for split_frame in (train_data, test_data):
    split_frame['headline'] = split_frame['headline'].apply(denoise_text)

In [4]:
# Preview the first rows of the training frame after denoising.
train_data.head()

Unnamed: 0,is_sarcastic,headline
0,1,'you better give dad good trade deal sorry!' shout angry trump boys phone employee local chinese restaurant
1,0,open letter graduation speakers
2,0,donald trump makes dubious claim inauguration singer jackie evancho
3,1,pope francis clarifies god one many immortal beings speak every day
4,1,woman geared complain work sidelined friend marital problems


In [5]:
#lemmatize (normalize) the text by leveraging the popular spaCy library.


# import spaCy's language model
# The parser and NER components are disabled; the lemmatization in this cell
# only reads token.lemma_ (presumably disabling them is for speed).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# function to lemmatize text
def lemmatization(texts):
    """Lemmatize each text with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    texts : iterable of str
        Texts to process (e.g. a pandas Series of headlines).

    Returns
    -------
    list of str
        One string per input text, in input order, made of the tokens'
        lemmas joined by single spaces.
    """
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # invoking the full pipeline once per text, and yields Docs in input order.
    return [' '.join(token.lemma_ for token in doc) for doc in nlp.pipe(texts)]

# Replace the headline column of both splits with its lemmatized form.
# The column is overwritten, so re-running this cell lemmatizes text that
# has already been lemmatized.
for split_frame in (train_data, test_data):
    split_frame['headline'] = lemmatization(split_frame['headline'])

In [6]:
# Preview the first rows of the training frame after lemmatization.
train_data.head()

Unnamed: 0,is_sarcastic,headline
0,1,' -PRON- well give dad good trade deal sorry ! ' shout angry trump boy phone employee local chinese restaurant
1,0,open letter graduation speaker
2,0,donald trump make dubious claim inauguration singer jackie evancho
3,1,pope francis clarify god one many immortal being speak every day
4,1,woman gear complain work sideline friend marital problem


In [7]:
# Get pretrained ELMO vectors
# Loads ELMo v2 from TensorFlow Hub through the hub.Module API.
# NOTE(review): trainable=True is requested, but nothing visible in this
# notebook fine-tunes the module — confirm it is actually needed.
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

#Function to get elmo vectors for words in a string
def elmo_vectors(x):
  """Return the ELMo features of each text in ``x``, averaged over axis 1.

  ``x`` must provide ``.tolist()``; the commented-out extraction cell in this
  notebook passes a pandas Series of headlines. The module's "elmo" output is
  reduced with ``tf.reduce_mean(..., 1)`` and evaluated inside a new
  ``tf.Session``; the evaluated result is returned.

  NOTE(review): presumably the "elmo" output is (batch, tokens, 1024), so the
  mean runs over token positions and would include any padding added for
  shorter texts in the batch — TODO confirm against the module documentation.
  NOTE(review): every call invokes ``elmo(...)`` again, opens a new session
  and re-runs both initialisers; this looks like it adds ops to the default
  graph on each call — confirm, and consider building the graph once if
  batch extraction gets slower over time.
  """
  # Ask the hub module for all of its outputs and keep the "elmo" one.
  embeddings = elmo(x.tolist(), signature="default", as_dict=True)["elmo"]
  with tf.Session() as sess:
    # Variables and lookup tables are initialised before the embeddings are evaluated.
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    # return average of ELMo features
    return sess.run(tf.reduce_mean(embeddings,1))

In [8]:
##################################################################################################################
############################## ONLY RUN CELL IF ELMO EMBEDDED DATA ISN'T AVAILABLE ###############################
##################################################################################################################



# #Break training and test data down as using ELMO embeddings for all data at once will cause memory issues
# list_train = [train_data[i:i+100] for i in range(0,train_data.shape[0],100)]
# list_test = [test_data[i:i+100] for i in range(0,test_data.shape[0],100)]

# # Extract ELMo embeddings
# elmo_train = []
# elmo_test = []
# tr_len = len(list_train)
# te_len = len(list_test)

# for i,tr in enumerate(list_train):
#     elmo_train.append(elmo_vectors(tr['headline']))
#     print(f"Processed {i}/{tr_len}")
# for i,te in enumerate(list_test):
#     elmo_test.append(elmo_vectors(te['headline']))
#     print(f"Processed {i}/{te_len}")

# # Concatenate data back again
# elmo_train_data = np.concatenate(elmo_train, axis = 0)
# elmo_test_data = np.concatenate(elmo_test, axis = 0)
# with open('elmo_train_data.pkl','wb') as f:
#     pickle.dump(elmo_train_data, f)
    
# with open('elmo_test_data.pkl','wb') as f:
#     pickle.dump(elmo_test_data, f)

In [9]:
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    # NOTE: pickle.load can execute arbitrary code while loading — only use it
    # on files you produced yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# Load the cached ELMo embeddings for both splits.
# NOTE(review): the commented-out extraction cell above writes
# 'elmo_train_data.pkl' / 'elmo_test_data.pkl' to the current directory, while
# this cell reads them from ../Data — move the files or align the paths.
elmo_train_data = _load_pickle('../Data/elmo_train_data.pkl')
elmo_test_data = _load_pickle('../Data/elmo_test_data.pkl')

print(elmo_train_data.shape,elmo_test_data.shape)

(21464, 1024) (7155, 1024)


In [10]:
# Hold out 20% of the training embeddings (with their labels) for validation;
# the fixed random_state keeps the split repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    elmo_train_data,
    train_data['is_sarcastic'],
    test_size=0.2,
    random_state=69,
)

# Logistic regression on the ELMo features, allowing up to 2500 solver iterations.
lreg = LogisticRegression(max_iter=2500).fit(xtrain, ytrain)

# Accuracy on the held-out validation part (last expression, so it is displayed).
preds_valid = lreg.predict(xvalid)
acc(yvalid, preds_valid)


0.7905893314698346

In [11]:
# What about the final test set?
preds_test = lreg.predict(elmo_test_data)
# accuracy_score takes (y_true, y_pred). Accuracy is the same either way, but
# the arguments are passed in the documented order to match the validation
# cell and to stay correct if another metric is swapped in later.
acc(test_data['is_sarcastic'], preds_test)

0.7916142557651992

In [14]:
# Output as predicted probabilities -- first column is 0 (not sarcastic) and second column is 1 (is sarcastic)
# (predict_proba orders its columns like lreg.classes_)
print(lreg.predict_proba(elmo_test_data))

[[0.98625291 0.01374709]
 [0.69060755 0.30939245]
 [0.03140663 0.96859337]
 ...
 [0.02532157 0.97467843]
 [0.06298205 0.93701795]
 [0.18268463 0.81731537]]
