##### Implementation inspired by this [notebook](https://www.kaggle.com/code/madz2000/sarcasm-detection-with-glove-word2vec-83-accuracy) on Kaggle.

In [1]:
# Import all required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import matplotlib.pyplot as plt
import nltk
import tensorflow
import tensorflow.keras as keras
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from string import punctuation
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,GRU
import gensim
import pickle

In [6]:
# Load the train and test splits from their CSV files
train_data, test_data = (
    pd.read_csv("../Data/train.csv"),
    pd.read_csv("../Data/test.csv"),
)

In [3]:
#Parse dataset -- In total : 28619 samples

# Tokens to drop during cleaning: NLTK's English stopwords plus every
# single ASCII punctuation character.
# NOTE: this rebinds the name `punctuation` (imported from `string` above) to a list.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

def strip_html(text):
    """Parse `text` with BeautifulSoup's "html.parser" and return only its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    """Return `text` with every `[...]` span (brackets included) removed."""
    # Raw string: '\[' is not a valid escape in a plain string literal.
    return re.sub(r'\[[^]]*\]', '', text)

# Removing URL's
def remove_urls(text):
    """Return `text` with every run starting at `http` up to the next whitespace removed.

    This function was previously also named `remove_between_square_brackets`,
    which silently replaced the bracket-removing definition above.
    """
    return re.sub(r'http\S+', '', text)

#Removing the stopwords from text
def remove_stopwords(text):
    """Drop whitespace-separated tokens whose lowercase form is in the module-level `stop` set.

    Surviving tokens keep their original casing and are re-joined with single spaces.
    """
    return " ".join(token for token in text.split() if token.lower() not in stop)

#Removing the noisy text
def denoise_text(text):
    """Clean one headline: strip HTML, drop `[...]` spans, drop URLs, then drop stopwords."""
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    return text

#Clean the headline column of both splits (overwrites the column in place)
for frame in (train_data, test_data):
    frame['headline'] = frame['headline'].apply(denoise_text)

In [4]:
# Gets words by splitting sentences
def get_words(sentences=None):
    """Split each sentence on whitespace and return a list of token lists.

    Parameters
    ----------
    sentences : pandas.Series of str, optional
        Sentences to split. When omitted, the train headlines followed by the
        test headlines are used. The default is built at call time, so it
        always reflects the current contents of `train_data` / `test_data`
        rather than a snapshot taken when this function was defined.

    Returns
    -------
    list of list of str
        One token list per sentence, in input order.
    """
    if sentences is None:
        sentences = pd.concat([train_data['headline'], test_data['headline']], ignore_index=True)
    return [sentence.split() for sentence in sentences.values]


#Size of each word vector
EMBEDDING_DIM = 200

#Train Word2Vec on the tokenised headlines (takes time...)
w2v_model = gensim.models.Word2Vec(
    sentences=get_words(),
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=1,
)

In [28]:
#tokenizer
def tokenize(words=None, num_words=35000, maxlen=20):
    """Fit a Keras Tokenizer on `words` and return padded integer sequences.

    Parameters
    ----------
    words : list of list of str, optional
        Tokenised sentences. When omitted, `get_words()` is called at call
        time (not at definition time), so the current data is always used.
    num_words : int, default 35000
        Passed to `text.Tokenizer(num_words=...)`.
    maxlen : int, default 20
        Passed to `sequence.pad_sequences(maxlen=...)`; every row of the
        returned array has this length.

    Returns
    -------
    tuple
        `(tokenizer, x, vocab_size)` where `x` is the padded sequence array and
        `vocab_size` is `len(tokenizer.word_index) + 1`.
    """
    if words is None:
        words = get_words()
    tokenizer = text.Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(words)
    sequences = tokenizer.texts_to_sequences(words)
    x = sequence.pad_sequences(sequences, maxlen=maxlen)
    # +1 so the size covers index len(word_index); the weight matrix and the
    # Embedding layer are both dimensioned with this value.
    vocab_size = len(tokenizer.word_index) + 1
    return (tokenizer, x, vocab_size)

#Get back train and test data
tokenizer, x, vocab_size = tokenize()
# get_words() concatenates train headlines first and test headlines second,
# so the leading len(train_data) rows of `x` are the training rows.
x_train = x[:len(train_data)]
x_test = x[len(train_data):len(train_data) + len(test_data)]
y_train = train_data['is_sarcastic'].tolist()
y_test = test_data['is_sarcastic'].tolist()

In [6]:
# Function to create weight matrix from word2vec gensim model
def get_weight_matrix(model, vocab):
    """Build an embedding weight matrix indexed by the tokenizer's word ids.

    Parameters
    ----------
    model : object with a `.wv` attribute
        Trained word2vec model; `model.wv` must support `word in wv`,
        `wv[word]` and expose `vector_size`.
    vocab : dict of str -> int
        Word-to-index mapping (e.g. `tokenizer.word_index`).

    Returns
    -------
    numpy.ndarray
        Array of shape `(len(vocab) + 1, model.wv.vector_size)`. Row `i` holds
        the vector of the word mapped to `i`. Rows for words the model does not
        know, and any index not present in `vocab`, stay all-zero.
    """
    keyed_vectors = model.wv
    # one extra row so that index len(vocab) is addressable
    vocab_size = len(vocab) + 1
    # take the width from the model itself so matrix and vectors cannot disagree
    weight_matrix = np.zeros((vocab_size, keyed_vectors.vector_size))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        # the tokenizer may hold words (e.g. lowercased forms) the word2vec
        # model never saw; leave those rows at zero instead of raising KeyError
        if word in keyed_vectors:
            weight_matrix[i] = keyed_vectors[word]
    return weight_matrix

In [7]:
#Word2vec vectors arranged by tokenizer index, used as the initial weights of the Keras embedding layer
embedding_vectors = get_weight_matrix(model=w2v_model, vocab=tokenizer.word_index)

In [8]:
#Defining Neural Network
model = Sequential()
#Embedding layer initialised with the word2vec vectors; trainable=True, so the vectors are fine-tuned during training
model.add(Embedding(vocab_size, output_dim=EMBEDDING_DIM, weights=[embedding_vectors], input_length=20, trainable=True))
#Bidirectional LSTM; return_sequences=True hands the full sequence to the next recurrent layer
model.add(Bidirectional(LSTM(units=128 , recurrent_dropout = 0.3 , dropout = 0.3,return_sequences = True)))
#Bidirectional GRU that reduces the sequence to a single vector
model.add(Bidirectional(GRU(units=32 , recurrent_dropout = 0.1 , dropout = 0.1)))
#Single sigmoid unit, trained with binary cross-entropy against `is_sarcastic`
model.add(Dense(1, activation='sigmoid'))
#`learning_rate` is the supported keyword; `lr` is only a deprecated alias
model.compile(optimizer=keras.optimizers.Adam(learning_rate = 0.01), loss='binary_crossentropy', metrics=['acc'])

#The layer has been built from the matrix; drop the name to release the reference
del embedding_vectors

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [9]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 200)           7614400   
_________________________________________________________________
bidirectional (Bidirectional (None, 20, 256)           336896    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                55488     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 8,006,849
Trainable params: 8,006,849
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Fit model on the training split; the test split is used as validation data
# Labels are converted to arrays because newer TensorFlow versions reject
# plain Python lists of ints as targets.
history = model.fit(x_train, np.asarray(y_train), batch_size = 128 , validation_data = (x_test, np.asarray(y_test)) , epochs = 3)

Train on 21464 samples, validate on 7155 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [33]:
# Get prediction probabilities
# The final layer is a single sigmoid unit, so `predict` already returns
# probabilities; `predict_proba` no longer exists on Sequential in current TensorFlow.
pred_probs = model.predict(x_test)

In [4]:
# Dump probabilities as a pickle file
def dump_probs(pred_probs=None, path='./probs_for_1.pkl'):
    """Pickle prediction probabilities to `path`.

    Parameters
    ----------
    pred_probs : object, optional
        Probabilities to store. When omitted, the module-level `pred_probs`
        is looked up at call time, so the latest predictions are written
        rather than whatever existed when this function was defined.
    path : str, default './probs_for_1.pkl'
        Destination file; overwritten if it exists.

    Raises
    ------
    KeyError
        If `pred_probs` is omitted and no module-level `pred_probs` exists.
    """
    if pred_probs is None:
        # the parameter shadows the global of the same name, hence globals()
        pred_probs = globals()['pred_probs']
    with open(path, 'wb') as f:
        pickle.dump(pred_probs, f)

In [3]:
# Reload the stored probabilities from the pickle file
# (only unpickle files produced by this notebook; pickle can execute code on load)
with open('./probs_for_1.pkl', 'rb') as probs_file:
    preds = pickle.load(probs_file)

In [9]:
# Threshold each probability at 0.5: strictly above -> 1, otherwise 0
labels = [int(bool(prob > 0.5)) for prob in preds]

In [10]:
# Get accuracy score (scikit-learn's argument order is y_true first, then y_pred)
accuracy_score(test_data['is_sarcastic'], labels)

0.8756114605171209