The datasets for this project is taken from kaggle and this project is based on to detect sarcasm  in news headline. There are only two categories i.e 0 for not sarcasm and 1 for sarcasm. 

In [0]:
import numpy as np 
import pandas as pd 
import json

In [2]:
df=pd.read_json("Sarcasm_Headlines_Dataset.json",lines=True)
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [3]:
df.shape

(26709, 3)

In [4]:
# since i am not using article_link column so droping it

df.drop("article_link", axis=1, inplace=True)
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
# using nltk for preprocessing of texts

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stoplist = stopwords.words('english')

training_data=df.headline.to_list()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
train_data=[]
for i in training_data:
    string=""
    for j in i.lower().split():
        if j not in stoplist: # removing stopwords from reviews
            string=string+j+" "
    train_data.append(string.rstrip())

In [7]:
train_data[:3]

["former versace store clerk sues secret 'black code' minority shoppers",
 "'roseanne' revival catches thorny political mood, better worse",
 "mom starting fear son's web series closest thing grandchild"]

In [8]:
train_labels=np.array(df.is_sarcastic.to_list())
train_labels[:5]

array([0, 0, 1, 1, 0])

In [9]:
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

oov_tok="<OOV_tok>"
padding="post"
max_length=25
trunc_type="post"
vocab_size=20000

#using tokenization on train data
tokenizer=Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_data)
word_index=tokenizer.word_index

train_sequence=tokenizer.texts_to_sequences(train_data)
train_pad_sequence=pad_sequences(train_sequence, padding=padding, maxlen=max_length, truncating=trunc_type)

TensorFlow 2.x selected.


In [10]:
# find maximum length of a review

max_length=[len(i.split()) for i in training_data]
print(max(max_length))

39


In [0]:
# reverse the key, value pair of tokenized words in word index dictionary

reverse_word_index=dict([(j, i) for i, j in word_index.items()])

In [23]:
len(reverse_word_index)

29590

In [14]:
def decode_review(review):
    return " ".join([reverse_word_index.get(i, "?") for i in review])

print(decode_review(train_pad_sequence[1]))
print(train_data[:1])

'roseanne' revival catches thorny political mood better worse ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
["former versace store clerk sues secret 'black code' minority shoppers"]


In [0]:
embid_dim=16
vocab_size=20000
max_length=25

model=tf.keras.models.Sequential([
                                tf.keras.layers.Embedding(vocab_size, embid_dim, input_length=max_length),
                                tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
                                tf.keras.layers.Dense(6, activation="relu"),
                                tf.keras.layers.Dense(1, activation="sigmoid")
])

In [16]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 16)            320000    
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                12544     
_________________________________________________________________
dense (Dense)                (None, 6)                 390       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 332,941
Trainable params: 332,941
Non-trainable params: 0
_________________________________________________________________


In [17]:
epochs=10

history=model.fit(train_pad_sequence, train_labels, epochs=epochs)

Train on 26709 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
# now using version2 of dataset for testing

df2=pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)
df2.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [19]:
test_data=df2.headline.to_list()
test_data[:3]

['thirtysomething scientists unveil doomsday clock of hair loss',
 'dem rep. totally nails why congress is falling short on gender, racial equality',
 'eat your veggies: 9 deliciously different recipes']

In [0]:
test_sequence=tokenizer.texts_to_sequences(test_data)
test_pad_sequence=pad_sequences(test_sequence, maxlen=max_length, padding=padding, truncating=trunc_type)

In [0]:
# predicting class

predictions=model.predict_classes(test_pad_sequence)

In [22]:
# checking accuracy with actual given labels

from sklearn.metrics import accuracy_score
print(accuracy_score(df2.is_sarcastic, predictions))


0.9247702575212271
