1. Read and explore the data

In [42]:
import json
data = []
for x in open('/content/Sarcasm_Headlines_Dataset.json', 'r'):
    data.append(json.loads(x))

In [43]:
import pandas as pd
df = pd.DataFrame(data, columns = ['article_link', 'headline','is_sarcastic']) 

In [44]:
#Check the first few samples, shape, info of the data 
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [45]:
#Shape of the data
df.shape
#There are 26709 records in the dataset and 3 columns

(26709, 3)

In [46]:
# lets check the basic information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_link  26709 non-null  object
 1   headline      26709 non-null  object
 2   is_sarcastic  26709 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 626.1+ KB


In [47]:
#Checking for the presence of null values
df['headline'].isna().sum()
#There are no null values in the dataset

0

2. Retain relevant columns

In [48]:
df=df.drop(['article_link'],axis=1)
#Dropping the 'article_link' column as it seems to be invaild


In [49]:
df
#Now there are only two columns in the dataset (headline,is_sarcastic)

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
...,...,...
26704,american politics in moral free-fall,0
26705,america's best 20 hikes,0
26706,reparations and obama,0
26707,israeli ban targeting boycott supporters raise...,0


3. Get length for each sentence

In [50]:
df['headline'].str.len()
#Checking the length of each sentence

0        78
1        84
2        79
3        84
4        64
         ..
26704    36
26705    23
26706    21
26707    60
26708    33
Name: headline, Length: 26709, dtype: int64

In [51]:
#But this is including the spaces, so we remove the spaces to get the actual length of sentences and store it in a separate column
df['head_len'] = df['headline'].apply(lambda x:len(x.split(" ")))

In [52]:
#Checking the maximum length of the sentence
max(df["head_len"])

39

In [53]:
df.sample(5)
#We can see that each sentence is of different length

Unnamed: 0,headline,is_sarcastic,head_len
7650,secret service agent not so secret about being...,1,12
24400,'boo 2! a madea halloween' leads a sluggish we...,0,13
11320,owner by far creepiest man in bar,1,7
23256,talking to our kids: the conversation we shoul...,0,10
22409,prince charles warns that the lessons of wwii ...,0,11


4. Define parameters

In [54]:
max_len = 25      #-set the max number of words
max_feat = 5000   #-set the max number of features

5. Get indices for words

In [55]:
#Using tokenizer for indexing
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


tokenizer = Tokenizer(num_words=max_feat)
tokenizer.fit_on_texts(list(df['headline']))

index = tokenizer.texts_to_sequences(df['headline'])

6. Create features and labels

In [56]:
#Padding the sentences to maintain a constant length and storing it in X variable and labels in y variable
X = pad_sequences(index, maxlen = max_len)
y = np.asarray(df['is_sarcastic'])
y

array([0, 0, 1, ..., 0, 0, 0])

7. Get vocabulary size

In [57]:
X.size

667725

In [58]:
X.shape

(26709, 25)

8. Create a weight matrix using GloVe embeddings

In [59]:
#Using the vectorized word embeddings(2nd text file)
word_emb ="glove.6B.100d.txt"
embed = {}
for x in open(word_emb, 'r', encoding='utf-8'):
#Splitting each word in the text file
 word_splt = x.split(" ")[0]
 embed_splt = x.split(" ")[1:]
 embed_splt = np.asarray(embed_splt, dtype='float32')
  

In [60]:
#Calculating the total number of words in the text file 
ttl_wrds = len(tokenizer.word_index) + 1
#Creating the weight matrix
embed_mat = np.zeros((ttl_wrds, 25))

for word, i in tokenizer.word_index.items():
    embed_vector = embed.get(word)
    if embed_vector is not None:
        embed_mat[i] = embed_vector

9. Define and compile a Bidirectional LSTM model.

In [61]:
#Splitting the X and y variables into train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =28, stratify=y, test_size=0.2)

In [62]:
#Checking the shape of X train and test
print(X_train.shape, X_test.shape)

(21367, 25) (5342, 25)


In [74]:
#Importing the required libraries
import tensorflow as tf
from tensorflow.keras.layers import BatchNormalization
from keras.models import Model
from tensorflow.keras.models import Sequential
from sklearn.metrics import mean_squared_error
from tensorflow.keras.layers import Embedding, Activation, Bidirectional, LSTM, Dense, Dropout, Flatten, Input

#Using bidirectional model
def Bidirectional():
    
    input = Input(name='inputs',shape=[max_len])
    x = Embedding(ttl_wrds,25,input_length=max_len, weights = [embed_mat])(input)
    x = LSTM(128)(x)
    x = Dense(64,name='Features')(x)
    x = Activation('LeakyReLU')(x)
    x = Dense(1,name='Out')(x)
    model = Model(inputs=input,outputs=x)
    return model

10. Fit the model and check the validation accuracy

In [75]:
# build the model
model = Bidirectional()
#Compiling the model with mean squared error as a loss function,accuracy as a metric and adam optimizer
model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(learning_rate=0.03), metrics=['accuracy'])

In [76]:
#checking the summmary of the model
model.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 25)]              0         
                                                                 
 embedding_7 (Embedding)     (None, 25, 25)            741425    
                                                                 
 lstm_7 (LSTM)               (None, 128)               78848     
                                                                 
 Features (Dense)            (None, 64)                8256      
                                                                 
 activation_7 (Activation)   (None, 64)                0         
                                                                 
 Out (Dense)                 (None, 1)                 65        
                                                                 
Total params: 828,594
Trainable params: 828,594
Non-trainab

In [77]:
#defining batch size and number of epochs
batch_size = 100
epochs = 15
#training the model
training_history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [78]:
#Evaluating the model
score = model.evaluate(X_test, y_test)
print("Loss: {}, Accuracy:{}".format(score[0], score[1]))

Loss: 0.138594850897789, Accuracy:0.8292773962020874


In [77]:
# we achieved a good validation accuracy of 82% with val loss(0.13) using bidirectional LSTM model after trying different parameters(increasing dense,activation and flatten 
#and layers). Bidirectional LSTM with the above parameters performed the best