#### Machine Learning Model for Sarcasm Detection 

This project makes use of the News Headlines dataset, where the dataset will contain the article link, headline and label for the headline. 

#### Data Pre-Processing

Loading the dataset. 


In [4]:
import pandas as pd 
import numpy as np 

data = pd.read_json("dataset/Sarcasm_Headlines_Dataset.json", lines=True)
data2 = pd.read_json("dataset/Sarcasm_Headlines_Dataset_v2.json", lines=True)

In [10]:
data = pd.concat([data, data2])
data.drop(['article_link'], axis=1, inplace= True)

In [11]:
data.reset_index(drop=True, inplace=True)

#### Preprocessing data using Tokenization & Padding

In [12]:
import tensorflow as tf 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Bidirectional, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.optimizers import Adam

2022-12-05 19:36:53.652998: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['headline'])
total_words = len(tokenizer.word_index)

In [16]:
def applyToken(s):
    tokens = tokenizer.texts_to_sequences(s)[0]
    return tokens

data['token'] = [applyToken([x]) for x in data['headline']]
max_len = max([len(x) for x in data['token']])
print(max_len)

152


In [18]:
padded = np.array(pad_sequences(data['token'], maxlen = max_len, padding= 'pre'))
padded
print(len(padded))


112566


array([[    0,     0,     0, ...,     5,  2782,  9018],
       [    0,     0,     0, ...,   251,     8,  1081],
       [    0,     0,     0, ...,    43,     1, 11426],
       ...,
       [    0,     0,     0, ...,     6,   818,  1861],
       [    0,     0,     0, ...,  2466,   837,  6340],
       [    0,     0,     0, ...,     6,   258,   179]], dtype=int32)

#### Building Model

In [20]:
split_train = int(0.8* len(padded))
X_train = padded[:split_train]
X_val = padded[split_train:]
Y_train =  data['is_sarcastic'][:split_train]
Y_val = data['is_sarcastic'][split_train:]

In [21]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(total_words + 1, 16,),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()
history = model.fit(X_train, Y_train, epochs=5, validation_data=(X_val, Y_val),verbose=1)

2022-12-05 19:55:19.867158: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          494160    
                                                                 
 bidirectional (Bidirectiona  (None, 64)               9600      
 l)                                                              
                                                                 
 dense (Dense)               (None, 6)                 390       
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 504,157
Trainable params: 504,157
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


#### Saving the model 

In [22]:
model.save('model.h5')

#### Prediction 

In [26]:
s = 'Cows lose their jobs as milk prices drop'
tokenizer.fit_on_texts(s)
s = tokenizer.texts_to_sequences([s])
s = pad_sequences(s, maxlen= max_len, padding='pre')

if model.predict(s) >= 0.75:
    print("Sarcastic")
else:
    print("Not Sarcastic")

Sarcastic
