In [83]:
import tensorflow as tf
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [84]:
# Load the sarcasm-headlines dataset; lines=True reads one JSON record per line.
df = pd.read_json(
    '../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json',
    lines=True,
)
# Preview the first rows as the cell output.
df.head()

In [85]:
# Features are the headline text; targets are the is_sarcastic column.
headlines, labels = df['headline'], df['is_sarcastic']

In [86]:
from sklearn.model_selection import train_test_split

# training set : validation set : test set = 8 : 1 : 1
# Step 1: hold out 20% of the data.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    headlines, labels, test_size=0.2, random_state=100
)
# Step 2: cut the hold-out in half to get the validation and test sets.
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=100
)

In [87]:
# Report the size of every split: the three feature sets first, then the three label sets.
for split in (x_train, x_val, x_test, y_train, y_val, y_test):
    print(split.shape)

In [88]:
# Hyperparameters shared by the tokenizer, the padding step and the model
num_epochs = 10        # number of training epochs
embedding_dim = 16     # output dimension of the Embedding layer
vocab_size = 10_000    # used as the Tokenizer's num_words and the Embedding input size
max_length = 120       # maxlen for pad_sequences and input_length of the Embedding
oov_token = '<00V>'    # out-of-vocabulary placeholder (note: spelled with digit zeros)
padding_type = trunc_type = 'post'  # pad and truncate at the end of each sequence

# Tokenize and pad
# The vocabulary is fitted on the training split only, so validation/test
# headlines do not influence the word index.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index


def encode_and_pad(texts):
    """Convert raw texts to fixed-length integer sequences.

    Args:
        texts: iterable of strings to encode with the already-fitted ``tokenizer``.

    Returns:
        The result of ``pad_sequences``: one row per text, padded/truncated to
        ``max_length`` according to ``padding_type`` and ``trunc_type``.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


# x_train / x_val / x_test are deliberately NOT overwritten with their token
# sequences: keeping the raw text makes this cell safe to re-run (fit_on_texts
# always receives strings) and leaves the original headlines available.
train_padded = encode_and_pad(x_train)
val_padded = encode_and_pad(x_val)
test_padded = encode_and_pad(x_test)

# Bidirectional LSTM classifier, assembled layer by layer
model_lstm = tf.keras.models.Sequential()
# Token ids -> dense vectors of size embedding_dim
model_lstm.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# Read each sequence in both directions with a 32-unit LSTM
model_lstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model_lstm.add(tf.keras.layers.Dense(24, activation='relu'))
# Single sigmoid unit: one score per headline for the binary target
model_lstm.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model_lstm.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_lstm.summary()

# Train on the padded training sequences, monitoring the validation split each epoch.
history_lstm = model_lstm.fit(
    train_padded,
    y_train,
    epochs=num_epochs,
    validation_data=(val_padded, y_val),
    verbose=2,
)

In [89]:
# Run the trained LSTM on the padded test sequences; the model's last layer is
# Dense(1, sigmoid), so this holds one score per test headline.
y_predict = model_lstm.predict(test_padded)

In [90]:
from sklearn.metrics import accuracy_score

# Flatten the model output (one sigmoid score per row) into a plain list of floats.
y_pred_list = y_predict.ravel().tolist()

# Get the predicted label: scores strictly above 0.5 become class 1, everything
# else class 0. For scores in [0, 1] this matches the previous round()-based
# labelling (round(0.5) == 0), but states the decision threshold explicitly.
y_pred_label = [int(score > 0.5) for score in y_pred_list]

y_true = y_test
# Last expression of the cell: test-set accuracy is shown as the cell output.
accuracy_score(y_true, y_pred_label)
