In [4]:
import json

# Initialize empty lists to store data
sentences = []
labels = []
urls = []

# Open the JSON file
with open('Sarcasm_Headlines_Dataset.json', 'r') as f:
    # Read each line (assuming each line is a separate JSON object)
    for line in f:
        # Parse the JSON object in the line
        item = json.loads(line)
        sentences.append(item['headline'])
        labels.append(item['is_sarcastic'])
        urls.append(item['article_link'])




In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer=Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index=tokenizer.word_index
#print(word_index)

sequences=tokenizer.texts_to_sequences(sentences)
paddin=pad_sequences(sequences,padding='post')
print(paddin[0])
print(paddin.shape)

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


In [26]:
import tensorflow as tf
training_size=26709
vocab_size=10000
embedding_dim=100
#split into training and testing
training_sentences=sentences[0:training_size]
testing_sentences=sentences[training_size:]
training_labels=labels[0:training_size]
testing_labels=labels[training_size:]

In [27]:
tokenizer=Tokenizer(num_words=vocab_size,oov_token='<OOV>')
tokenizer.fit_on_texts(training_sentences)
word_index=tokenizer.word_index
#creating sequence and padding for test and train data
training_sequences=tokenizer.texts_to_sequences(training_sentences)
training_padded=pad_sequences(training_sequences,maxlen=100,padding='post',truncating='post')
testing_sequences=tokenizer.texts_to_sequences(testing_sentences)
testing_padded=pad_sequences(testing_sequences,maxlen=100,padding='post',truncating='post')

In [28]:
#converting into numpy arrays
import numpy as np
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [31]:
#model
model=tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=100),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
    
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
history=model.fit(training_padded,training_labels,epochs=30,validation_data=(testing_padded,testing_labels),verbose=2)

Epoch 1/30
835/835 - 16s - loss: 0.5313 - accuracy: 0.7180 - 16s/epoch - 20ms/step
Epoch 2/30
835/835 - 14s - loss: 0.3025 - accuracy: 0.8755 - 14s/epoch - 17ms/step
Epoch 3/30
835/835 - 14s - loss: 0.2408 - accuracy: 0.9039 - 14s/epoch - 16ms/step
Epoch 4/30
835/835 - 14s - loss: 0.2057 - accuracy: 0.9191 - 14s/epoch - 16ms/step
Epoch 5/30
835/835 - 14s - loss: 0.1825 - accuracy: 0.9287 - 14s/epoch - 16ms/step
Epoch 6/30
835/835 - 15s - loss: 0.1615 - accuracy: 0.9392 - 15s/epoch - 17ms/step
Epoch 7/30
835/835 - 15s - loss: 0.1496 - accuracy: 0.9431 - 15s/epoch - 17ms/step
Epoch 8/30
835/835 - 14s - loss: 0.1353 - accuracy: 0.9490 - 14s/epoch - 17ms/step
Epoch 9/30
835/835 - 14s - loss: 0.1275 - accuracy: 0.9526 - 14s/epoch - 17ms/step
Epoch 10/30
835/835 - 14s - loss: 0.1180 - accuracy: 0.9565 - 14s/epoch - 17ms/step
Epoch 11/30
835/835 - 14s - loss: 0.1070 - accuracy: 0.9624 - 14s/epoch - 17ms/step
Epoch 12/30
835/835 - 15s - loss: 0.1041 - accuracy: 0.9614 - 15s/epoch - 17ms/step
E

In [32]:
#testing
sentence=['It’s okay if you don’t like me. Not everyone has good taste.',
           'Tomorrow is Friday']
sequences=tokenizer.texts_to_sequences(sentence)
padded=pad_sequences(sequences,maxlen=100,padding='post',truncating='post')
print(model.predict(padded))

[[0.9999401 ]
 [0.10061174]]


In [None]:
#since 0.99 is near to 1 -sarcastic
#0.10 is near to 0 -not sarcastic