# Sarcasm Analysis

# Library and Variable

In [16]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

vocab_size = 10000
embedding_dim = 16
max_length = 32
trunch_type = 'post'
padding_type = 'post'
oov_tok = '<oov>'
training_size = 20000

## Importing Data

In [17]:
import json

with open("sarcasm.json", "r") as f:
    datastore = json.load(f) # now datastore is a list
    
sentences = []
labels = []

for all in datastore:
    sentences.append(all['headline'])
    labels.append(all['is_sarcastic'])
    
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

## Tokenizer

In [18]:
vocab_size = 10000
embedding_dim = 16
max_length = 32
trunch_type = 'post'
padding_type = 'post'
oov_tok = '<oov>'
training_size = 20000

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen= max_length, padding=padding_type, truncating=trunch_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen= max_length, padding=padding_type, truncating=trunch_type)


index = 250
print(f"headline: {training_sequences[index]}")
print(f"padded seq: {training_padded[index]}")

headline: [6918, 595, 1, 2871, 70, 47, 2240, 1787, 4, 99, 11, 473, 2, 288]
padded seq: [6918  595    1 2871   70   47 2240 1787    4   99   11  473    2  288
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


## Build the Model

In [22]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.fit(
    training_padded,
    training_labels,
    epochs=30,
    validation_data=(testing_padded, testing_labels),
    verbose=2
)

Epoch 1/30
625/625 - 4s - 7ms/step - accuracy: 0.7502 - loss: 0.4776 - val_accuracy: 0.8506 - val_loss: 0.3532
Epoch 2/30
625/625 - 3s - 4ms/step - accuracy: 0.8803 - loss: 0.2806 - val_accuracy: 0.8454 - val_loss: 0.3492
Epoch 3/30
625/625 - 2s - 3ms/step - accuracy: 0.9081 - loss: 0.2241 - val_accuracy: 0.8429 - val_loss: 0.3843
Epoch 4/30
625/625 - 2s - 3ms/step - accuracy: 0.9317 - loss: 0.1773 - val_accuracy: 0.8287 - val_loss: 0.4284
Epoch 5/30
625/625 - 1s - 2ms/step - accuracy: 0.9404 - loss: 0.1523 - val_accuracy: 0.8410 - val_loss: 0.4341
Epoch 6/30
625/625 - 1s - 2ms/step - accuracy: 0.9488 - loss: 0.1364 - val_accuracy: 0.8398 - val_loss: 0.4605
Epoch 7/30
625/625 - 1s - 2ms/step - accuracy: 0.9558 - loss: 0.1216 - val_accuracy: 0.7857 - val_loss: 0.6834
Epoch 8/30
625/625 - 2s - 3ms/step - accuracy: 0.9567 - loss: 0.1170 - val_accuracy: 0.8271 - val_loss: 0.5439
Epoch 9/30
625/625 - 3s - 5ms/step - accuracy: 0.9625 - loss: 0.1024 - val_accuracy: 0.8159 - val_loss: 0.5338
E

<keras.src.callbacks.history.History at 0x280af2616a0>