<a href="https://colab.research.google.com/github/rezamohamadlo/SarcasmDetectionUsingLSTM/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import json
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

!wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json

# Load the JSON file.
# The encoding is stated explicitly so decoding does not fall back to the
# platform's locale default, which is not UTF-8 everywhere.
with open("./sarcasm.json", 'r', encoding="utf-8") as f:
    datastore = json.load(f)

# Collect sentences and labels from the dataset: every record contributes its
# 'headline' to `sentences` and its 'is_sarcastic' value to `labels`, in the
# same order, so the two lists stay index-aligned.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]

# --- Text preprocessing settings ---
# Upper bound on the number of unique words kept in the vocabulary.
vocab_size = 10_000
# Every sequence is padded or truncated to exactly this length.
max_length = 120
# Padding and truncation are both applied at the end of a sequence.
padding_type = 'post'
trunc_type = 'post'
# Stand-in token for out-of-vocabulary words.
oov_tok = "<OOV>"

# Create the tokenizer with the configured vocabulary cap and OOV placeholder,
# then build its word index from the headlines.
# NOTE(review): the tokenizer is fitted on every headline, including those that
# the later train/test split assigns to the test set — consider whether the
# vocabulary should be learned from the training portion only.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index  # word index dictionary built by the fit above

# Turn each headline into a sequence of integers
sequences = tokenizer.texts_to_sequences(sentences)

# Bring every sequence to the same length so the input size is uniform
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# The first `training_size` examples form the training set; everything after
# that index is held out for testing.
training_size = 22000

# Sequences: slice the padded array at the split point
training_sequences, testing_sequences = (
    padded_sequences[:training_size],
    padded_sequences[training_size:],
)

# Labels: slice the list at the same point, then convert each part to an array
training_labels, testing_labels = (
    np.array(labels[:training_size]),
    np.array(labels[training_size:]),
)

# Write each processed array to its own .npy file in the working directory
# (optional)
for filename, array in (
    ('training_sequences.npy', training_sequences),
    ('testing_sequences.npy', testing_sequences),
    ('training_labels.npy', training_labels),
    ('testing_labels.npy', testing_labels),
):
    np.save(filename, array)

print("Data preprocessing complete!")


--2024-09-06 12:14:38--  https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.111.207, 142.251.163.207, 142.251.167.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.111.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘sarcasm.json’


2024-09-06 12:14:38 (89.5 MB/s) - ‘sarcasm.json’ saved [5643545/5643545]

Data preprocessing complete!
