In [1]:
import tensorflow as tf
import numpy as np
import json
import tensorflow_datasets as tfds

In [2]:
# Location of the sarcasm headlines dataset.
# NOTE: this is a machine-specific absolute path; point it at your own copy of the file.
dataset_path = r"D:\TensorflowPractice\venv\Datasets\sarcasm\Sarcasm_Headlines_Dataset.json"
datastore = []

# The file is read line by line and every line is parsed as its own JSON
# document, hence json.loads() on each line rather than json.load() on the file.
# The encoding is given explicitly so the result does not depend on the
# platform default (e.g. cp1252 on Windows).
with open(dataset_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at end of file) is not a record.
            continue
        datastore.append(json.loads(line))

In [5]:
# Preview the first five parsed records to check the fields each one carries.
datastore[:5]

[{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
  'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
  'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
  'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1},
 {'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302',
  'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
  'is_sarcastic': 1},
 {'article_link': 'https://www.huffingtonpost.com/entry/jk-rowling-w

In [6]:
# Collect the headline text of every record, in dataset order.
sentences = []
for record in datastore:
    sentences.append(record['headline'])

In [8]:
# Create a text vectorisation layer with default settings and fit it on the headlines.
vectorise_layer = tf.keras.layers.TextVectorization()
vectorise_layer.adapt(sentences)

# Report how many entries the fitted vocabulary holds.
vocabulary = vectorise_layer.get_vocabulary()
print(len(vocabulary))

28435


In [9]:
# Run every headline through the fitted layer to get its integer token sequence.
# The result is a single rectangular tensor; in the sample printed below,
# shorter headlines are filled with trailing zeros.
post_padded_sequence = vectorise_layer(sentences)

In [10]:
# Show one headline next to its padded token sequence, then the overall shape.
index = 2
print(
    f'sample headline {sentences[index]}',
    f'padded sequence {post_padded_sequence[index]}',
    '',
    f'padded sequence shape {post_padded_sequence.shape}',
    sep='\n',
)

sample headline mom starting to fear son's web series closest thing she will have to grandchild
padded sequence [  140   825     2   813  1100  2048   571  5057   199   139    39    46
     2 13050     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0]

padded sequence shape (26709, 39)


In [11]:
# A second vectoriser, this time with ragged=True so each row keeps its own
# length instead of being padded to a common width.
vectorise_layer1 = tf.keras.layers.TextVectorization(ragged=True)
vectorise_layer1.adapt(sentences)
ragged_sequence = vectorise_layer1(sentences)

# Show the same sample headline with its unpadded sequence and the ragged shape.
index = 2
print(
    f'sample headline {sentences[index]}',
    f'ragged sequence {ragged_sequence[index]}',
    '',
    f'ragged sequence shape {ragged_sequence.shape}',
    sep='\n',
)

sample headline mom starting to fear son's web series closest thing she will have to grandchild
ragged sequence [  140   825     2   813  1100  2048   571  5057   199   139    39    46
     2 13050]

ragged sequence shape (26709, None)


In [16]:
# Convert the ragged rows to NumPy, then pad them with padding='pre' so the
# zeros go in front of the tokens rather than after them.
ragged_rows = ragged_sequence.numpy()
pre_padded_sequences = tf.keras.utils.pad_sequences(ragged_rows, padding='pre')

# Display one pre-padded row as a spot check.
pre_padded_sequences[5]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0, 14545,
           4,   351,    72], dtype=int32)

In [17]:
# Show one headline together with both padded versions of its sequence,
# each followed by a blank line.
index = 2
print(
    f'sample headline: {sentences[index]}',
    '',
    f'post-padded sequence: {post_padded_sequence[index]}',
    '',
    f'pre-padded sequence: {pre_padded_sequences[index]}',
    '',
    sep='\n',
)

# Report the dimensions of both padded results.
print(
    f'shape of post-padded sequences: {post_padded_sequence.shape}',
    f'shape of pre-padded sequences: {pre_padded_sequences.shape}',
    sep='\n',
)

sample headline: mom starting to fear son's web series closest thing she will have to grandchild

post-padded sequence: [  140   825     2   813  1100  2048   571  5057   199   139    39    46
     2 13050     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0]

pre-padded sequence: [    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0   140   825     2   813  1100  2048   571  5057   199   139    39
    46     2 13050]

shape of post-padded sequences: (26709, 39)
shape of pre-padded sequences: (26709, 39)
