<a href="https://colab.research.google.com/github/shielamms/Tensorflow-Notebooks/blob/main/Text_Preprocessing_for_Sentiment_Analysis_in_TF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample text preprocessing

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
# Toy corpus for the tokenizer demo: mixed casing, punctuation ('!', ',', '?')
# and repeated words ('a', 'have', 'dog', 'you', 'the') across sentences.
sentences = [
    'I have a dog',
    'She has a wonderful cat!',
    'You, do you have a pet?',
    'The dog jumps over the moon'
]

In [27]:
# Fit a tokenizer on the toy corpus. lower=True folds case before indexing and
# '<OOV>' is reserved for words that are not in the fitted vocabulary.
# num_words=100 is a vocabulary cap well above this corpus's size
# (exact cut-off semantics: see the Keras Tokenizer docs).
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>', lower=True)
tokenizer.fit_on_texts(sentences)

# word_index: token -> integer id; sequences: each sentence as a list of ids.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)

print(
    f'Word index: {word_index}',
    f'Sequences: {sequences}',
    sep='\n',
)

Word index: {'<OOV>': 1, 'a': 2, 'have': 3, 'dog': 4, 'you': 5, 'the': 6, 'i': 7, 'she': 8, 'has': 9, 'wonderful': 10, 'cat': 11, 'do': 12, 'pet': 13, 'jumps': 14, 'over': 15, 'moon': 16}
Sequences: [[7, 3, 2, 4], [8, 9, 2, 10, 11], [5, 12, 5, 3, 2, 13], [6, 4, 14, 15, 6, 16]]


In [28]:
# Force every sequence to exactly 5 ids. 'pre' is pad_sequences' default for both
# options, spelled out here: short rows get leading zeros and long rows lose
# their *first* tokens.
padded_sequences = pad_sequences(
    sequences,
    maxlen=5,
    padding='pre',
    truncating='pre',
)
padded_sequences

array([[ 0,  7,  3,  2,  4],
       [ 8,  9,  2, 10, 11],
       [12,  5,  3,  2, 13],
       [ 4, 14, 15,  6, 16]], dtype=int32)

In [31]:
# 'Hello' and 'there' were never seen during fitting, so both come back as the
# '<OOV>' id (1); 'dog' keeps its fitted id and the trailing '!!' is ignored.
tokenizer.texts_to_sequences(['Hello there dog!!'])

[[1, 1, 4]]

# Sarcasm Detection

In [1]:
!wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json


--2023-03-30 12:01:14--  https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.196.128, 173.194.210.128, 173.194.214.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.196.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘sarcasm.json.1’


2023-03-30 12:01:15 (115 MB/s) - ‘sarcasm.json.1’ saved [5643545/5643545]



In [2]:
import json

# Parse the downloaded dataset into `data`.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere and can garble or
# fail on non-ASCII characters in the JSON text.
with open('./sarcasm.json', 'r', encoding='utf-8') as file:
  data = json.load(file)

In [3]:
# Peek at the first record to see its fields (article_link, headline, is_sarcastic).
data[0]

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
 'is_sarcastic': 0}

In [4]:
# A record labelled sarcastic (is_sarcastic == 1), for comparison with data[0].
data[1030]

{'article_link': 'https://www.theonion.com/4-hours-scrolling-through-facebook-before-bed-referred-1819578025',
 'headline': "4 hours scrolling through facebook before bed referred to as 'winding down'",
 'is_sarcastic': 1}

In [19]:
# Three parallel, initially empty containers for the dataset's headlines,
# sarcasm labels and article links. Note that `sentences` is rebound here and no
# longer refers to the toy corpus from the first section.
sentences, labels, urls = [], [], []

In [20]:
# Split the records into three parallel lists (same order as `data`).
# Each list is rebuilt from scratch instead of appended to, so running this cell
# more than once cannot duplicate the dataset. `.get` is kept so a record that
# lacks a key yields None rather than raising.
sentences = [entry.get('headline') for entry in data]
labels = [entry.get('is_sarcastic') for entry in data]
urls = [entry.get('article_link') for entry in data]

In [21]:

# Preprocessing: fit a fresh tokenizer on the sarcasm headlines.
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# lower=False leaves the headlines' casing untouched; words outside the fitted
# vocabulary are encoded with the '<OOV>' token.
tokenizer = Tokenizer(lower=False, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

# Number of entries in the fitted vocabulary.
word_index = tokenizer.word_index
print(f'Word index size: {len(word_index)}')

Word index size: 29657


In [23]:
# Encode every headline as a list of word ids, then pad into a rectangular array.
# padding='pre' is pad_sequences' default, spelled out here: zeros go on the
# left, and with no maxlen each row is as long as the longest headline.
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences, padding='pre')

# Spot-check one headline at each stage of the pipeline.
index = 5
print(
    f'Original sentence: {sentences[index]}',
    f'Tokenized sequence: {sequences[index]}',
    f'Padded sequence: {padded_sequences[index]}',
    sep='\n',
)


Original sentence: advancing the world's women
Tokenized sequence: [10738, 4, 365, 73]
Padded sequence: [    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
 10738     4   365    73]
