<a href="https://colab.research.google.com/github/stbalaji/stbalaji/blob/main/01RetailDemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import random

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Toy corpus used to demonstrate word-level tokenization.
sentences = [
    'I love my dog',
    'I love my cat',
    'Good for you!',
    "Do you think my dog is amazing?",
]

# oov_token reserves an index for words that were not seen during fit, so the
# later cell that encodes an out-of-vocabulary word gets a placeholder index
# instead of having that word silently dropped.
tk = Tokenizer(num_words=100, oov_token="<OOV>")
tk.fit_on_texts(sentences)

wi = tk.word_index  # dictionary mapping each word to its integer encoding
print(wi)  # Note: punctuation such as ',' or '!' does not appear in the output
print("\n")

seq = tk.texts_to_sequences(sentences)  # one list of word indices per sentence
print(seq)

{'my': 1, 'i': 2, 'love': 3, 'dog': 4, 'you': 5, 'cat': 6, 'good': 7, 'for': 8, 'do': 9, 'think': 10, 'is': 11, 'amazing': 12}


[[2, 3, 1, 4], [2, 3, 1, 6], [7, 8, 5], [9, 5, 10, 1, 4, 11, 12]]


In [None]:
# Encode a sentence containing a word ("Elephant") that tk never saw in fit_on_texts.
sentences2 = ["I love my Elephant"]
seq2 = tk.texts_to_sequences(sentences2)  # one list of word indices per sentence

# The unseen word is encoded with the OOV index only when tk was created with
# an oov_token; without one it is left out of the sequence.
print(seq2)

[[3, 4, 2, 1]]


In [None]:
# Bring every sequence to exactly 5 entries: shorter ones are padded at the
# end, longer ones are cut at the end.
seq_padded = pad_sequences(
    seq,
    maxlen=5,
    padding='post',
    truncating='post',
)

# Show the vocabulary, the raw sequences and the padded matrix in turn.
for shown in (wi, seq, seq_padded):
    print(shown)

{'<OOV>': 1, 'my': 2, 'i': 3, 'love': 4, 'dog': 5, 'you': 6, 'cat': 7, 'good': 8, 'for': 9, 'do': 10, 'think': 11, 'is': 12, 'amazing': 13}
[[3, 4, 2, 5], [3, 4, 2, 7], [8, 9, 6], [10, 6, 11, 2, 5, 12, 13]]
[[ 3  4  2  5  0]
 [ 3  4  2  7  0]
 [ 8  9  6  0  0]
 [10  6 11  2  5]]


# Real-world Larger Dataset
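Source: https://rishabhmisra.github.io/publications/#datasets

Each record has three fields:
* `is_sarcastic`: 1 if the headline is sarcastic, 0 otherwise
* `headline`: the headline of the news article
* `article_link`: link to the original article (supplementary information)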

In [None]:
!wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json

--2022-09-13 06:13:17--  https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.204.128, 64.233.188.128, 64.233.189.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.204.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘sarcasm.json’


2022-09-13 06:13:17 (163 MB/s) - ‘sarcasm.json’ saved [5643545/5643545]



In [None]:
# Parse the downloaded dataset. The context manager closes the file handle as
# soon as parsing is done, and the explicit encoding keeps the read independent
# of the platform's default locale.
with open("sarcasm.json", 'r', encoding="utf-8") as datafile:
    data = json.load(datafile)
    print(datafile)  # shows which file was opened, its mode and its encoding

# Peek at the first five records to see the available fields.
for record in data[:5]:
    print(record)

<_io.TextIOWrapper name='sarcasm.json' mode='r' encoding='UTF-8'>
{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}
{'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365', 'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse", 'is_sarcastic': 0}
{'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697', 'headline': "mom starting to fear son's web series closest thing she will have to grandchild", 'is_sarcastic': 1}
{'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302', 'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas', 'is_sarcastic': 1}
{'article_link': 'https://

In [None]:
# Split the records into three parallel lists: headline text, sarcasm label
# and source URL. Position k in each list belongs to the same record.
sentences = [item["headline"] for item in data]
labels = [item["is_sarcastic"] for item in data]
urls = [item["article_link"] for item in data]

print(len(labels))

26709


# Processing the Headlines...Padded Sequences

In [None]:
# Learn the vocabulary from all headlines; "<OOV>" stands in for any word that
# is not part of that vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
wi = tokenizer.word_index  # dictionary: word -> integer index

# Turn each headline into a list of word indices, then pad at the end so that
# all rows share one length.
seq = tokenizer.texts_to_sequences(sentences)
padded_seq = pad_sequences(seq, padding='post')

print("\nWord Index len", len(wi))
print("Word Indexes", type(wi))



Word Index len 29657
Word Indexes <class 'dict'>


In [None]:
# Show one randomly chosen headline next to its padded encoding.
# randrange excludes its upper bound, so the index is always valid;
# random.randint(0, len(seq)) could return len(seq) and raise IndexError.
ind = random.randrange(len(seq))
print("Sample: ", sentences[ind])
print("Sample: ", padded_seq[ind])
print("shape: ", padded_seq.shape)


Sample:  shaken attorney general resigns after learning what murder is
Sample:  [4228 1657  896 1955   21 1085   33  779   11    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
Sample:  (26709, 40)


# New Section

Note: `texts_to_sequences` can be called on any set of sentences;
it encodes them using the **word set** that the tokenizer
learned from the texts that were passed to `fit_on_texts`.