<a href="https://colab.research.google.com/github/qamtam/Hands-on-machine-learning/blob/main/CH16_PART2_sentiment_analysis_with_notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "cnn"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside IMAGES_PATH.

    The file is named "<fig_id>.<fig_extension>". `fig_extension` is also
    passed to savefig as the format and `resolution` as the dpi. When
    `tight_layout` is true, plt.tight_layout() is applied before saving.
    """
    filename = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, filename)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

In [None]:
# Load the Keras IMDB dataset as train and test splits; reviews arrive as sequences of word IDs
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [None]:
# Reserved IDs in the encoded reviews:
#   0 -> padding token
#   1 -> start-of-sequence token
#   2 -> unknown words
# Beyond those, the smaller the ID, the more frequent the word.

# Example: decode the first training review back into text
word_index = keras.datasets.imdb.get_word_index()
# word_index maps word -> ID; shift every ID by 3 to leave room for the reserved IDs above.
# The braces here build a dict (ID -> word), not a set.
id_to_word = {word_id + 3: word for word, word_id in word_index.items()}
# NOTE(review): these labels are display-only; "<sos>" / "<unk>" would be the conventional spellings.
id_to_word.update(enumerate(("<pad>", "<sus>", "unknown")))

" ".join(id_to_word[token_id] for token_id in X_train[0])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


"<sus> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and shou

In [None]:
id_to_word[5] # id_to_word is a dict, so it is indexed by key: this returns the word stored under ID 5

'and'

In [None]:
# This time we do the preprocessing ourselves instead of loading the already-encoded version used above
import tensorflow_datasets as tfds
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True) # the reviews are loaded as text, in the form of byte strings
train_size = info.splits["train"].num_examples


def preprocess(X_batch, y_batch):
  """Turn a batch of raw review strings into a dense, padded batch of word tokens.

  Args:
    X_batch: batch of reviews as byte strings.
    y_batch: labels of the batch; returned unchanged.

  Returns:
    A tuple (tokens, y_batch). `tokens` is a dense string tensor with one row
    per review, where shorter rows are filled with b"<pad>".
  """
  # Keep only the first 300 units of every review (substr counts bytes by default)
  X_batch = tf.strings.substr(X_batch, 0, 300)
  # Replace HTML line breaks ("<br>", "<br/>", "<br />") with a space. The tag has to be
  # matched as a whole; otherwise the next step strips only "<", "/" and ">" and leaves a
  # stray "br" token in every review that contained one.
  X_batch = tf.strings.regex_replace(X_batch, b"<br\\s*/?>", b" ")
  # Anything that is neither a letter nor an apostrophe becomes a space
  X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
  # Split on whitespace; reviews have different word counts, so the result is ragged
  X_batch = tf.strings.split(X_batch)
  # Convert the ragged tensor to a dense one, padding short reviews with b"<pad>"
  return X_batch.to_tensor(default_value=b"<pad>"), y_batch


[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteA07JGM/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteA07JGM/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteA07JGM/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
from collections import Counter

# Count how often every token (b"<pad>" included) occurs in the preprocessed training set
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
  # Flatten the batch row by row, so tokens are counted in the same order as a per-review loop
  vocabulary.update(X_batch.numpy().ravel().tolist())

In [None]:
# The ten most frequent tokens together with their counts
vocabulary.most_common(10)

[(b'<pad>', 205484),
 (b'the', 61137),
 (b'a', 38564),
 (b'of', 33983),
 (b'and', 33431),
 (b'to', 27707),
 (b'I', 27019),
 (b'is', 25719),
 (b'in', 18966),
 (b'this', 18490)]

In [None]:
# Keep only the vocab_size most frequent tokens, most frequent first, dropping the counts
vocab_size = 10000
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

In [None]:
truncated_vocabulary[:50] # first 50 entries: just the words, without their counts

[b'<pad>',
 b'the',
 b'a',
 b'of',
 b'and',
 b'to',
 b'I',
 b'is',
 b'in',
 b'this',
 b'it',
 b'br',
 b'was',
 b'movie',
 b'that',
 b'The',
 b'film',
 b'with',
 b'for',
 b'as',
 b'on',
 b'but',
 b'have',
 b'This',
 b'one',
 b'not',
 b'be',
 b'are',
 b'you',
 b'an',
 b'at',
 b'about',
 b'by',
 b'all',
 b'his',
 b'so',
 b'like',
 b'from',
 b'who',
 b'has',
 b'It',
 b'good',
 b'my',
 b'just',
 b'very',
 b'out',
 b'or',
 b'story',
 b'some',
 b'time']

In [None]:
# Next preprocessing step: replace each word with its ID.
# Words that are not in the vocabulary are assigned to one of 1000 out-of-vocabulary (oov) buckets.

words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype = tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids) # table initializer pairing every vocabulary word (b"<pad>", b"the", ...) with its index in truncated_vocabulary, starting at 0
num_oov_buckets = 1000

In [None]:
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets) # lookup table wrapping the initializer; it also takes care of words that fall into the oov buckets

In [None]:
# Sanity check: look up the IDs of three ordinary words and one made-up word that cannot be in the vocabulary
table.lookup(tf.constant([b"This movie was greaaat".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   23,    13,    12, 10745]])>

In [None]:
# Building the final training set:
#   1. batch the reviews
#   2. turn each review into a padded sequence of words with preprocess()
#   3. encode the words as IDs with encode_words(), which uses the lookup table `table`
#   4. prefetch

def encode_words(X_batch, y_batch):
  """Replace every word in X_batch with its ID from `table`; the labels pass through unchanged."""
  encoded_batch = table.lookup(X_batch)
  return encoded_batch, y_batch

# batch -> tokenize and pad -> words to IDs -> prefetch one batch ahead
train_set = (
    datasets["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

embed_size = 128

# Embedding -> two stacked GRU layers -> a single sigmoid unit for the binary label.
# The embedding table has one row per vocabulary word plus one per oov bucket.
model = keras.models.Sequential()
model.add(keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, input_shape=[None]))
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(keras.layers.GRU(128))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, epochs=5)




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Peek at a single batch of the final training set: a (word IDs, labels) pair
for x in train_set.take(1):
  print(x)

(<tf.Tensor: shape=(32, 62), dtype=int64, numpy=
array([[  23,   12,   29, ...,    0,    0,    0],
       [   6,   22,   71, ...,    0,    0,    0],
       [4099, 6881,    1, ...,    0,    0,    0],
       ...,
       [  23,   13,  119, ...,  332, 1047,    0],
       [1757, 4101,  452, ...,    0,    0,    0],
       [3365, 4392,    6, ...,    0,    0,    0]])>, <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0])>)


In [None]:
# Display the dataset metadata that tfds.load returned alongside the data (with_info=True)
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [None]:
# Print every tokenized, padded review from the first 10 preprocessed training batches
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess).take(10):
  for x in X_batch:
    print(x)

tf.Tensor(
[b'This' b'was' b'an' b'absolutely' b'terrible' b'movie' b"Don't" b'be'
 b'lured' b'in' b'by' b'Christopher' b'Walken' b'or' b'Michael'
 b'Ironside' b'Both' b'are' b'great' b'actors' b'but' b'this' b'must'
 b'simply' b'be' b'their' b'worst' b'role' b'in' b'history' b'Even'
 b'their' b'great' b'acting' b'could' b'not' b'redeem' b'this' b"movie's"
 b'ridiculous' b'storyline' b'This' b'movie' b'is' b'an' b'early'
 b'nineties' b'US' b'propaganda' b'pi' b'<pad>' b'<pad>' b'<pad>' b'<pad>'
 b'<pad>' b'<pad>' b'<pad>' b'<pad>' b'<pad>' b'<pad>' b'<pad>' b'<pad>'], shape=(62,), dtype=string)
tf.Tensor(
[b'I' b'have' b'been' b'known' b'to' b'fall' b'asleep' b'during' b'films'
 b'but' b'this' b'is' b'usually' b'due' b'to' b'a' b'combination' b'of'
 b'things' b'including' b'really' b'tired' b'being' b'warm' b'and'
 b'comfortable' b'on' b'the' b'sette' b'and' b'having' b'just' b'eaten'
 b'a' b'lot' b'However' b'on' b'this' b'occasion' b'I' b'fell' b'asleep'
 b'because' b'the' b'film' b'

In [None]:
# In more complex models the padding mask has to be computed and handed to the
# recurrent layers by hand, so that the padding positions (ID 0) are ignored.
# This cell relies on vocab_size, num_oov_buckets and embed_size from the earlier cells;
# running it on a fresh kernel without them raises a NameError.

K = keras.backend
inputs = keras.layers.Input(shape=[None])
# Boolean mask: True wherever the word ID is not 0, i.e. wherever the token is not padding
mask = keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)
z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)
# The second GRU returns only its last output, so the model produces one prediction per
# review (as the Sequential model above does) instead of one per time step.
z = keras.layers.GRU(128)(z, mask=mask)
outputs = keras.layers.Dense(1, activation="sigmoid")(z)
model = keras.Model(inputs=[inputs], outputs=[outputs])

NameError: ignored