In [1]:
import numpy as np
import gzip
import re
import json
import datetime
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.cluster import KMeans

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Activation, TimeDistributed

import multiprocessing # number of threads

import matplotlib.pyplot as plt # plot
import matplotlib.gridspec as gridspec

from typing import Dict,Any,List,Generator,Tuple

2023-04-03 18:59:19.593298: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class SentimentAnalysis():
    def __init__(self, json_file, hidden_layer = (300,50), train_test_split: float = 0.7, random_state: int = None):
        self._reviews=json.load(gzip.open(json_file,'rb'))
        
        # split in train-test
        self._test_reviews=self._reviews[int(len(self._reviews)*train_test_split):]
        self._reviews=self._reviews[:-len(self._test_reviews)]
        
        # save variables for the rest of the functions
        self._hidden_layer = hidden_layer
        self._random_state = random_state
        self._train_test_split = train_test_split
    
    @property
    def reviews(self) -> List[Dict[str,Any]]:
        return self._reviews
    
    def padded_reviews(self, vectorizer: word2vec.Word2Vec, batch_size: int = -1, iterate_forever: bool = False) -> Generator[None,Tuple[np.ndarray,np.ndarray],None]:
        """
        Returns all the train reviews in a matrix, padding the sentences so all have the same size
        @param vectorizer      Word2Vec object to convert from word to vector
        @param batch_size      Number of reviews each returned ndarray has. Use -1 if use all the reviews.
        @param iterate_forever If True, it will iterate indefinitely
        @return Generator of ndarray of size (<batch_size>,<max words in review>,<Word2Vec return vector lenght>),
                and its expected value (<batch_size>,5)
        """
        vector_size = vectorizer.wv[0].shape
        reviews = [x['text'] for x in self._reviews]
        y_reviews = [int(x['score']) for x in self._reviews]
        largest_review_size = self.largest_review_size
        if batch_size == -1: batch_size = len(reviews)
        
        offset = 0
        while True:
            reviews_slice = reviews[offset*batch_size:(offset+1)*batch_size]
            y_reviews_slice = y_reviews[offset*batch_size:(offset+1)*batch_size]
            r = np.zeros(shape=( len(reviews_slice),largest_review_size,vector_size[-1] ))
            y = np.zeros(shape=( len(reviews_slice),5 ))
            
            for review_index,review in enumerate(reviews_slice):
                review_offset = largest_review_size-len(review) # will help with the padding
                for word_index,word in enumerate(review):
                    if word in vectorizer.wv: # if not, leave it as 0
                        r[review_index,review_offset+word_index,:] = vectorizer.wv[word]
                y[review_index,y_reviews_slice[review_index]-1] = 1 # the index is from 1 to 5
            
            yield (r,y)
            
            offset += 1
            if offset >= len(reviews):
                offset = 0
                
                if not iterate_forever:
                    return
    
    @property
    def largest_review_size(self) -> int:
        # @ref https://stackoverflow.com/a/1582670/9178470
        return len(max(self.review_sentences, key=len))
        
    @property
    def review_sentences(self) -> List[str]:
        return [x['text'] for x in self._reviews]
    
    @property
    def num_reviews(self) -> int:
        return len(self._reviews)

    def train_word2vec_model(self, size: int, window: int = 5, min_count: int = 5, workers: int = -1) -> word2vec.Word2Vec:
        """
        Get a word2vec model according to the input data.
        @ref https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
        
        @param size:      Dimensionality of the word vectors
        @param window:    Maximum distance between the current and predicted word
        @param min_count: Ignores all words with total frequency lower than this
        @param workers:   Use these many worker threads to train the model; -1 to use all
        @return Word2Vec model
        
        """
        
        sentences=self.review_sentences
        
        if workers == -1:
            workers = SentimentAnalysis.get_num_max_workers()
            print(f"[v] Using {workers} workers for the Word2Vec operation...")
            
        return word2vec.Word2Vec(sentences, vector_size=size, window=window,
                                       min_count=min_count, workers=workers)
    
    @staticmethod
    def get_num_max_workers() -> int:
        return multiprocessing.cpu_count()
    
    @staticmethod
    def save_word2vec(model: word2vec.Word2Vec, file_name: str):
        model.save(file_name)

    @staticmethod
    def load_word2vec(file_name: str) -> word2vec.Word2Vec:
        return word2vec.Word2Vec.load(file_name)

# global variables
WORD2VEC_VECTOR_SIZE = 300
INPUT_HIDDEN_DIM = 52 # must be a power of 2
DENSE_HIDDEN_DIM = (200,50)
BATCH_SIZE = 128
EPOCHS = 3

# create an inscance of the class
SaHandler=SentimentAnalysis('data/Watches_withstopwords.json.gz', random_state=1)

In [3]:
print(f"{SaHandler.reviews[0]['score']}: {' '.join(SaHandler.reviews[0]['text'])}")

4.0: having owned two previous g shocks in my life including the first series in 1984 i ve long appreciated their quality the main reason i stopped wearing them was simply because the resin straps would break and having worn them for 4 years i grew weary of them however having burned through many fashion watches in the last 10 years i ve been disappointed to spend only to get 1 or 2 years worth of usage so i m back to g shock i think this model g1710d 7av represents a nice blend between good looks and practical durability which most guys really want the face is smaller on my wrist than i expected from a g shock but i think it s still classy for the office the side buttons are hidden in gray plastic which you can t see in picture but the rest of the watch is metal you might find a classier analog face g shock in the 200 300 but this one is a better value the only reason i didn t give it 5 stars is because the led isn t backlight it s an amber light that comes around the inside bevel so 

In [4]:
max_input_size = SaHandler.largest_review_size
print(max_input_size)

avg_lenght = 0
for review in SaHandler.review_sentences:
    avg_lenght += len(review)
avg_lenght = avg_lenght / SaHandler.num_reviews
print(avg_lenght)

4187
85.73769566762105


Here we'll train a Word2Vec model using the train data.
Word2Vec will learn word associations from a large corpus of text. It will be able to detect synonymous words,[ref](https://en.wikipedia.org/w/index.php?title=Word2vec&oldid=1143734439) as words are represented by vectors that states the context of that word.

In [5]:
word2vec_model = SaHandler.train_word2vec_model(WORD2VEC_VECTOR_SIZE)
SentimentAnalysis.save_word2vec(word2vec_model, 'word2vec.bin')
word_vectors = word2vec_model.wv

[v] Using 32 workers for the Word2Vec operation...


What if we visualize the generated word2vec model?
In order to do it we'll need some kind of clustering algorithm; we'll use k-Means. I've also tried DBSCAN and OPTICS, as they are other methods found on [sk-learn](https://scikit-learn.org/stable/modules/clustering.html#overview-of-clustering-methods), but it seems like there's no relevant separation to use a density-based algorithm, as they both report one single group.

Some information needed to understand what we'll do next:
- `Word2Vec#wv.vectors` will return a matrix of `<number of input words> rows x <vector size> columns`, representing (for each input word) its vector
- `Word2Vec#wv.index_to_key` will return a list with all the input words. This will be useful in order to relate a vector to an actual word

Also, the section ["What can I do with word vectors?", on gensim wiki](https://radimrehurek.com/gensim/models/keyedvectors.html#what-can-i-do-with-word-vectors) is very interesting to see the word extrapolation, but it won't be discussed in this Jupyter Notebook.

In [6]:
num_clusters = 10

# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans( n_clusters = num_clusters, n_init='auto' )
# `fit_predict` will force each of the `word_vectors.vectors` vectors into one of the 10 clusters
idx = kmeans_clustering.fit_predict( word_vectors.vectors )

# Create a Word / Index dictionary, mapping each vocabulary word to a cluster number
word_centroid_map = dict(list(zip( word_vectors.index_to_key, idx )))

# For each cluster
for cluster in range(num_clusters):
    # Find all of the words for that cluster number, and print them out
    words = []
    for key, value in zip(list(word_centroid_map.keys()),list(word_centroid_map.values())):
        if (value == cluster):
            words.append(key)
    
    print(f"Cluster {cluster}: {words[:10]}")

Cluster 0: ['and', 'for', 'of', 'in', 'with', 'on', 'at', 'has', 'are', 'an']
Cluster 1: ['can', 'would', 'will', 'don', 'does', 'could', 'did', 'doesn', 'should', 'didn']
Cluster 2: ['about', 'after', 'two', 'over', '2', '5', 'few', '3', 'many', '1']
Cluster 3: ['from', 'they', 'them', 'casio', 'seiko', 'these', 'invicta', 'less', 'their', 'service']
Cluster 4: ['the', 'band', 'than', 'which', 'face', 'also', 'light', 'little', 'there', 'small']
Cluster 5: ['to', 'have', 'you', 't', 'so', 'be', 'if', 'or', 'just', 'when']
Cluster 6: ['great', 'like', 'good', 'nice', 'well', 'easy', 'perfect', 'better', 'beautiful', 'happy']
Cluster 7: ['it', 'a', 'watch', 'is', 'this', 'that', 'but', 'not', 's', 'very']
Cluster 8: ['time', 'only', 'day', 'use', 'battery', 'date', 'second', 'set', 'every', 'hand']
Cluster 9: ['i', 'my', 'was', 'one', 'had', 'me', 'bought', 'years', 'am', 'amazon']


In [None]:
# @ref https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, sa: SentimentAnalysis, word2vec_model: word2vec.Word2Vec, batch_size: int = 32):
        self._vector_size = word2vec_model.wv[0].shape
        self._reviews = [x['text'] for x in sa._reviews]
        self._y_reviews = [int(x['score']) for x in sa._reviews]
        self._largest_review_size = sa.largest_review_size
        self._vectorizer = word2vec_model
        
        self._batch_size = batch_size
        self._num_batches_per_epoch = -(sa.num_reviews // -batch_size) # ceil; @ref https://stackoverflow.com/a/17511341/9178470

    def __len__(self) -> int:
        return self._num_batches_per_epoch

    def __getitem__(self, idx: int) -> Tuple[np.ndarray,np.ndarray]:
        reviews_slice = self._reviews[idx*self._batch_size:(idx+1)*self._batch_size]
        y_reviews_slice = self._y_reviews[idx*self._batch_size:(idx+1)*self._batch_size]
        r = np.zeros(shape=( len(reviews_slice),self._largest_review_size,self._vector_size[-1] ))
        y = np.zeros(shape=( len(reviews_slice),5 ))

        for review_index,review in enumerate(reviews_slice):
            review_offset = self._largest_review_size-len(review) # will help with the padding
            for word_index,word in enumerate(review):
                if word in self._vectorizer.wv: # if not, leave it as 0
                    r[review_index,review_offset+word_index,:] = self._vectorizer.wv[word]
            y[review_index,y_reviews_slice[review_index]-1] = 1 # the index is from 1 to 5

        return (r,y)

inputs = tf.keras.Input(shape=(None, SaHandler.largest_review_size, WORD2VEC_VECTOR_SIZE))

model = Sequential()
# Assuming that your input size (X.shape) is n X t X f where
# n:Batch size
# t: sequence length/time-steps/no:of unrollings)
# f: Nºof feature per time-step
# Note: input_shape=(t,f)
# @ref https://stackoverflow.com/a/62994263/9178470
vector_processor = Bidirectional(LSTM(INPUT_HIDDEN_DIM // 2, return_sequences=False),
                        input_shape=(inputs.shape[-2], inputs.shape[-1]),
                        merge_mode='concat') # concat will return n X t X <HIDDEN_DIM/2>*2 # TODO why *2 and not *<nº of LSTM>?
model.add(vector_processor)
for hidden_dense in DENSE_HIDDEN_DIM:
    model.add(Dense(hidden_dense))
model.add(Dense(5, activation='softmax'))
#model.add(Activation('softmax'))
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

training_generator = DataGenerator(SaHandler, word2vec_model, BATCH_SIZE)
model.fit(training_generator, epochs=EPOCHS, batch_size=BATCH_SIZE)

print(training_generator[0][0].shape)
print(training_generator[0][1].shape)
for i,layer in enumerate(model.layers):
    print(f"Layer {i}: {layer.input_shape}, {layer.output_shape}")


As it seems like the LSTM didn't suceed (I've tried multiple combinations of size, and none was better than 0.6) we'll try [neel aproach, on How to get vector for a sentence from the word2vec of tokens in sentence](https://stackoverflow.com/a/31738627/9178470), by making the average of Word2Vec vectors with TF-IDF.

In [8]:
count_vectorizer=CountVectorizer()
counts=count_vectorizer.fit_transform([' '.join(x) for x in SaHandler.review_sentences])
print(count_vectorizer.get_feature_names_out())

transformer = TfidfTransformer()
BoW=transformer.fit_transform(counts)
BoW_train=BoW[:len(SaHandler.reviews),:]

print(BoW_train.shape) # we'll have a vector of size <nº train data> x <nº of different words>, with the value representing the weight of that word

class TfidDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, sa: SentimentAnalysis, word2vec_model: word2vec.Word2Vec, count_vectorizer: CountVectorizer, BoW: np.ndarray, batch_size: int = 32):
        self._vector_size = word2vec_model.wv[0].shape
        self._reviews = [x['text'] for x in sa.reviews]
        self._y_reviews = [int(x['score']) for x in sa.reviews]
        self._vectorizer = word2vec_model
        self._count_vectorizer = count_vectorizer
        self._tfid = BoW
        
        self._batch_size = batch_size
        self._num_batches_per_epoch = -(sa.num_reviews // -batch_size) # ceil; @ref https://stackoverflow.com/a/17511341/9178470
        
        self._word_to_index = {}
        for index, word in enumerate(self._count_vectorizer.get_feature_names_out()):
            self._word_to_index[word] = index

    def _get_count_vectorizer_index(self, searching: str) -> int:
        r = np.where(self._count_vectorizer.get_feature_names_out() == searching)
        if len(r[0]) == 0:
            return None
        return r[0][0]
        
    def __len__(self) -> int:
        return self._num_batches_per_epoch

    def __getitem__(self, idx: int) -> Tuple[np.ndarray,np.ndarray]:
        offset = idx*self._batch_size
        reviews_slice = self._reviews[offset:offset+self._batch_size]
        y_reviews_slice = self._y_reviews[idx*self._batch_size:(idx+1)*self._batch_size]
        r = np.zeros(shape=( len(reviews_slice),self._vector_size[-1] ))
        y = np.zeros(shape=( len(reviews_slice),5 ))

        for review_index,review in enumerate(reviews_slice):
            BoW_train_index = review_index + offset

            feature_index = self._tfid[BoW_train_index,:].nonzero()[1] # @ref https://stackoverflow.com/a/38770335/9178470

            for word in review:
                # the word must be inside the Tfid & word2vec
                if not word in self._word_to_index: continue
                if not word in self._vectorizer.wv: continue

                r[review_index,:] += self._tfid[BoW_train_index, self._word_to_index[word]]*self._vectorizer.wv[word]
            y[review_index,y_reviews_slice[review_index]-1] = 1 # the index is from 1 to 5

        return (r,y)
    
training_generator = TfidDataGenerator(SaHandler, word2vec_model, count_vectorizer, BoW_train, BATCH_SIZE)
print(str(training_generator.__getitem__(0)[0].shape)) # now we just have a BATCH_SIZE x WORD2VEC_VECTOR_SIZE input

# -- building the DNN --
model = Sequential()
for hidden_dense in DENSE_HIDDEN_DIM:
    model.add(Dense(hidden_dense))
model.add(Dense(5, activation='softmax'))
#model.add(Activation('softmax'))

model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(training_generator, epochs=EPOCHS, batch_size=BATCH_SIZE)

['00' '000' '0000' ... 'zwitzerland' 'zxc' 'zzzffth']
(47849, 42034)
(128, 400)
Epoch 1/3


2023-04-03 18:59:53.925162: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-04-03 18:59:54.889011: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x559ad84cad10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-04-03 18:59:54.889056: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.
2023-04-03 18:59:55.015522: W tensorflow/compiler/xla/stream_executor/gpu/asm_compiler.cc:234] Falling back to the CUDA driver for PTX compilation; ptxas does not support CC 8.6
2023-04-03 18:59:55.015549: W tensorflow/compiler/xla/stream_executor/gpu/asm_compiler.cc:237] Used ptxas at ptxas
2023-04-03 18:59:55.015587: W tensorflow/compiler/xla/service/gpu/nvptx_compiler.

  1/374 [..............................] - ETA: 13:09 - loss: 2.2876 - accuracy: 0.2109


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb32afa1300>