In [11]:
import numpy as np
import gzip
import re
import json
import datetime
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.cluster import KMeans

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Activation, TimeDistributed

import multiprocessing # number of threads

import matplotlib.pyplot as plt # plot
import matplotlib.gridspec as gridspec

from typing import Dict,Any,List,Generator

In [16]:
class SentimentAnalysis():
    def __init__(self, json_file, hidden_layer = (300,50), train_test_split: float = 0.7, random_state: int = None):
        self._reviews=json.load(gzip.open(json_file,'rb'))
        
        # split in train-test
        self._test_reviews=self._reviews[int(len(self._reviews)*train_test_split):]
        self._reviews=self._reviews[:-len(self._test_reviews)]
        
        # save variables for the rest of the functions
        self._hidden_layer = hidden_layer
        self._random_state = random_state
        self._train_test_split = train_test_split
    
    @property
    def reviews(self) -> List[Dict[str,Any]]:
        return self._reviews
    
    def padded_reviews(self, vectorizer: word2vec.Word2Vec, batch_size: int = -1, iterate_forever: bool = False) -> Generator[None,Tuple[np.ndarray,np.ndarray],None]:
        """
        Returns all the train reviews in a matrix, padding the sentences so all have the same size
        @param vectorizer      Word2Vec object to convert from word to vector
        @param batch_size      Number of reviews each returned ndarray has. Use -1 if use all the reviews.
        @param iterate_forever If True, it will iterate indefinitely
        @return Generator of ndarray of size (<batch_size>,<max words in review>,<Word2Vec return vector lenght>).
        """
        vector_size = vectorizer.wv[0].shape
        reviews = self.review_sentences
        largest_review_size = self.largest_review_size
        if batch_size == -1: batch_size = len(reviews)
        
        offset = 0
        while True:
            reviews_slice = reviews[offset*batch_size:(offset+1)*batch_size]
            r = np.zeros(shape=( len(reviews_slice),largest_review_size,vector_size[-1] ))
            print(r.shape)
            for review_index,review in enumerate(reviews_slice):
                review_offset = largest_review_size-len(review) # will help with the padding
                for word_index,word in enumerate(review):
                    if word in vectorizer.wv: # if not, leave it as 0
                        r[review_index,review_offset+word_index,:] = vectorizer.wv[word]
            
            yield r
            
            offset += 1
            if offset >= len(reviews):
                offset = 0
                
                if not iterate_forever:
                    return
    
    @property
    def largest_review_size(self) -> int:
        # @ref https://stackoverflow.com/a/1582670/9178470
        return len(max(self.review_sentences, key=len))
        
    @property
    def review_sentences(self) -> List[str]:
        return [x['text'] for x in self._reviews]
    
    @property
    def num_reviews(self) -> int:
        return len(self._reviews)

    def train_word2vec_model(self, size: int, window: int = 5, min_count: int = 5, workers: int = -1) -> word2vec.Word2Vec:
        """
        Get a word2vec model according to the input data.
        @ref https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
        
        @param size:      Dimensionality of the word vectors
        @param window:    Maximum distance between the current and predicted word
        @param min_count: Ignores all words with total frequency lower than this
        @param workers:   Use these many worker threads to train the model; -1 to use all
        @return Word2Vec model
        
        """
        
        sentences=self.review_sentences
        
        if workers == -1:
            workers = SentimentAnalysis.get_num_max_workers()
            print(f"[v] Using {workers} workers for the Word2Vec operation...")
            
        return word2vec.Word2Vec(sentences, vector_size=size, window=window,
                                       min_count=min_count, workers=workers)
    
    @staticmethod
    def get_num_max_workers() -> int:
        return multiprocessing.cpu_count()
    
    @staticmethod
    def save_word2vec(model: word2vec.Word2Vec, file_name: str):
        model.save(file_name)

    @staticmethod
    def load_word2vec(file_name: str) -> word2vec.Word2Vec:
        return word2vec.Word2Vec.load(file_name)

# global variables
WORD2VEC_VECTOR_SIZE = 300
INPUT_HIDDEN_DIM = 52 # must be a power of 2

# create an inscance of the class
SaHandler=SentimentAnalysis('data/Watches_withstopwords.json.gz', random_state=1)

In [3]:
print(f"{SaHandler.reviews[0]['score']}: {' '.join(SaHandler.reviews[0]['text'])}")

4.0: having owned two previous g shocks in my life including the first series in 1984 i ve long appreciated their quality the main reason i stopped wearing them was simply because the resin straps would break and having worn them for 4 years i grew weary of them however having burned through many fashion watches in the last 10 years i ve been disappointed to spend only to get 1 or 2 years worth of usage so i m back to g shock i think this model g1710d 7av represents a nice blend between good looks and practical durability which most guys really want the face is smaller on my wrist than i expected from a g shock but i think it s still classy for the office the side buttons are hidden in gray plastic which you can t see in picture but the rest of the watch is metal you might find a classier analog face g shock in the 200 300 but this one is a better value the only reason i didn t give it 5 stars is because the led isn t backlight it s an amber light that comes around the inside bevel so 

In [4]:
max_input_size = SaHandler.largest_review_size
print(max_input_size)

4187


Here we'll train a Word2Vec model using the train data.
Word2Vec will learn word associations from a large corpus of text. It will be able to detect synonymous words,[ref](https://en.wikipedia.org/w/index.php?title=Word2vec&oldid=1143734439) as words are represented by vectors that states the context of that word.

In [5]:
word2vec_model = SaHandler.train_word2vec_model(WORD2VEC_VECTOR_SIZE)
SentimentAnalysis.save_word2vec(word2vec_model, 'word2vec.bin')
word_vectors = word2vec_model.wv

[v] Using 32 workers for the Word2Vec operation...


What if we visualize the generated word2vec model?
In order to do it we'll need some kind of clustering algorithm; we'll use k-Means. I've also tried DBSCAN and OPTICS, as they are other methods found on [sk-learn](https://scikit-learn.org/stable/modules/clustering.html#overview-of-clustering-methods), but it seems like there's no relevant separation to use a density-based algorithm, as they both report one single group.

Some information needed to understand what we'll do next:
- `Word2Vec#wv.vectors` will return a matrix of `<number of input words> rows x <vector size> columns`, representing (for each input word) its vector
- `Word2Vec#wv.index_to_key` will return a list with all the input words. This will be useful in order to relate a vector to an actual word
Also, the section ["What can I do with word vectors?", on gensim wiki](https://radimrehurek.com/gensim/models/keyedvectors.html#what-can-i-do-with-word-vectors) is very interesting to see the word extrapolation, but it won't be discussed in this Jupyter Notebook.

In [6]:
num_clusters = 10

# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans( n_clusters = num_clusters, n_init='auto' )
# `fit_predict` will force each of the `word_vectors.vectors` vectors into one of the 10 clusters
idx = kmeans_clustering.fit_predict( word_vectors.vectors )

# Create a Word / Index dictionary, mapping each vocabulary word to a cluster number
word_centroid_map = dict(list(zip( word_vectors.index_to_key, idx )))

# For each cluster
for cluster in range(num_clusters):
    # Find all of the words for that cluster number, and print them out
    words = []
    for key, value in zip(list(word_centroid_map.keys()),list(word_centroid_map.values())):
        if (value == cluster):
            words.append(key)
    
    print(f"Cluster {cluster}: {words[:10]}")

Cluster 0: ['the', 'that', 'band', 'which', 'face', 'wrist', 'little', 'small', 'strap', 'case']
Cluster 1: ['on', 'or', 'can', 'when', 'out', 'up', 'off', 'even', 'your', 'never']
Cluster 2: ['time', 'only', 'day', 'light', 'there', 'date', 'second', 'set', 'hand', 'hands']
Cluster 3: ['it', 'and', 'watch', 'is', 'this', 'but', 'not', 's', 'very', 'as']
Cluster 4: ['about', 'after', 'now', 'two', 'over', '2', '5', 'few', '3', 'many']
Cluster 5: ['battery', 'batteries']
Cluster 6: ['i', 'was', 'had', 'me', 'bought', 'years', 'am', 'amazon', 'm', 'been']
Cluster 7: ['one', 'watches', 'than', 'more', 'they', 'other', 've', 'casio', 'timex', 'same']
Cluster 8: ['have', 'you', 't', 'so', 'be', 'if', 'get', 'wear', 'use', 'do']
Cluster 9: ['a', 'to', 'for', 'of', 'in', 'my', 'with', 'at', 'has', 'are']


In [17]:
# TODO Average of Word2Vec vectors with TF-IDF : this is one of the best approach which I will recommend. Just take the word vectors and multiply it with their TF-IDF scores. Just take the average and it will represent your sentence vector.
# @ref https://stackoverflow.com/a/31738627/9178470

inputs = tf.keras.Input(shape=(None, 2, 300))

model = Sequential()
# Assuming that your input size (X.shape) is n X t X f where
# n:Batch size
# t: sequence length/time-steps/no:of unrollings)
# f: Nºof feature per time-step
# Note: input_shape=(t,f)
# @ref https://stackoverflow.com/a/62994263/9178470
vector_processor = Bidirectional(LSTM(INPUT_HIDDEN_DIM // 2, return_sequences=False),
                        input_shape=(inputs.shape[-2], inputs.shape[-1]),
                        merge_mode='concat') # concat will return n X t X <HIDDEN_DIM/2>*2 # TODO why *2 and not *<nº of LSTM>?
model.add(vector_processor)
model.add(Dense(5, activation='softmax'))
#model.add(Activation('softmax'))
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

review_generator = SaHandler.padded_reviews(word2vec_model, 30)
train = next(review_generator)
X = train[0]
y = train[1]
print(X.shape)
print(y.shape)
model.fit(X, y, epochs=1, batch_size=32)
#model(X)

for i,layer in enumerate(model.layers):
    print(f"Layer {i}: {layer.input_shape}, {layer.output_shape}")


(30, 4187, 300)
[[[ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  ...
  [-0.13232827 -2.62986159 -0.36190996 ... -0.45965153 -1.26568198
    1.29691005]
  [-0.23795742 -0.44820398  0.56489909 ...  0.51551569 -0.61315298
    0.22358017]
  [-0.8630321  -0.01265488 -0.0185978  ...  0.09963396 -1.32533157
   -0.682262  ]]

 [[ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  ...
  [ 0.22711904 -0.1230227  -1.40985775 ... -1.84888208 -0.56577241
   -0.44517368]
  [ 1.72474003 -1.50331306  0.00616959 ... -0.04561142 -0.83204603
   -0.83100051]
  [-0.07290153 -0.98932672 -1.06557715 ... -0.29010141 -0.02782034
    1.73782802]]

 [[ 0.          0.  