In [71]:
import numpy as np
import gzip
import re
import json
import datetime
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.cluster import KMeans

import multiprocessing # number of threads

import matplotlib.pyplot as plt # plot
import matplotlib.gridspec as gridspec

from typing import Dict,Any,List

In [41]:
class SentimentAnalysis():
    """
    Sentiment-analysis helper built on a gzip-compressed JSON file of reviews.

    Every review is a dict with (at least) a 'text' entry, holding the list of tokens of the
    review, and a 'score' entry, holding its rating. The reviews are split, in file order,
    into a training part (exposed through `reviews`) and a held-out test part.

    Typical usage: `divideTrainTest()` -> `learn()` -> `evaluate()`.
    """

    def __init__(self, json_file, hidden_layer = (300,50), train_test_split: float = 0.7, random_state: int = None):
        """
        Load the reviews and split them in train/test.

        @param json_file:        Path to the gzip-compressed JSON file holding the list of reviews
        @param hidden_layer:     Hidden layer sizes handed to the MLPClassifier in `learn`
        @param train_test_split: Fraction (between 0 and 1) of the reviews kept for training
        @param random_state:     Seed handed to the MLPClassifier in `learn`
        @raise ValueError if `train_test_split` is not in the [0, 1] range
        """
        if not 0.0 <= train_test_split <= 1.0:
            raise ValueError(f"train_test_split must be in the [0, 1] range, got {train_test_split}")

        # the context manager closes the file handle as soon as the JSON has been parsed
        with gzip.open(json_file,'rb') as reviews_file:
            all_reviews=json.load(reviews_file)

        # split in train-test
        # (slicing with an explicit index; `reviews[:-len(test)]` would return an empty
        # training set when the test part is empty, as `[:-0]` is `[:0]`)
        split_index=int(len(all_reviews)*train_test_split)
        self._reviews=all_reviews[:split_index]
        self._test_reviews=all_reviews[split_index:]

        # save variables for the rest of the functions
        self._hidden_layer = hidden_layer
        self._random_state = random_state
        self._train_test_split = train_test_split

        # fitted by `makeBoW`; reused by `divideTrainTest` to encode the test reviews
        self._count_vectorizer = None
        self._tfidf_transformer = None

    @property
    def reviews(self) -> List[Dict[str,Any]]:
        """Training reviews (the test reviews are not included)."""
        return self._reviews

    @property
    def largest_review_size(self) -> int:
        """Number of tokens of the longest training review (0 if there are no training reviews)."""
        return max((len(sentence) for sentence in self.review_sentences), default=0)

    @property
    def review_sentences(self) -> List[List[str]]:
        """Token list ('text' entry) of every training review."""
        return [x['text'] for x in self._reviews]

    @staticmethod
    def _documents(reviews):
        """Join the tokens of every review into a single space-separated string."""
        return [" ".join(x['text']) for x in reviews]

    @staticmethod
    def _scores(reviews):
        """Put the scores of the reviews in an array."""
        return np.array([x['score'] for x in reviews])

    def makeBoW(self):
        """
        Build the TF-IDF bag-of-words matrix (`self.BoW`) and the score array (`self.scores`)
        of the training reviews.
        """
        # the fitted vectorizer/transformer are kept so the test reviews can later be encoded
        # with the vocabulary and the IDF weights learnt from the training reviews
        self._count_vectorizer=CountVectorizer()
        counts=self._count_vectorizer.fit_transform(self._documents(self._reviews))
        self._tfidf_transformer=TfidfTransformer()
        self.BoW=self._tfidf_transformer.fit_transform(counts)
        # put the scores in an array
        self.scores=self._scores(self._reviews)

    def divideTrainTest(self):
        """
        Prepare the features and labels read by `learn` and `evaluate`.

        Sets `BoW_train`/`scores_train` from the training reviews and `BoW_test`/`scores_test`
        from the test reviews. The test reviews are only transformed (never fitted), so no
        information of the test set leaks into the vocabulary or the IDF weights.
        """
        if self._count_vectorizer is None or self._tfidf_transformer is None:
            self.makeBoW()

        self.BoW_train=self.BoW
        self.scores_train=self.scores

        test_counts=self._count_vectorizer.transform(self._documents(self._test_reviews))
        self.BoW_test=self._tfidf_transformer.transform(test_counts)
        self.scores_test=self._scores(self._test_reviews)

    def learn(self):
        """
        Fit the MLP classifier on the training features; the fitted model is stored in `self.model`.
        `divideTrainTest` has to be called first.
        """
        learner=MLPClassifier(hidden_layer_sizes=self._hidden_layer, solver='lbfgs', alpha=1e-5, random_state=self._random_state)
        self.model=learner.fit(self.BoW_train,self.scores_train)

    def evaluate(self):
        """
        Print the RMSE, the confusion matrix and the classification report of the model on the
        test set. `divideTrainTest` and `learn` have to be called first.

        @return RMSE of the predictions on the test set
        """
        # predict once and reuse the result for the three reports
        predictions=self.model.predict(self.BoW_test)

        # root of the *mean* squared error (a plain sum would grow with the test set size)
        rmse=np.sqrt(np.mean(np.power(self.scores_test-predictions,2)))
        print(rmse)

        rounded_predictions=np.round(predictions)
        print(confusion_matrix(self.scores_test,rounded_predictions))
        print(classification_report(self.scores_test,rounded_predictions))
        return rmse

    # the gensim annotations below are quoted so they are not evaluated at definition time
    def train_word2vec_model(self, size: int, window: int = 5, min_count: int = 5, workers: int = -1) -> "word2vec.Word2Vec":
        """
        Get a word2vec model according to the input data (the training reviews).
        @ref https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
        
        @param size:      Dimensionality of the word vectors
        @param window:    Maximum distance between the current and predicted word
        @param min_count: Ignores all words with total frequency lower than this
        @param workers:   Use these many worker threads to train the model; -1 to use all
        @return Word2Vec model
        
        """

        sentences=self.review_sentences

        if workers == -1:
            workers = SentimentAnalysis.get_num_max_workers()
            print(f"[v] Using {workers} workers for the Word2Vec operation...")

        return word2vec.Word2Vec(sentences, vector_size=size, window=window,
                                       min_count=min_count, workers=workers)

    @staticmethod
    def get_num_max_workers() -> int:
        """Number of CPUs reported by the system."""
        return multiprocessing.cpu_count()

    @staticmethod
    def save_word2vec(model: "word2vec.Word2Vec", file_name: str):
        """Save `model` to `file_name`."""
        model.save(file_name)

    @staticmethod
    def load_word2vec(file_name: str) -> "word2vec.Word2Vec":
        """Load a model previously stored with `save_word2vec`."""
        return word2vec.Word2Vec.load(file_name)

# global variables
# dimensionality of the word vectors requested when training the Word2Vec model
WORD2VEC_VECTOR_SIZE = 300

# create an instance of the class (the reviews are loaded and split on construction)
SaHandler = SentimentAnalysis(
    'data/Watches_withstopwords.json.gz',
    random_state=1,
)

In [4]:
# show the first training review as "<score>: <tokens joined by spaces>"
print(
    f"{SaHandler.reviews[0]['score']}: {' '.join(SaHandler.reviews[0]['text'])}"
)

4.0: having owned two previous g shocks in my life including the first series in 1984 i ve long appreciated their quality the main reason i stopped wearing them was simply because the resin straps would break and having worn them for 4 years i grew weary of them however having burned through many fashion watches in the last 10 years i ve been disappointed to spend only to get 1 or 2 years worth of usage so i m back to g shock i think this model g1710d 7av represents a nice blend between good looks and practical durability which most guys really want the face is smaller on my wrist than i expected from a g shock but i think it s still classy for the office the side buttons are hidden in gray plastic which you can t see in picture but the rest of the watch is metal you might find a classier analog face g shock in the 200 300 but this one is a better value the only reason i didn t give it 5 stars is because the led isn t backlight it s an amber light that comes around the inside bevel so 

In [11]:
# length of the longest training review's `text` entry (see `SentimentAnalysis.largest_review_size`)
max_input_size = SaHandler.largest_review_size
print(max_input_size)

4187


Here we'll train a Word2Vec model using the training data.
Word2Vec learns word associations from a large corpus of text. It is able to detect synonymous words,[ref](https://en.wikipedia.org/w/index.php?title=Word2vec&oldid=1143734439) because each word is represented by a vector that encodes the contexts in which that word appears.

In [29]:
# train the embeddings on the training reviews (worker count is chosen by the handler)
word2vec_model = SaHandler.train_word2vec_model(size=WORD2VEC_VECTOR_SIZE)
# persist the trained model so it can be reloaded with `SentimentAnalysis.load_word2vec`
SentimentAnalysis.save_word2vec(model=word2vec_model, file_name='word2vec.bin')
# keyed vectors: word <-> vector lookups used by the cells below
word_vectors = word2vec_model.wv

[v] Using 32 workers for the Word2Vec operation...


What if we visualize the generated Word2Vec model?
To do so we need some kind of clustering algorithm; we'll use k-means. I've also tried DBSCAN and OPTICS, two other methods listed in the [scikit-learn clustering overview](https://scikit-learn.org/stable/modules/clustering.html#overview-of-clustering-methods), but the word vectors don't seem to be separated enough for a density-based algorithm: both of them report one single group.

Some information needed to understand what we'll do next:

- `Word2Vec#wv.vectors` is a matrix of `<number of vocabulary words> rows x <vector size> columns`, holding (for each vocabulary word) its vector
- `Word2Vec#wv.index_to_key` is a list with all the vocabulary words, in the same order as the rows of `wv.vectors`. This is useful to relate a vector to an actual word

Also, the section ["What can I do with word vectors?" of the gensim documentation](https://radimrehurek.com/gensim/models/keyedvectors.html#what-can-i-do-with-word-vectors) is a very interesting read on word extrapolation, but it won't be discussed in this Jupyter Notebook.

In [50]:
num_clusters = 10

# Initialize a k-means object and use it to extract centroids.
# `random_state` is fixed (same seed as the SentimentAnalysis handler) so that the cluster
# numbering printed below is reproducible for a given word2vec model.
kmeans_clustering = KMeans( n_clusters = num_clusters, n_init='auto', random_state=1 )
# `fit_predict` will force each of the `word_vectors.vectors` vectors into one of the `num_clusters` clusters
idx = kmeans_clustering.fit_predict( word_vectors.vectors )

# Create a Word / Index dictionary, mapping each vocabulary word to a cluster number
# (vocabulary order is preserved)
word_centroid_map = dict(zip( word_vectors.index_to_key, idx ))

# Group the vocabulary by cluster in a single pass, instead of rescanning it once per cluster
words_by_cluster = {cluster: [] for cluster in range(num_clusters)}
for word, word_cluster in word_centroid_map.items():
    # `int()` turns the numpy integer label into a plain dictionary key
    words_by_cluster[int(word_cluster)].append(word)

# For each cluster, print the first words that belong to it
for cluster, words in words_by_cluster.items():
    print(f"Cluster {cluster}: {words[:10]}")

(14340,)
Cluster 0: ['for', 'of', 'in', 'my', 'with', 'at', 'has', 'just', 'when', 'an']
Cluster 1: ['to', 'on', 'band', 'or', 'out', 'up', 'get', 'wrist', 'off', 'little']
Cluster 2: ['the', 'are', 'which', 'face', 'also', 'light', 'read', 'there', 'see', 'hand']
Cluster 3: ['you', 'time', 'use', 'day', 'your', 'date', 'work', 'set', 'second', 'need']
Cluster 4: ['about', 'after', 'now', 'two', 'over', '2', '5', 'few', '3', 'many']
Cluster 5: ['it', 'a', 'and', 'watch', 'is', 'this', 's', 'very', 'great', 'like']
Cluster 6: ['i', 'that', 'but', 'have', 'not', 'as', 't', 'one', 'so', 'be']
Cluster 7: ['was', 'had', 'me', 'bought', 'years', 'amazon', 'battery', 'been', 'he', 'first']
Cluster 8: ['can', 'would', 'will', 'don', 'does', 'could', 'did', 'doesn', 'should', 'didn']
Cluster 9: ['compliments', 'comments']


In [None]:
# `learn()` and `evaluate()` read BoW_train/scores_train and BoW_test/scores_test; this call is expected to set them
SaHandler.divideTrainTest()

In [None]:
# fit the MLP classifier on the training features (the fitted model is kept in `SaHandler.model`)
SaHandler.learn()

In [None]:
# print the error metric, the confusion matrix and the classification report for the test set
SaHandler.evaluate()