In [21]:
import numpy as np
import gzip
import re
import json
import datetime
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report

import multiprocessing # number of threads

from typing import Dict,Any,List

In [37]:
class SentimentAnalysis():
    """
    Sentiment-analysis helper built around a gzip-compressed JSON file of reviews.

    Every review is a dict; the methods below read its 'text' entry (a sequence
    of word tokens, joined with spaces in makeBoW), its 'score' and its 'year'.

    Expected call order for the classifier: makeBoW -> divideTrainTest -> learn
    -> evaluate. The Word2Vec helpers (train_word2vec_model, clusterize, ...) are
    independent of that pipeline.
    """

    def __init__(self, json_file, hidden_layer = (300,50), random_state: int = None):
        """
        Load the reviews and store the model hyper-parameters.

        @param json_file:    Path of the gzip-compressed JSON file holding the reviews
        @param hidden_layer: Hidden layer sizes handed to the MLPClassifier
        @param random_state: Seed handed to the MLPClassifier and to KMeans;
                             None leaves both unseeded
        """
        # Context manager so the file handle is closed as soon as parsing ends
        with gzip.open(json_file,'rb') as compressed_file:
            self._reviews=json.load(compressed_file)
        self._hidden_layer = hidden_layer
        self._random_state = random_state

    @property
    def reviews(self) -> List[Dict[str,Any]]:
        """All the reviews, exactly as loaded from the JSON file."""
        return self._reviews

    @property
    def largest_review_size(self) -> int:
        """Length of the longest review 'text'; 0 when there are no reviews."""
        # default=0 avoids the ValueError max() raises on an empty dataset
        return max((len(tokens) for tokens in self.review_sentences), default=0)

    @property
    def review_sentences(self) -> List[List[str]]:
        """The 'text' entry of every review, in file order."""
        return [x['text'] for x in self._reviews]

    def makeBoW(self):
        """
        Build the TF-IDF bag-of-words representation of the reviews.

        Sets self.BoW (TF-IDF matrix, one row per review) and self.scores
        (array with the 'score' of every review, in the same order).
        """
        count_vectorizer=CountVectorizer()
        counts=count_vectorizer.fit_transform([" ".join(tokens) for tokens in self.review_sentences])
        self.BoW=TfidfTransformer().fit_transform(counts)
        # Keep the scores in an array aligned with the rows of self.BoW
        self.scores=np.array([x['score'] for x in self._reviews])

    def divideTrainTest(self):
        """
        Split features and scores by year: reviews whose 'year' is 2013 form the
        test set, all the others form the training set.

        Sets self.BoW_train, self.scores_train, self.BoW_test and self.scores_test.
        The bag-of-words is built on demand if makeBoW() has not been called yet.
        """
        if not hasattr(self, 'BoW'):
            self.makeBoW()

        # Build the boolean mask once; dtype=bool keeps `~` valid on empty data
        is2013=np.array([x['year']==2013 for x in self._reviews], dtype=bool)

        self.BoW_train=self.BoW[~is2013,:]
        self.scores_train=self.scores[~is2013]
        self.BoW_test=self.BoW[is2013,:]
        self.scores_test=self.scores[is2013]

    def saveTrainTest(self):
        """Placeholder: persisting the train/test split is not implemented yet."""
        # TODO: see np.save (and, optionally, np.load)
        pass

    def learn(self):
        """
        Fit an MLPClassifier on the training split and store it in self.model.

        Requires divideTrainTest() to have been called first.
        """
        learner=MLPClassifier(hidden_layer_sizes=self._hidden_layer, solver='lbfgs', alpha=1e-5, random_state=self._random_state)
        self.model=learner.fit(self.BoW_train,self.scores_train)

    def evaluate(self):
        """
        Evaluate self.model on the test split.

        Prints the RMSE, the confusion matrix and the classification report.

        @return RMSE of the predicted scores on the test split
        """
        # Predict once and reuse the result for every metric
        predictions=self.model.predict(self.BoW_test)

        # Root of the *mean* squared error (a plain sum would grow with the test-set size)
        rmse=np.sqrt(np.mean(np.power(self.scores_test-predictions,2)))
        print(rmse)

        rounded_predictions=np.round(predictions)
        print(confusion_matrix(self.scores_test,rounded_predictions))
        print(classification_report(self.scores_test,rounded_predictions))
        return rmse

    def train_word2vec_model(self, size: int, window: int = 5, min_count: int = 5, workers: int = -1) -> word2vec.Word2Vec:
        """
        Get a word2vec model according to the input data.
        @ref https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

        @param size:      Dimensionality of the word vectors
        @param window:    Maximum distance between the current and predicted word
        @param min_count: Ignores all words with total frequency lower than this
        @param workers:   Use these many worker threads to train the model; -1 to use all
        @return Word2Vec model

        """

        # The model is trained on the 'text' of every loaded review
        sentences=self.review_sentences

        if workers == -1:
            workers = SentimentAnalysis.get_num_max_workers()
            print(f"[v] Using {workers} workers for the Word2Vec operation...")

        return word2vec.Word2Vec(sentences, vector_size=size, window=window,
                                       min_count=min_count, workers=workers)

    @staticmethod
    def get_num_max_workers() -> int:
        """Number of CPUs reported by multiprocessing.cpu_count()."""
        return multiprocessing.cpu_count()

    @staticmethod
    def save_word2vec(model: word2vec.Word2Vec, file_name: str):
        """Save the Word2Vec model to file_name."""
        model.save(file_name)

    @staticmethod
    def load_word2vec(file_name: str) -> word2vec.Word2Vec:
        """Load a Word2Vec model previously stored with save_word2vec."""
        return word2vec.Word2Vec.load(file_name)

    def clusterize(self, model: word2vec.Word2Vec, num_clusters: int = 10, words_per_cluster: int = 10):
        """
        Cluster the word vectors of the model with k-means and print, for every
        cluster, its first words in vocabulary order.

        @param model:             Trained Word2Vec model
        @param num_clusters:      Number of k-means clusters ("k")
        @param words_per_cluster: How many words to print for each cluster
        """
        word_vectors = model.wv.vectors

        # Initialize a k-means object and use it to assign every word to a cluster;
        # the stored random_state makes the assignment reproducible when it is set
        kmeans_clustering = KMeans( n_clusters = num_clusters, n_init='auto',
                                    random_state = self._random_state )
        idx = kmeans_clustering.fit_predict( word_vectors )

        # Group the vocabulary by cluster in a single pass; idx[i] is the cluster
        # of model.wv.index_to_key[i], so vocabulary order is kept inside each group
        words_by_cluster = [[] for _ in range(num_clusters)]
        for word, cluster in zip( model.wv.index_to_key, idx ):
            words_by_cluster[cluster].append(word)

        for cluster, words in enumerate(words_by_cluster):
            # Print the cluster number followed by its first words
            print("\nCluster %d" % cluster)
            print(words[:words_per_cluster])

# Global configuration
WORD2VEC_VECTOR_SIZE = 300  # dimensionality of the Word2Vec word vectors trained below

# Create an instance of the class: the constructor loads every review from the
# gzip-compressed JSON file; random_state=1 is stored and later handed to the MLPClassifier.
SaHandler=SentimentAnalysis('data/Watches_withstopwords.json.gz', random_state=1)

In [4]:
# Peek at the first review as "<score>: <text>"; 'text' is joined with spaces to print it as one sentence
print(f"{SaHandler.reviews[0]['score']}: {' '.join(SaHandler.reviews[0]['text'])}")

4.0: having owned two previous g shocks in my life including the first series in 1984 i ve long appreciated their quality the main reason i stopped wearing them was simply because the resin straps would break and having worn them for 4 years i grew weary of them however having burned through many fashion watches in the last 10 years i ve been disappointed to spend only to get 1 or 2 years worth of usage so i m back to g shock i think this model g1710d 7av represents a nice blend between good looks and practical durability which most guys really want the face is smaller on my wrist than i expected from a g shock but i think it s still classy for the office the side buttons are hidden in gray plastic which you can t see in picture but the rest of the watch is metal you might find a classier analog face g shock in the 200 300 but this one is a better value the only reason i didn t give it 5 stars is because the led isn t backlight it s an amber light that comes around the inside bevel so 

In [11]:
# Length (len()) of the longest review 'text' across the whole dataset
max_input_size = SaHandler.largest_review_size
print(max_input_size)

4187


Here we'll train a Word2Vec model on the text of all the loaded reviews.
Word2Vec learns word associations from a large corpus of text. Because each word is represented by a vector that captures the contexts in which it appears, the model is able to detect synonymous words ([ref](https://en.wikipedia.org/w/index.php?title=Word2vec&oldid=1143734439)).

In [29]:
# Train Word2Vec on the 'text' of every loaded review (workers defaults to -1, i.e. one per CPU)
# and save the fitted model to 'word2vec.bin' so it can be reloaded with load_word2vec.
word2vec_model = SaHandler.train_word2vec_model(WORD2VEC_VECTOR_SIZE)
SentimentAnalysis.save_word2vec(word2vec_model, 'word2vec.bin')

[v] Using 32 workers for the Word2Vec operation...


In [38]:
# Run k-means (10 clusters) on the learned word vectors and print the first 10 words of each cluster
SaHandler.clusterize(word2vec_model)


Cluster 0
['or', 'work', 'never', 'water', 'while', 'wearing', 'working', 'worn', 'under', 'etc']

Cluster 1
['i', 'was', 'had', 'me', 'bought', 'am', 'amazon', 'buy', 'he', 'first']

Cluster 2
['of', 'that', 'have', 'as', 'one', 'watches', 'than', 'they', 'other', 've']

Cluster 3
['you', 'time', 'if', 'use', 'day', 'battery', 'do', 'your', 'date', 'set']

Cluster 4
['it', 'a', 'and', 'watch', 'is', 'this', 'but', 'not', 's', 'very']

Cluster 5
['years', 'year', 'months', 'days', 'times', 'month', 'week', 'seconds', 'minutes', 'weeks']

Cluster 6
['to', 'in', 'my', 'with', 't', 'so', 'at', 'has', 'can', 'just']

Cluster 7
['are', 'which', 'face', 'light', 'read', 'see', 'hand', 'case', 'dial', 'hands']

Cluster 8
['for', 'about', 'after', 'been', 'now', 'two', 'over', '2', '5', 'few']

Cluster 9
['the', 'on', 'be', 'band', 'out', 'up', 'get', 'off', 'little', 'back']


In [None]:
# divideTrainTest() slices self.BoW and self.scores, which only exist once makeBoW()
# has run, so build the TF-IDF features first. Reviews from 2013 become the test set.
SaHandler.makeBoW()
SaHandler.divideTrainTest()

In [None]:
# Fit the MLP classifier on the training split produced by divideTrainTest()
SaHandler.learn()

In [None]:
# Print the error metric, confusion matrix and classification report for the held-out 2013 reviews
SaHandler.evaluate()