In [5]:
import numpy as np
import gzip
import re
import json
import datetime
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
#from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report
from typing import Dict,Any,List

ModuleNotFoundError: No module named 'sklearn'

In [2]:
class SentimentAnalysis():
    """Sentiment-analysis helper built around a gzipped JSON file of reviews.

    Each review is a dict from which this class reads the keys ``'text'``
    (joined with spaces in :meth:`makeBoW`, so presumably a list of tokens),
    ``'score'`` and ``'year'``.

    Two independent workflows are offered:

    * TF-IDF bag of words -> train/test split -> MLP classifier -> evaluation
      (``makeBoW``, ``divideTrainTest``, ``learn``, ``evaluate``).
    * Word2Vec embeddings -> k-means clustering of the vocabulary
      (``makeWord2Vec`` / ``loadWord2Vec``, ``clusterize``).

    NOTE: both workflows store their model in ``self.model``; whichever of
    ``learn`` / ``makeWord2Vec`` / ``loadWord2Vec`` ran last wins.
    """

    def __init__(self, json_file, hidden_layer = (300,50), random_state: int = None):
        """Load the reviews and remember the learner configuration.

        Args:
            json_file: Path to a gzip-compressed JSON file holding the reviews.
            hidden_layer: Hidden layer sizes forwarded to ``MLPClassifier``.
            random_state: Seed forwarded to ``MLPClassifier`` and ``KMeans``;
                ``None`` leaves them unseeded.
        """
        # Context manager so the file handle is closed even if parsing fails.
        with gzip.open(json_file, 'rb') as handle:
            self._reviews = json.load(handle)
        self._hidden_layer = hidden_layer
        self._random_state = random_state

    @property
    def reviews(self) -> List[Dict[str,Any]]:
        """The raw review dicts exactly as loaded from the JSON file."""
        return self._reviews

    @property
    def largest_review_size(self) -> int:
        """Length of the longest ``'text'`` entry (0 when there are no reviews)."""
        return max((len(review['text']) for review in self._reviews), default=0)

    @property
    def review_sentences(self) -> np.ndarray:
        """1-D object array holding the ``'text'`` entry of every review.

        The array is filled element by element: ``np.array`` on ragged lists
        raises ``ValueError`` on recent NumPy releases, and on equal-length
        lists it would silently produce a 2-D array instead.
        """
        texts = [review['text'] for review in self._reviews]
        sentences = np.empty(len(texts), dtype=object)
        for position, text in enumerate(texts):
            sentences[position] = text
        return sentences

    def makeBoW(self):
        """Build the TF-IDF bag-of-words matrix and the score vector.

        Sets ``self.BoW`` (one row per review) and ``self.scores``.
        """
        count_vectorizer = CountVectorizer()
        counts = count_vectorizer.fit_transform(
            [" ".join(review['text']) for review in self._reviews]
        )
        transformer = TfidfTransformer()
        self.BoW = transformer.fit_transform(counts)
        # Collect the scores in an array aligned with the rows of BoW.
        self.scores = np.array([review['score'] for review in self._reviews])

    def divideTrainTest(self):
        """Split ``BoW`` / ``scores``: reviews from 2013 are the test set.

        Requires :meth:`makeBoW` to have run. Sets ``BoW_train``,
        ``scores_train``, ``BoW_test`` and ``scores_test``.
        """
        # Explicit bool dtype keeps `~` valid even for an empty review list.
        is2013 = np.array([review['year'] == 2013 for review in self._reviews], dtype=bool)

        self.BoW_train = self.BoW[~is2013, :]
        self.scores_train = self.scores[~is2013]
        self.BoW_test = self.BoW[is2013, :]
        self.scores_test = self.scores[is2013]

    def saveTrainTest(self):
        """Placeholder: persisting the train/test split is not implemented.

        TODO: see ``np.save`` (and, optionally, ``np.load``).
        """

    def learn(self):
        """Fit an ``MLPClassifier`` on the training split; stores it in ``self.model``."""
        learner = MLPClassifier(
            hidden_layer_sizes=self._hidden_layer,
            solver='lbfgs',
            alpha=1e-5,
            random_state=self._random_state,
        )
        self.model = learner.fit(self.BoW_train, self.scores_train)

    @staticmethod
    def _rmse(expected, predicted) -> float:
        """Root-mean-square error between two equally sized numeric sequences."""
        errors = np.asarray(expected, dtype=float) - np.asarray(predicted, dtype=float)
        return float(np.sqrt(np.mean(np.square(errors))))

    def evaluate(self):
        """Print RMSE, confusion matrix and classification report on the test split.

        Returns:
            The RMSE that was printed.
        """
        # Predict once and reuse the result for all three reports.
        predictions = self.model.predict(self.BoW_test)
        rmse = self._rmse(self.scores_test, predictions)
        print(rmse)
        rounded = np.round(predictions)
        print(confusion_matrix(self.scores_test, rounded))
        print(classification_report(self.scores_test, rounded))
        return rmse

    def makeWord2Vec(self):
        """Train a Word2Vec model on the review texts; stores it in ``self.model``."""
        sentences = self.review_sentences
        # NOTE(review): `size=` is the gensim < 4.0 keyword (renamed to
        # `vector_size` in gensim 4); kept to match the API this file targets.
        self.model = word2vec.Word2Vec(sentences, size=300, window=5, min_count=5, workers=12)

    def saveWord2Vec(self,model):
        """Save the current Word2Vec model to the path ``model``."""
        self.model.save(model)

    def loadWord2Vec(self,model):
        """Load a Word2Vec model from the path ``model`` into ``self.model``."""
        self.model = word2vec.Word2Vec.load(model)

    def clusterize(self, num_clusters: int = 10, words_per_cluster: int = 10):
        """Cluster the Word2Vec vocabulary with k-means and print a sample.

        Args:
            num_clusters: Number of k-means clusters (``k``).
            words_per_cluster: How many words to print for each cluster.
        """
        word_vectors = self.model.wv.vectors

        # Seeded with the instance's random_state so runs are reproducible
        # whenever a seed was supplied to the constructor.
        kmeans_clustering = KMeans(n_clusters=num_clusters, random_state=self._random_state)
        idx = kmeans_clustering.fit_predict(word_vectors)

        # Group the vocabulary by assigned cluster in a single pass.
        # NOTE(review): `index2word` is the gensim < 4.0 attribute name
        # (`index_to_key` in gensim 4).
        words_by_cluster = {cluster: [] for cluster in range(num_clusters)}
        for word, cluster in zip(self.model.wv.index2word, idx):
            words_by_cluster[int(cluster)].append(word)

        for cluster in range(num_clusters):
            print("\nCluster %d" % cluster)
            print(words_by_cluster[cluster][:words_per_cluster])

NameError: name 'List' is not defined

In [3]:
# Load the gzipped review dataset; random_state=1 seeds the MLP classifier built later in learn().
SaHandler=SentimentAnalysis('data/Watches_withstopwords.json.gz', random_state=1)

In [4]:
# Show the first review as "<score>: <text tokens joined by spaces>".
print(f"{SaHandler.reviews[0]['score']}: {' '.join(SaHandler.reviews[0]['text'])}")

4.0: owned two previous g shocks life including first series 1984 long appreciated quality main reason stopped wearing simply resin straps would break worn 4 years grew weary however burned many fashion watches last 10 years disappointed spend get 1 2 years worth usage back g shock think model g1710d 7av represents nice blend good looks practical durability guys really face smaller wrist expected g shock think still classy office side buttons hidden gray plastic see picture rest watch metal might find classier analog face g shock 200 300 one better value reason didn give 5 stars led isn backlight amber light comes around inside bevel led helps read dial hands lcd screens nonetheless still cool light color ps bottom lcd screen isn blue pictured still different color top lcd adds distinction watch


In [2]:
# Length of the longest review's 'text' entry. Requires the cell that creates SaHandler to have run first.
print(SaHandler.largest_review_size)

NameError: name 'SaHandler' is not defined

In [5]:
SaHandler.makeBoW()
# Matrix of <number of documents> rows x <number of words> columns,
# indicating (for each text) how important the occurrence (if any) of each word is.
# The weights are L2-normalised (the sum of squares is 1).
print(SaHandler.BoW.shape)

(68356, 50664)


In [6]:
# Split BoW rows and scores: reviews with year == 2013 form the test set, all others the training set.
SaHandler.divideTrainTest()

In [None]:
# Fit the MLP classifier on the training split.
SaHandler.learn()

In [None]:
# Print an error figure, the confusion matrix and the classification report for the test split.
SaHandler.evaluate()