In [1]:
class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
    
    def get_sentiment(self):
        if self.score <=2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: 
            return Sentiment.POSITIVE


In [2]:
import json

file_name = 'data/sentiment/books_small.json'
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

In [3]:
reviews[2].text

'Very simple book, but leaves you feeling good.  No over the top sex scenes, no graphic violence.  Just a simple book that talks touches on child abuse and dysfunction in a family.  I liked it because the author did not go into graphic detail about the abuse, but instead focused how it affected the people involved.  Good story, good characters.'

### Particionar en prueba y entrenamiento

In [4]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(reviews, test_size=0.33, random_state=42)

### Descomponer entre datos y etiquetas

In [5]:
train_x = [elem.text for elem in train]
train_y = [elem.sentiment for elem in train]

test_x = [elem.text for elem in test]
test_y = [elem.sentiment for elem in test]

### Utilizar vectorizer para la tokenization sobre los datos de tipo texto
#####
La tokenización es el proceso de dividir el texto en un conjunto de piezas significativas. Estas piezas se llaman tokens. Permite utilizar el principio de bag of words

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_x_vector = vectorizer.fit_transform(train_x)
test_x_vector = vectorizer.transform(test_x)

train_x_vector[0]

<1x7372 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

## Clasificadores

### SVM


In [17]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vector, train_y)

clf_svm.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier

clf_tree = DecisionTreeClassifier()
clf_tree.fit(train_x_vector, train_y)

clf_tree.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [23]:
from sklearn.naive_bayes import GaussianNB

clf_nb = GaussianNB()
clf_nb.fit(train_x_vector.toarray(), train_y)

clf_nb.predict(test_x_vector[0].toarray())

array(['POSITIVE'], dtype='<U8')

### Logistic regression

In [25]:
from sklearn.linear_model import LogisticRegression

clf_lor = LogisticRegression()
clf_lor.fit(train_x_vector, train_y)

clf_lor.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')

## Evaluation

In [30]:
# Mean accuracy

print('El score para la svm es de: {}'.format(clf_svm.score(test_x_vector, test_y)))
print('El score para el decision tree es de: {}'.format(clf_tree.score(test_x_vector, test_y)))
print('El score para la naive bayes es de: {}'.format(clf_nb.score(test_x_vector.toarray(), test_y)))
print('El score para la logistic regression es de: {}'.format(clf_lor.score(test_x_vector, test_y)))

El score para la svm es de: 0.8242424242424242
El score para el decision tree es de: 0.7666666666666667
El score para la naive bayes es de: 0.8121212121212121
El score para la logistic regression es de: 0.8303030303030303


In [40]:
# F1 scores

from sklearn.metrics import f1_score

score_svm = f1_score(test_y, clf_svm.predict(test_x_vector), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
score_dt = f1_score(test_y, clf_tree.predict(test_x_vector), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
score_nb = f1_score(test_y, clf_nb.predict(test_x_vector.toarray()), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
score_lor = f1_score(test_y, clf_lor.predict(test_x_vector), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

print('El f1 score para la svm es de: {}'.format(score_svm))
print('El f1 score para el decision tree es de: {}'.format(score_dt))
print('El f1 score para la naive bayes es de: {}'.format(score_nb))
print('El f1 score para la logistic regression es de: {}'.format(score_lor))

El f1 score para la svm es de: [0.91319444 0.21052632 0.22222222]
El f1 score para el decision tree es de: [0.87017544 0.13333333 0.06666667]
El f1 score para la naive bayes es de: [0.89678511 0.08510638 0.09090909]
El f1 score para la logistic regression es de: [0.91370558 0.12244898 0.1       ]


#### Un f1 score tan bajo puede ser debido a la distribucion de los conjuntos de datos


In [121]:
import random

class ReviewContainer:
    """Función para crear una distribución normal entre los conjuntos de datos"""
    def __init__(self, reviews):
        self.reviews = reviews
        self.evenly_distributed()
    
    def get_text(self):
        return [review.text for review in self.reviews]

    def get_sentiment(self):        
        return [review.sentiment for review in self.reviews]

    def evenly_distributed(self):
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        self.reviews = positive[:len(negative)] + negative
        random.shuffle(self.reviews)

In [122]:
train_container = ReviewContainer(train)
test_container = ReviewContainer(test)

In [123]:
new_train_x = train_container.get_text()
new_train_y = train_container.get_sentiment()

new_test_x = test_container.get_text()
new_test_y = test_container.get_sentiment()

print(new_train_y.count(Sentiment.NEUTRAL))

0


In [124]:
new_train_y.count(Sentiment.NEGATIVE)

436

In [143]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
new_train_x_vector = vectorizer.fit_transform(new_train_x)
new_test_x_vector = vectorizer.transform(new_test_x)

new_train_x_vector[0]

<1x8906 sparse matrix of type '<class 'numpy.int64'>'
	with 66 stored elements in Compressed Sparse Row format>

In [126]:
from sklearn import svm

new_clf_svm = svm.SVC(kernel='linear')
new_clf_svm.fit(new_train_x_vector, new_train_y)

new_clf_svm.predict(new_test_x_vector[0])

array(['NEGATIVE'], dtype='<U8')

In [127]:
from sklearn.tree import DecisionTreeClassifier

new_clf_tree = DecisionTreeClassifier()
new_clf_tree.fit(new_train_x_vector, new_train_y)

new_clf_tree.predict(new_test_x_vector[0])

array(['NEGATIVE'], dtype='<U8')

In [128]:
from sklearn.naive_bayes import GaussianNB

new_clf_nb = GaussianNB()
new_clf_nb.fit(new_train_x_vector.toarray(), new_train_y)

new_clf_nb.predict(new_test_x_vector[0].toarray())

array(['NEGATIVE'], dtype='<U8')

In [129]:
from sklearn.linear_model import LogisticRegression

new_clf_lor = LogisticRegression()
new_clf_lor.fit(new_train_x_vector, new_train_y)

new_clf_lor.predict(new_test_x_vector[0])

array(['NEGATIVE'], dtype='<U8')

In [130]:
# Mean accuracy

print('El score para la svm es de: {}'.format(new_clf_svm.score(new_test_x_vector, new_test_y)))
print('El score para el decision tree es de: {}'.format(new_clf_tree.score(new_test_x_vector, new_test_y)))
print('El score para la naive bayes es de: {}'.format(new_clf_nb.score(new_test_x_vector.toarray(), new_test_y)))
print('El score para la logistic regression es de: {}'.format(new_clf_lor.score(new_test_x_vector, new_test_y)))

El score para la svm es de: 0.7980769230769231
El score para el decision tree es de: 0.6298076923076923
El score para la naive bayes es de: 0.6346153846153846
El score para la logistic regression es de: 0.8149038461538461


In [131]:
# F1 scores

from sklearn.metrics import f1_score

new_score_svm = f1_score(new_test_y, new_clf_svm.predict(new_test_x_vector), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
new_score_dt = f1_score(new_test_y, new_clf_tree.predict(new_test_x_vector), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
new_score_nb = f1_score(new_test_y, new_clf_nb.predict(new_test_x_vector.toarray()), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
new_score_lor = f1_score(new_test_y, new_clf_lor.predict(new_test_x_vector), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

print('El f1 score para la svm es de: {}'.format(new_score_svm))
print('El f1 score para el decision tree es de: {}'.format(new_score_dt))
print('El f1 score para la naive bayes es de: {}'.format(new_score_nb))
print('El f1 score para la logistic regression es de: {}'.format(new_score_lor))

El f1 score para la svm es de: [0.8028169  0.         0.79310345]
El f1 score para el decision tree es de: [0.64018692 0.         0.61881188]
El f1 score para la naive bayes es de: [0.59574468 0.         0.66666667]
El f1 score para la logistic regression es de: [0.82051282 0.         0.808933  ]


### Un f1 score tan bajo puede ser debido a la poca cantidad de datos 


In [76]:
import json

file_name = 'data/sentiment/books_small_10000.json'
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

In [77]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(reviews, test_size=0.33, random_state=42)

## Ensayo con data toy


In [145]:
test_set = ['I thorougly enjoyed this, 5 stars', 'bad book do not buy', 'horrible waste of time', 'piece of shit']
new_test = vectorizer.transform(test_set)
new_clf_lor.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')