# OOPM CLASS

In [2]:
import random
class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'
class Review:
    def __init__(self, text, score):
        self.score = score
        self.text = text
        self.sentiment = self.get_sentiment()
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        if self.score == 3:
            return Sentiment.NEUTRAL
        else :
            return Sentiment.POSITIVE
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
    def get_text(self):
        return [x.text for x in self.reviews]
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    
    def evenly_distribute(self):
        negative = list(filter(lambda x:x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x:x.sentiment == Sentiment.POSITIVE, self.reviews))
        neutral = list(filter(lambda x:x.sentiment == Sentiment.NEUTRAL, self.reviews))
        
        #print(len(positive))
        #print(len(negative))
        #print(len(neutral))
        positive_shrunk = positive[:len(negative)]
        neutral_shrunk = neutral[:len(negative)]
        self.reviews = negative+positive_shrunk+neutral_shrunk
        
        random.shuffle(self.reviews)

# LOAD DATA 

In [3]:
import json

file_name = '/home/stayaryan/Desktop/project1python/Books_5_millions.json'

reviews = []

with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# PREP DATA## 

In [4]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size = 0.25, random_state = 42)

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)


#len(cont.reviews)

In [5]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))
print(train_y.count(Sentiment.NEUTRAL))

24227
24227
24227


# BAGS OF WORDS VECTORIZATION

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# CLASSIFICATION 

## LINEAR SVM

In [None]:
from sklearn import svm

clf_svm = svm.SVC(kernel = 'linear')

clf_svm.fit(train_x_vectors, train_y)

clf_svm.predict(test_x_vectors[0])

## DECISION TREE

In [21]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()

clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['NEUTRAL'], dtype='<U8')

## GAUSSIAN NB 

In [22]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()

clf_gnb.fit(train_x_vectors.toarray(), train_y)

clf_gnb.predict(test_x_vectors[0].toarray())

MemoryError: Unable to allocate array with shape (72681, 84123) and data type int64

## LOGISTIC REGRESSION

In [7]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()

clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])



array(['NEUTRAL'], dtype='<U8')

# ACCURACY COMPARISON BASIC

In [8]:
#Accuracy
#print("LINEAR SVM CLASSIIFER ACCURACY : %f"%clf_svm.score(test_x_vectors, test_y))
#print("DECISION TREE CLASSIFIER ACCURACY : %f"%clf_dec.score(test_x_vectors, test_y))
#print("GAUSSIAN NAIVE BAYES CLASSIFIER ACCURACY : %f"%clf_gnb.score(test_x_vectors.toarray(), test_y))
print("LOGISTIC REGRESSION CLASSIFIER ACCURACY : %f"%clf_log.score(test_x_vectors, test_y))

LOGISTIC REGRESSION CLASSIFIER ACCURACY : 0.681195


In [9]:
#F1 SCORE

from sklearn.metrics import f1_score
#f1_score(test_y, clf_svm.predict(test_x_vectors), average = None, labels =[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE])
#f1_score(test_y, clf_dec.predict(test_x_vectors), average = None, labels =[Sentiment.POSITIVE, Sentiment.NEUTRAL,Sentiment.NEGATIVE])
#f1_score(test_y, clf_gnb.predict(test_x_vectors.toarray()), average = None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE])
f1_score(test_y, clf_log.predict(test_x_vectors), average = None ,labels =[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

array([0.76101949, 0.57202958, 0.70470088])

In [101]:
#array([0.90310171, 0.27070707, 0.42196532])
#array([0.89058878, 0.16216216, 0.17921147])
#array([0.7966489 , 0.13432836, 0.11809524])
#array([0.92293493, 0.28428928, 0.43018868]
#    Logistic Regression is the winner
#test_y.count(Sentiment.NEUTRAL)

# FINAL OUTPUT

In [15]:
comment = (input('Enter a test comment : '))
commentdash = [comment]
comment_vector = vectorizer.transform(commentdash)

clf_log.predict(comment_vector)
        

Enter a test comment : thoroughly loved it


array(['POSITIVE'], dtype='<U8')