In [93]:
import random

class Sentiment:
    """String labels used to tag a review's sentiment."""

    # Each label's value is the same string as its attribute name.
    NEGATIVE, NEUTRAL, POSITIVE = 'NEGATIVE', 'NEUTRAL', 'POSITIVE'

class Review:
    """A single review: its text, its numeric score, and the label derived from the score."""

    def __init__(self, text, score):
        """Store the text and score, then derive the sentiment label once."""
        self.text, self.score = text, score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        Scores of 2 or less are NEGATIVE, a score equal to 3 is NEUTRAL,
        and every other score is POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        return Sentiment.NEUTRAL if rating == 3 else Sentiment.POSITIVE
        
class ReviewContainer:
    """Wraps a list of review objects and exposes their texts and labels.

    Each item is expected to provide ``.text`` and ``.sentiment`` attributes.
    """

    def __init__(self, reviews):
        """Keep a reference to the given list of reviews."""
        self.reviews = reviews

    def get_text(self):
        """Return the text of every held review, in order."""
        return [review.text for review in self.reviews]

    def get_sentiments(self):
        """Return the sentiment label of every held review, in order."""
        return [review.sentiment for review in self.reviews]

    def evently_distribute(self):
        """Rebalance to equal numbers of NEGATIVE and POSITIVE reviews, then shuffle.

        NEUTRAL reviews are dropped. Both classes are cut down to the size of
        the smaller one, so the result is balanced whichever class is the
        majority. ``self.reviews`` is rebound to a new list; the list passed to
        the constructor is not modified. The order depends on the global
        ``random`` state.

        The (misspelled) method name is kept so existing callers keep working.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        # Truncate BOTH classes: shrinking only the positives leaves the data
        # imbalanced whenever negatives outnumber positives.
        per_class = min(len(negative), len(positive))
        self.reviews = negative[:per_class] + positive[:per_class]
        random.shuffle(self.reviews)

In [49]:
# Import the "Musical Instruments" dataset from Amazon Review Data
import json

file_name = "c:/Sekolah/Musical_Instruments_5.json"

# The file is read as one JSON object per line; each object supplies the
# review text ('reviewText') and its score ('overall').
reviews = []
# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform default, which differs between operating systems.
with open(file_name, encoding="utf-8") as f:
    for line in f:
        # Skip blank lines; json.loads would raise on them.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

reviews[1001].text

'The best buy for your bass drum'

In [113]:
# split the data
from sklearn.model_selection import train_test_split

# Hold out 30% of the reviews for testing; the fixed random_state keeps the
# split the same on every run.
training, test = train_test_split(reviews, random_state=0, test_size=0.3)

# Wrap each partition so texts and labels can be pulled out together.
train_container, test_container = ReviewContainer(training), ReviewContainer(test)

In [114]:
# evently_distribute() shuffles with the global `random` generator; seed it so
# the sample order (and everything fitted on it) is the same on every run.
random.seed(0)

# Balance the training set, then pull out parallel lists of texts and labels.
train_container.evently_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiments()

# The test set is balanced the same way.
test_container.evently_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiments()

# Sanity check: both classes should have the same count after balancing.
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

332
332


In [168]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Turn the review texts into token-count feature vectors.
vectorizer = CountVectorizer()
# The vectorizer is fitted on the training texts only ...
train_x_vectors = vectorizer.fit_transform(train_x)
# ... and the already-fitted vectorizer is reused, without refitting, on the test texts.
test_x_vectors = vectorizer.transform(test_x)

# Show the first training text next to its vector (converted to a dense array for display).
print(train_x[0])
print(train_x_vectors[0].toarray())

I got one of these from another source but have returned it for exchange because of the following issue:  when I record a track either with the "Audio Recorder" or "Guitar Amp" screens in Garageband on my iPad 3, iOS7, the recording is great. But once I toggle over to the multitrack view and attempt to play the recording just made, there is godawful distortion and the audio is completely unlistenable.  Here's the other part of this: I can use Apogee Jam guitar interface, and iRig Pre microphone interface and Garageband behaves as it should and I can play the multitrack view with no problems whatsoever. The Focusrite support folks suggested I do a backup of my iPad contents and then do a software restore but I opted to return the unit instead, since the Apogee and iRig both work perfectly with Garageband. So, I don't know if the unit is having a problem with iOS7 or I just happened to get a lemon but I'll find out when the new one comes next week.  If it works correctly, I'll update thi

In [169]:
# Classification Model
#SVM

from sklearn import svm

# Linear-kernel SVM; fit() returns the estimator, so build and train in one step.
clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, train_y)
# Spot-check the prediction for the first test review.
clf_svm.predict(test_x_vectors[0])


array(['NEGATIVE'], dtype='<U8')

In [170]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Fix random_state so the fitted tree (and its scores) are the same on every run.
clf_dec = DecisionTreeClassifier(random_state=0)
clf_dec.fit(train_x_vectors, train_y)
# Spot-check the prediction for the second test review.
clf_dec.predict(test_x_vectors[1])

array(['NEGATIVE'], dtype='<U8')

In [171]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Fix random_state so the fitted forest (and its scores) are the same on every run.
clf_rf = RandomForestClassifier(random_state=0)
clf_rf.fit(train_x_vectors, train_y)
# Spot-check the prediction for the third test review.
clf_rf.predict(test_x_vectors[2])



array(['NEGATIVE'], dtype='<U8')

In [172]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings; fit() returns the estimator itself.
clf_lr = LogisticRegression().fit(train_x_vectors, train_y)
# Spot-check the prediction for the fourth test review.
clf_lr.predict(test_x_vectors[3])



array(['POSITIVE'], dtype='<U8')

In [173]:
#Evaluate our model (mean accuracy)
# Mean accuracy on the test set, one line per model: SVM, decision tree,
# random forest, logistic regression.
for clf in (clf_svm, clf_dec, clf_rf, clf_lr):
    print(clf.score(test_x_vectors, test_y))

0.7555555555555555
0.6555555555555556
0.6666666666666666
0.762962962962963


In [174]:
# F1 scores

from sklearn.metrics import f1_score

# Per-class F1 (no averaging), reported in the order [POSITIVE, NEGATIVE];
# one line per model: SVM, decision tree, random forest, logistic regression.
f1_labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE]
for clf in (clf_svm, clf_dec, clf_rf, clf_lr):
    print(f1_score(test_y, clf.predict(test_x_vectors), average=None, labels=f1_labels))

[0.75373134 0.75735294]
[0.6490566  0.66181818]
[0.63114754 0.69594595]
[0.76296296 0.76296296]


In [181]:
# Try our models on a few new, hand-written examples.
# The models are not perfect: we need to gather additional data and re-train them to get better results.

test_set = ['very good', 'bad do not buy', 'not worth it']
# Reuse the already-fitted vectorizer so the features line up with training.
new_test = vectorizer.transform(test_set)

# One line of predictions per model: SVM, decision tree, random forest,
# logistic regression.
for clf in (clf_svm, clf_dec, clf_rf, clf_lr):
    print(clf.predict(new_test))

['POSITIVE' 'NEGATIVE' 'NEGATIVE']
['POSITIVE' 'NEGATIVE' 'NEGATIVE']
['POSITIVE' 'POSITIVE' 'NEGATIVE']
['POSITIVE' 'NEGATIVE' 'NEGATIVE']
