In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy
from sklearn.naive_bayes import BernoulliNB
import seaborn as sns

Let us classify Amazon reviews with a model trained on IMDB reviews. First, load the *imdb_labelled.txt* file into the dataframe `imdb`:

In [2]:
# Build the path with os.path.join so the notebook also runs outside Windows
# (a hard-coded backslash is only a path separator on Windows).
imdb_path = os.path.join('sentiment labelled sentences', 'imdb_labelled.txt')

# The file is tab-separated and has no header row, so the two columns are
# named explicitly after loading.
# NOTE(review): pandas' default quote handling can merge lines when a review
# contains an unbalanced double quote - confirm len(imdb) matches the number
# of lines in the file.
imdb = pd.read_csv(imdb_path, delimiter='\t', header=None)
imdb.columns = ['text', 'sentiment']
imdb.head()

Unnamed: 0,text,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


# Data Preparation

Load positive words file, from https://gist.github.com/mkulakowski2/4289437:

In [3]:
# Read the whole positive-word file into memory
with open('positive-words.txt', 'r') as file:
    positive_raw = file.read()

# Keep only the text after the last ';' (the description precedes it), then
# split into one entry per line and drop the empty entries.
positive_body = positive_raw[positive_raw.rfind(';') + 1:]
positive = [entry for entry in positive_body.split('\n') if entry != '']

# Add a couple of positive words
positive += ['cool', 'decent']

len(positive)

2008

Load stop words file:

In [4]:
# Read the stop-word file and split it into one entry per line.
# Empty entries (e.g. from a trailing newline) are kept as-is.
with open('stop_words.txt', 'r') as file:
    stop = file.read().split('\n')

Load negative words file, from http://ptrckprry.com/course/ssd/data/negative-words.txt:

In [5]:
# Read the whole negative-word file into memory
with open('negative-words.txt', 'r') as file:
    negative_raw = file.read()

# Keep only the text after the last ';' (the description precedes it), then
# split into one entry per line and drop the empty entries.
negative_body = negative_raw[negative_raw.rfind(';') + 1:]
negative = [entry for entry in negative_body.split('\n') if entry != '']

len(negative)

4783

#### Data Cleaning ####

In [6]:
# Characters treated as punctuation, and a translation table that maps each
# of them (by code point) to a space.
punctuation = '.,;:-?!'
TRANSDICT = dict.fromkeys(map(ord, punctuation), ord(' '))


def remove_punctuation(word):
    """Return ``word`` without the listed punctuation marks or any spaces.

    Punctuation is first turned into spaces, surrounding whitespace is
    trimmed, and every remaining space is then dropped.
    """
    spaced = word.translate(TRANSDICT)
    pieces = spaced.strip().split(' ')
    return ''.join(pieces)


def percent_positive(review):
    """Return the fraction of a review's tokens that are un-negated positive words.

    The review is lower-cased, split on single spaces, and every token is
    passed through ``remove_punctuation``. A token is counted when it is in
    the module-level ``positive`` word list and neither of the (up to) two
    tokens immediately before it is ``'not'``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    float
        Counted tokens divided by the total number of tokens. Tokens that
        become empty after punctuation removal still count towards the total.
    """
    # tokenize the sentence and remove punctuation
    tokenized = [remove_punctuation(word) for word in review.lower().split(' ')]

    hits = 0
    # enumerate() gives each token's real position. Looking the position up
    # with list.index() returns the FIRST occurrence, so a repeated word would
    # be judged by the context of its first appearance (and the scan would be
    # quadratic in the number of tokens).
    for position, word in enumerate(tokenized):
        if word not in positive:
            continue
        # Negation window: the one or two tokens just before this one
        # (the slice is simply shorter near the start of the sentence).
        if 'not' in tokenized[max(position - 2, 0):position]:
            continue
        hits += 1

    # str.split(' ') always yields at least one token, so the divisor is never 0
    return hits / len(tokenized)

# Score every IMDB review and store the result in a new 'positive' column
imdb['positive'] = imdb['text'].map(percent_positive)

In [7]:
def percent_negative(review):
    """Return the fraction of a review's tokens that are un-negated negative words.

    The review is lower-cased, split on single spaces, and every token is
    passed through ``remove_punctuation``. A token is counted when it is in
    the module-level ``negative`` word list and neither of the (up to) two
    tokens immediately before it is ``'not'``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    float
        Counted tokens divided by the total number of tokens. Tokens that
        become empty after punctuation removal still count towards the total.
    """
    # tokenize the sentence and remove punctuation
    tokenized = [remove_punctuation(word) for word in review.lower().split(' ')]

    hits = 0
    # enumerate() gives each token's real position. Looking the position up
    # with list.index() returns the FIRST occurrence, so a repeated word would
    # be judged by the context of its first appearance (and the scan would be
    # quadratic in the number of tokens).
    for position, word in enumerate(tokenized):
        if word not in negative:
            continue
        # Negation window: the one or two tokens just before this one
        # (the slice is simply shorter near the start of the sentence).
        if 'not' in tokenized[max(position - 2, 0):position]:
            continue
        hits += 1

    # str.split(' ') always yields at least one token, so the divisor is never 0
    return hits / len(tokenized)

# Score every IMDB review and store the result in a new 'negative' column
imdb['negative'] = imdb['text'].map(percent_negative)

In [8]:
# Show the five reviews with the largest share of negative words
imdb.sort_values('negative', ascending=False).head(5)

Unnamed: 0,text,sentiment,positive,negative
499,"It's a mediocre, miserable, hollow, laughable ...",0,0.0,0.384615
101,Awful.,0,0.0,0.333333
94,Horrible!,0,0.0,0.333333
671,"Bad characters, bad story and bad acting.",0,0.0,0.333333
725,"Instead, we got a bore fest about a whiny, spo...",0,0.0,0.285714


In [9]:
# Show the five reviews with the largest share of positive words
imdb.sort_values('positive', ascending=False).head(5)

Unnamed: 0,text,sentiment,positive,negative
497,Highly recommended A+,1,0.4,0.0
63,Brilliant!,1,0.333333,0.0
534,") a happy, wonderful, feel good ending!",1,0.333333,0.0
544,"A very good film indeed, about great and uncon...",1,0.333333,0.0
734,;) Recommend with confidence!,1,0.333333,0.0


In [12]:
# Load the amazon dataset. The path is built with os.path.join so the
# notebook also runs outside Windows (a hard-coded backslash is only a path
# separator on Windows).
amazon_path = os.path.join('sentiment labelled sentences', 'amazon_cells_labelled.txt')

# Tab-separated, no header row: name the two columns explicitly.
amazon = pd.read_csv(amazon_path, delimiter='\t', header=None)
amazon.columns = ['text', 'sentiment']

# Apply percent_positive to the text column in our dataframe
amazon['positive'] = amazon['text'].apply(percent_positive)

# Apply percent_negative to the text column in our dataframe
amazon['negative'] = amazon['text'].apply(percent_negative)

# Only the last expression of a cell is displayed, so a single head() at the
# end is enough to preview the loaded data together with the new columns.
amazon.head()

Unnamed: 0,text,sentiment,positive,negative
0,So there is no way for me to plug it in here i...,0,0.0,0.0
1,"Good case, Excellent value.",1,0.5,0.0
2,Great for the jawbone.,1,0.25,0.0
3,Tied to charger for conversations lasting more...,0,0.0,0.090909
4,The mic is great.,1,0.25,0.0


# Training & Prediction

Let us create a Bernoulli Naive Bayes model, train it on our IMDB data, and use it to predict the sentiment of the Amazon reviews:

In [13]:
# Both dataframes expose the same two engineered feature columns
feature_cols = ['positive', 'negative']

# Create the model and train it on the IMDB features and labels.
# NOTE(review): per the scikit-learn docs BernoulliNB binarizes its inputs
# (threshold 0.0 by default), so presumably only "share > 0" is used rather
# than the share itself - confirm this is intended.
classifier = BernoulliNB()
classifier.fit(imdb[feature_cols], imdb['sentiment'])

# Predict a sentiment for every Amazon review
y_pred = classifier.predict(amazon[feature_cols])

# Report how many predictions disagree with the true Amazon labels
n_total = amazon.shape[0]
n_wrong = (amazon['sentiment'] != y_pred).sum()
print("Number of mislabeled points out of a total {} points : {}".format(n_total, n_wrong))

Number of mislabeled points out of a total 1000 points : 159


# Conclusion

We correctly predict the sentiment of 841 out of 1,000 Amazon reviews (84.1% accuracy) with a model trained only on IMDB reviews.