In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy
from sklearn.naive_bayes import BernoulliNB
import seaborn as sns

Let us classify Amazon and Yelp reviews using a model trained on IMDB reviews. Load the *amazon_cells_labelled.txt*, *yelp_labelled.txt* and *imdb_labelled.txt* files into the dataframes amazon, yelp and imdb:

In [4]:
# All three files are tab-separated text without a header row; the two columns
# are named 'text' and 'sentiment' after loading.
# os.path.join keeps the paths portable instead of hard-coding Windows '\' separators.
data_dir = 'sentiment labelled sentences'
column_names = ['text', 'sentiment']

# Load the Amazon dataset
amazon = pd.read_csv(os.path.join(data_dir, 'amazon_cells_labelled.txt'), delimiter='\t', header=None)
amazon.columns = column_names

# Load the Yelp dataset
yelp = pd.read_csv(os.path.join(data_dir, 'yelp_labelled.txt'), delimiter='\t', header=None)
yelp.columns = column_names

# Load the IMDB dataset
imdb = pd.read_csv(os.path.join(data_dir, 'imdb_labelled.txt'), delimiter='\t', header=None)
imdb.columns = column_names

# Preview the first IMDB rows
imdb.head()

Unnamed: 0,text,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


# Data Preparation

Load positive words file, from https://gist.github.com/mkulakowski2/4289437

Load negative words file, from http://ptrckprry.com/course/ssd/data/negative-words.txt

In [5]:
# load positive & negative words
# (read inside `with` blocks so every file handle is closed again)
lexicon_texts = []
for path in ['positive-words.txt', 'negative-words.txt']:
    with open(path, 'r') as file:
        lexicon_texts.append(file.read())

# load list of stop words
with open('stop_words.txt', 'r') as file:
    stop = file.read()
stop = stop.split('\n')

# remove the description (everything up to and including the last ';'),
# split the remainder into one entry per line and drop the empty strings
# that blank lines leave behind
positive, negative = [
    [word for word in text[text.rfind(';') + 1:].split('\n') if word != '']
    for text in lexicon_texts
]

# add a couple of positive words
positive.append('cool')
positive.append('decent')

negative

['2-faced',
 '2-faces',
 'abnormal',
 'abolish',
 'abominable',
 'abominably',
 'abominate',
 'abomination',
 'abort',
 'aborted',
 'aborts',
 'abrade',
 'abrasive',
 'abrupt',
 'abruptly',
 'abscond',
 'absence',
 'absent-minded',
 'absentee',
 'absurd',
 'absurdity',
 'absurdly',
 'absurdness',
 'abuse',
 'abused',
 'abuses',
 'abusive',
 'abysmal',
 'abysmally',
 'abyss',
 'accidental',
 'accost',
 'accursed',
 'accusation',
 'accusations',
 'accuse',
 'accuses',
 'accusing',
 'accusingly',
 'acerbate',
 'acerbic',
 'acerbically',
 'ache',
 'ached',
 'aches',
 'achey',
 'aching',
 'acrid',
 'acridly',
 'acridness',
 'acrimonious',
 'acrimoniously',
 'acrimony',
 'adamant',
 'adamantly',
 'addict',
 'addicted',
 'addicting',
 'addicts',
 'admonish',
 'admonisher',
 'admonishingly',
 'admonishment',
 'admonition',
 'adulterate',
 'adulterated',
 'adulteration',
 'adulterier',
 'adversarial',
 'adversary',
 'adverse',
 'adversity',
 'afflict',
 'affliction',
 'afflictive',
 'affront',


#### Data Cleaning ####

In [6]:
##### Functions


# remove punctuation
def remove_punctuation(word):
    """Return `word` without the characters . , ; : - ? ! * and without spaces.

    Whitespace left at either end of the result (tabs, newlines, ...) is
    trimmed as well; any other character is kept as it is.
    """

    unwanted = '.,;:-?!*' + ' '
    # Deleting the characters outright gives the same result as turning them
    # into spaces first and removing every space afterwards.
    deletion_table = str.maketrans('', '', unwanted)
    cleaned = word.translate(deletion_table)
    return cleaned.strip()


def _lexicon_fraction(review, lexicon):
    """Return the share of tokens in `review` that are un-negated `lexicon` words.

    The review is lower-cased, split on single spaces and every token is
    passed through `remove_punctuation`. A token counts when it is in
    `lexicon` and neither of the (up to) two tokens directly before it is
    'not'. Each counted token adds 1/len(tokens) to the result.

    The denominator is the number of pieces produced by ``split(' ')``, so
    empty pieces caused by repeated or trailing spaces are included in it.
    """
    tokens = [remove_punctuation(token) for token in review.lower().split(' ')]
    share = 1 / len(tokens)  # split(' ') always yields at least one token
    fraction = 0

    # enumerate gives the real position of every token; looking the position
    # up with list.index() would return the first occurrence of a repeated
    # word and therefore inspect the wrong neighbours.
    for position, token in enumerate(tokens):
        if token not in lexicon:
            continue
        # look back at most two tokens for a negation
        if 'not' in tokens[max(position - 2, 0):position]:
            continue
        fraction += share
    return fraction


def percent_positive(review):
    """ Tokenizes each sentence, checks for membership in positive words,
        makes sure positive words are not preceded by 'not'

        Returns the fraction of tokens that are such positive words.
    """
    return _lexicon_fraction(review, positive)


def percent_negative(review):
    """ Tokenizes each sentence, checks for membership in negative words,
        makes sure negative words are not preceded by 'not'

        Returns the fraction of tokens that are such negative words.
    """
    return _lexicon_fraction(review, negative)



In [9]:
# Add the two lexicon features to the IMDB, Amazon and Yelp dataframes:
# the share of positive words first, then the share of negative words,
# both computed from the text column
for reviews in (imdb, amazon, yelp):
    reviews['positive'] = reviews['text'].apply(percent_positive)
    reviews['negative'] = reviews['text'].apply(percent_negative)


In [10]:
# Order the IMDB reviews from the highest to the lowest share of positive
# words and preview the top rows
imdb.sort_values(
    by='positive',
    ascending=False,
).head()

Unnamed: 0,text,sentiment,positive,negative
497,Highly recommended A+,1,0.4,0.0
63,Brilliant!,1,0.333333,0.0
534,") a happy, wonderful, feel good ending!",1,0.333333,0.0
544,"A very good film indeed, about great and uncon...",1,0.333333,0.0
734,;) Recommend with confidence!,1,0.333333,0.0


In [11]:
# Order the IMDB reviews from the highest to the lowest share of negative
# words and preview the top rows
imdb.sort_values(
    by='negative',
    ascending=False,
).head()

Unnamed: 0,text,sentiment,positive,negative
499,"It's a mediocre, miserable, hollow, laughable ...",0,0.0,0.384615
101,Awful.,0,0.0,0.333333
94,Horrible!,0,0.0,0.333333
671,"Bad characters, bad story and bad acting.",0,0.0,0.333333
725,"Instead, we got a bore fest about a whiny, spo...",0,0.0,0.285714


# Training & Prediction

Let us load a Bernoulli Naive Bayes model, train it on our IMDB data, and predict Amazon review sentiments with it:

In [15]:
# Training set: the two word-share columns are the features and the
# sentiment column is the label.
imdb_data = imdb[['positive', 'negative']]
imdb_target = imdb['sentiment']

# Create the Bernoulli Naive Bayes model and fit it on the IMDB reviews.
# NOTE: BernoulliNB binarizes its inputs (default threshold 0.0), so the model
# only sees whether each share is above zero.
classifier = BernoulliNB()
classifier.fit(imdb_data, imdb_target)

# Same feature/label split for the Amazon reviews.
amazon_data = amazon[['positive', 'negative']]
amazon_target = amazon['sentiment']

# Predict the Amazon sentiments with the IMDB-trained model.
y_amazon_pred = classifier.predict(amazon_data)

# Report how many predictions disagree with the true labels.
amazon_mislabeled = (amazon_target != y_amazon_pred).sum()
print("Number of mislabeled points out of a total {} points : {}".format(
    amazon.shape[0], amazon_mislabeled))

Number of mislabeled points out of a total 1000 points : 159


In [16]:
# Feature/label split for the Yelp reviews.
yelp_data = yelp[['positive', 'negative']]
yelp_target = yelp['sentiment']

# Predict the Yelp sentiments with the current `classifier`.
y_yelp_pred = classifier.predict(yelp_data)

# Report how many predictions disagree with the true labels.
yelp_mislabeled = (yelp_target != y_yelp_pred).sum()
print("Number of mislabeled points out of a total {} points : {}".format(
    yelp.shape[0], yelp_mislabeled))

Number of mislabeled points out of a total 1000 points : 185


### Yelp ###

In [17]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted labels, both in sorted
# label order (0 first, then 1).
confusion_matrix(y_true=yelp_target, y_pred=y_yelp_pred)

array([[432,  68],
       [117, 383]], dtype=int64)

This matrix has the true labels as rows and the predicted labels as columns, with class 0 (negative) first. It shows the counts for when a review was negative and we predicted negative (432), when a review was negative and we predicted positive (68), when a review was positive and we predicted negative (117), and when a review was positive and we predicted positive (383).

In [22]:
# Test your model with different holdout groups.

from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(yelp_data, yelp_target, test_size=0.2, random_state=20)

# Fit unfitted copies of `classifier` (same parameters) instead of the shared
# object, so this experiment does not overwrite the model that other cells use.
holdout_model = clone(classifier).fit(X_train, y_train)
print('With 20% Holdout: ' + str(holdout_model.score(X_test, y_test)))

# Train and score on the full Yelp sample for comparison.
in_sample_model = clone(classifier).fit(yelp_data, yelp_target)
print('Testing on Sample: ' + str(in_sample_model.score(yelp_data, yelp_target)))

With 20% Holdout: 0.85
Testing on Sample: 0.815


In [23]:
# 10-fold cross validation on the Yelp features; one accuracy score per fold
from sklearn.model_selection import cross_val_score

cross_val_score(
    estimator=classifier,
    X=yelp_data,
    y=yelp_target,
    cv=10,
)

array([0.83, 0.87, 0.79, 0.87, 0.83, 0.79, 0.82, 0.81, 0.74, 0.8 ])

### Amazon ###

In [24]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted labels, both in sorted
# label order (0 first, then 1).
confusion_matrix(y_true=amazon_target, y_pred=y_amazon_pred)

array([[439,  61],
       [ 98, 402]], dtype=int64)

In [31]:
# Test your model with different holdout groups.

from sklearn.base import clone
from sklearn.model_selection import train_test_split

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(amazon_data, amazon_target, test_size=0.2, random_state=20)

# Fit unfitted copies of `classifier` (same parameters) instead of the shared
# object, so this experiment does not overwrite the model that other cells use.
holdout_model = clone(classifier).fit(X_train, y_train)
print('With 20% Holdout: ' + str(holdout_model.score(X_test, y_test)))

# Train and score on the full Amazon sample for comparison.
in_sample_model = clone(classifier).fit(amazon_data, amazon_target)
print('Testing on Sample: ' + str(in_sample_model.score(amazon_data, amazon_target)))

With 20% Holdout: 0.855
Testing on Sample: 0.841


In [34]:
# 10-fold cross validation on the Amazon features; one accuracy score per fold
from sklearn.model_selection import cross_val_score

cross_val_score(
    estimator=classifier,
    X=amazon_data,
    y=amazon_target,
    cv=10,
)

array([0.82, 0.87, 0.89, 0.82, 0.91, 0.77, 0.85, 0.84, 0.83, 0.81])

# Conclusion

We are able to predict:
- 841 out of 1,000 Amazon review sentiments correctly when trained on IMDB reviews.
- 815 out of 1,000 Yelp review sentiments correctly when trained on IMDB reviews.