In [73]:
# Import the modules we need.
import nltk
import numpy as np
import os
import pandas as pd
import re
import string

from nltk.stem.snowball import SnowballStemmer

# Paths for the data directory and processed file.
data_directory = os.path.join('.', 'data')
if not os.path.exists(data_directory):
    print("[ERROR] data directory ('{}') does not exist".format(data_directory))

reviews_file_path = os.path.join(data_directory, 'Reviews.csv')

# Positive and negative word files.
positive_words_file_path = os.path.join('.', 'positive_words.txt')
negative_words_file_path = os.path.join('.', 'negative_words.txt')

In [74]:
# Read the reviews into a dataframe.
from IPython.display import display

column_dtypes = {'productId': str, 'userId': str, 'profileName': str, 'helpfulness': str,
                 'score': np.int64, 'time': np.int64, 'summary': str, 'text': str, 
                 'helpfulness_numerator':np.int64, 'helpfulness_denominator':np.int64}

# For this dataset, 'quoting' must be set to QUOTE_ALL (1) and the quotechar to a pipe (|).
# The problem is that values in some 'text' fields begin with a ", but don't end with one,
# and many review texts contain commas, unbalanced quotes and apostrophes.
review_df = pd.read_table(reviews_file_path, delimiter=',', encoding="UTF-8", dtype=column_dtypes, 
                          quoting=1, quotechar='|', engine="c", skip_blank_lines=True, 
                          error_bad_lines=False, warn_bad_lines=True)

# It takes just under 3 minutes to do 100,000 rows.
# Estimate it will take 15 minutes to do all 545,000 records.
review_df = review_df[0:100000]
display(review_df.info())
review_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
Unnamed: 0                 100000 non-null int64
productId                  100000 non-null object
userId                     100000 non-null object
profileName                99996 non-null object
helpfulness                100000 non-null object
score                      100000 non-null int64
time                       100000 non-null int64
summary                    99998 non-null object
text                       100000 non-null object
helpfulness_numerator      100000 non-null int64
helpfulness_denominator    100000 non-null int64
date                       100000 non-null object
dtypes: int64(5), object(7)
memory usage: 9.2+ MB


None

Unnamed: 0.1,Unnamed: 0,productId,userId,profileName,helpfulness,score,time,summary,text,helpfulness_numerator,helpfulness_denominator,date
0,0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1/1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1,1,2011-04-27
1,1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0/0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,0,0,2012-09-07
2,2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1/1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1,1,2008-08-18
3,3,B000UA0QIQ,A395BORC6FGVXV,Karl,3/3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,3,3,2011-06-13
4,4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0/0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,0,0,2012-10-21


In [75]:
# Read the positive and negative word lists.
def read_words_from_file(filepath):
    word_list = []
    with open(filepath, "rt") as words_file:
        word_list = [w.strip().lower() for w in words_file]
        
    return word_list

positive_words = read_words_from_file(positive_words_file_path)
display(positive_words[0:20])
print("Found {} positive words.".format(len(positive_words)))

negative_words = read_words_from_file(negative_words_file_path)
display(negative_words[0:20])
print("Found {} negative words.".format(len(negative_words)))


# Stem the words and create lists of unique stems.
stemmer = SnowballStemmer("english")

positive_stems = set([stemmer.stem(w) for w in positive_words if w is not ''])
display(list(positive_stems)[0:10])
print("Found {} positive stems.".format(len(positive_stems)))

negative_stems = set([stemmer.stem(w) for w in negative_words if w is not ''])
display(list(negative_stems)[0:10])
print("Found {} negative stems.".format(len(negative_stems)))

['a+',
 'abound',
 'abounds',
 'abundance',
 'abundant',
 'accessable',
 'accessible',
 'acclaim',
 'acclaimed',
 'acclamation',
 'accolade',
 'accolades',
 'accommodative',
 'accomodative',
 'accomplish',
 'accomplished',
 'accomplishment',
 'accomplishments',
 'accurate',
 'accurately']

Found 4013 positive words.


['2-faced',
 '2-faces',
 'abnormal',
 'abolish',
 'abominable',
 'abominably',
 'abominate',
 'abomination',
 'abort',
 'aborted',
 'aborts',
 'abrade',
 'abrasive',
 'abrupt',
 'abruptly',
 'abscond',
 'absence',
 'absent-minded',
 'absentee',
 'absurd']

Found 9568 negative words.


['fave',
 'privileg',
 'fast-pac',
 'hands-down',
 'crisp',
 'supurb',
 'heartfelt',
 'astut',
 'invent',
 'deginifi']

Found 1281 positive stems.


['loneli',
 'underdog',
 'precari',
 'silli',
 'yawn',
 'coward',
 'mockeri',
 'disbelief',
 'constern',
 'greed']

Found 2961 negative stems.


In [76]:
# Define these once so they're not evaluated each time is_desirable_word() is called.
punctuation_pattern = r'[!,&#\[\]\(\)”\"’`‘\'\.:;\*\$<>\?%\-_\\/–“—¿½]+'
number_pattern = r'[0-9]+'


# Utility function for determining whether a word is one which we want to keep or not.
#
# Undesirable words are composed of punctuation, or digits, or are not even strings.
#
# returns - True if the word should be kept, otherwise False
def is_desirable_word(w):
    # Start with some quick tests.
    if type(w) is not str:
        return False

    if 0 == len(w):
        return False

    # Remove single punctuation marks.
    if w in string.punctuation:
        return False

    # Remove single digits.
    if w.isdigit():
        return False

    # Remove numbers.
    if re.match(number_pattern, w):
        return False
    
    # Remove words that consist entirely of punctuation. 
    if re.match(punctuation_pattern, w):
        return False
    
    return True


# Utility function to turn a block of text into a list of unique, "desirable" tokens.
#
# Desirability is determined by the is_desirable_word() function.
#
# raw_text - text to tokenize
#
# returns - the list of unique, desirable tokens
def get_canonical_tokens(raw_text):
    if type(raw_text) is not str:
        return []
    
    if 0 == len(raw_text):
        return []
    
    # Use NLTK to tokenize.
    token_list = nltk.wordpunct_tokenize(raw_text)
    
    # Lowercase the words and strip those that are only punctuation etc.
    token_list = set([w.lower() for w in token_list if is_desirable_word(w)])
    return list(token_list)


verbose = False
text_stem_list = []
summary_stem_list = []
for i, row in review_df.iterrows():
    if i > 1000000:
        # Development aid - limit number of rows processed.
        # Set to a small number to speed up development.
        break
    text = row['text']
    summary = row['summary']
    
    text_tokens = get_canonical_tokens(text)
    summary_tokens = get_canonical_tokens(summary)
    
    #review_df.loc[i, 'summary_stems'] = " ".join(summary_tokens)
    summary_stems = list(set([stemmer.stem(w) for w in summary_tokens]))
    summary_stem_list.append(" ".join(summary_stems))
    
    #review_df.loc[i, 'text_stems'] = " ".join(text_tokens)
    text_stems = list(set([stemmer.stem(w) for w in text_tokens]))
    text_stem_list.append(" ".join(text_stems))
    
    if verbose:
        #print("Row {} ({}) has text: {}".format(i, summary, text))
        print("Row {} has summary tokens: {}".format(i, summary_tokens))
        print("Row {} has summary stems: {}".format(i, summary_stems))
        print("Row {} has text tokens: {}".format(i, text_tokens))
        print("Row {} has text stems: {}".format(i, text_stems))


In [77]:
# Append the list of summary and text stems to the dataframe as extra columns.
review_df = review_df.assign(summary_stems=pd.Series(summary_stem_list).values)
review_df = review_df.assign(text_stems=pd.Series(text_stem_list).values)

display(review_df.info())
display(review_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
Unnamed: 0                 100000 non-null int64
productId                  100000 non-null object
userId                     100000 non-null object
profileName                99996 non-null object
helpfulness                100000 non-null object
score                      100000 non-null int64
time                       100000 non-null int64
summary                    99998 non-null object
text                       100000 non-null object
helpfulness_numerator      100000 non-null int64
helpfulness_denominator    100000 non-null int64
date                       100000 non-null object
summary_stems              100000 non-null object
text_stems                 100000 non-null object
dtypes: int64(5), object(9)
memory usage: 10.7+ MB


None

Unnamed: 0.1,Unnamed: 0,productId,userId,profileName,helpfulness,score,time,summary,text,helpfulness_numerator,helpfulness_denominator,date,summary_stems,text_stems
0,0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1/1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1,1,2011-04-27,food qualiti dog good,most smell to more can dog good meat found the...
1,1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0/0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,0,0,2012-09-07,not as advertis,peanut to label was if not error intend vendor...
2,2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1/1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1,1,2008-08-18,say it delight all,witch s out around to filbert heaven gelatin c...
3,3,B000UA0QIQ,A395BORC6FGVXV,Karl,3/3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,3,3,2011-06-13,cough medicin,to was addit good found if it beer order ingre...
4,4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0/0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,0,0,2012-10-21,great taffi,great this quick there at your deal wide was d...


In [78]:
# Compute the number of positive and negative words in the text stems.
positive_count_summary_list = []
positive_count_text_list = []

negative_count_summary_list = []
negative_count_text_list = []

sentiment_summary_list = []
sentiment_text_list = []

for i, row in review_df.iterrows():
    # Summaries.
    summary_stems = row['summary_stems'].split(" ")

    positive_count = len(positive_stems.intersection(summary_stems))
    negative_count = len(negative_stems.intersection(summary_stems))

    positive_count_summary_list.append(positive_count)
    negative_count_summary_list.append(negative_count)
    
    if positive_count > negative_count:
        sentiment_summary_list.append('positive')
    elif positive_count > negative_count:
        sentiment_summary_list.append('negative')
    else:
        sentiment_summary_list.append('neutral')

    if verbose:
        print("Summary stem matches: {} +ve and {} -ve".format(positive_count, negative_count))

    
    # Text.
    text_stems = row['text_stems'].split(" ")

    positive_count = len(positive_stems.intersection(text_stems))
    negative_count = len(negative_stems.intersection(text_stems))

    positive_count_text_list.append(positive_count)
    negative_count_text_list.append(negative_count)
    
    if positive_count > negative_count:
        sentiment_text_list.append('positive')
    elif positive_count > negative_count:
        sentiment_text_list.append('negative')
    else:
        sentiment_text_list.append('neutral')

    if verbose:
        print("Text stem matches: {} +ve and {} -ve".format(positive_count, negative_count))


# Append the list of summary and text stems to the dataframe as extra columns.
review_df = review_df.assign(positive_summary_count=pd.Series(positive_count_summary_list).values)
review_df = review_df.assign(negative_summary_count=pd.Series(negative_count_summary_list).values)
review_df = review_df.assign(summary_sentiment=pd.Series(sentiment_summary_list).values)

review_df = review_df.assign(positive_text_count=pd.Series(positive_count_text_list).values)
review_df = review_df.assign(negative_text_count=pd.Series(negative_count_text_list).values)
review_df = review_df.assign(text_sentiment=pd.Series(sentiment_text_list).values)


display(review_df.info())
display(review_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 20 columns):
Unnamed: 0                 100000 non-null int64
productId                  100000 non-null object
userId                     100000 non-null object
profileName                99996 non-null object
helpfulness                100000 non-null object
score                      100000 non-null int64
time                       100000 non-null int64
summary                    99998 non-null object
text                       100000 non-null object
helpfulness_numerator      100000 non-null int64
helpfulness_denominator    100000 non-null int64
date                       100000 non-null object
summary_stems              100000 non-null object
text_stems                 100000 non-null object
positive_summary_count     100000 non-null int64
negative_summary_count     100000 non-null int64
summary_sentiment          100000 non-null object
positive_text_count        100000 non-null int64


None

Unnamed: 0.1,Unnamed: 0,productId,userId,profileName,helpfulness,score,time,summary,text,helpfulness_numerator,helpfulness_denominator,date,summary_stems,text_stems,positive_summary_count,negative_summary_count,summary_sentiment,positive_text_count,negative_text_count,text_sentiment
0,0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1/1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1,1,2011-04-27,food qualiti dog good,most smell to more can dog good meat found the...,1,1,neutral,5,5,neutral
1,1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0/0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,0,0,2012-09-07,not as advertis,peanut to label was if not error intend vendor...,0,0,neutral,1,1,neutral
2,2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1/1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1,1,2008-08-18,say it delight all,witch s out around to filbert heaven gelatin c...,1,0,positive,4,0,positive
3,3,B000UA0QIQ,A395BORC6FGVXV,Karl,3/3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,3,3,2011-06-13,cough medicin,to was addit good found if it beer order ingre...,0,0,neutral,3,1,positive
4,4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0/0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,0,0,2012-10-21,great taffi,great this quick there at your deal wide was d...,1,0,positive,2,0,positive
