In [1]:
# Imports required for this cell.
from os import path

# Define some constant strings pointing to the UCI Amazon review data.
DATA_DIR = '../data'
UCI_DATA_DILE = 'uci_labelled_data/amazon_cells_labelled.txt'

# Read in sentiment laballed sentences UCI's Amazon data file.
uci_sentiment_data = []
with open(path.join(DATA_DIR, UCI_DATA_DILE), encoding='utf-8') as fileObj:
    uci_sentiment_data = [line.rstrip('\n') for line in fileObj]
    
# Split each line into a tuple containing the text in the 0th index and the sentiment label in the 1st index.
for index in range(0, len(uci_sentiment_data)):
    split_sentence = uci_sentiment_data[index].split('\t')
    uci_sentiment_data[index] = (split_sentence[0], split_sentence[1])
    
print('Read-in UCI sentiment-labelled Amazon data, count: {}.'.format(len(uci_sentiment_data)))

Read-in UCI sentiment-labelled Amazon data, count: 1000.


In [2]:
# Imports required for this cell.
import json
import gzip
from os import path

# Define some constant strings pointing to the UCSD Amazon review data.
UCSD_DATA_DIR = path.join('../data', 'ucsd_amazon_data')
UCSD_DATA_BEAUTY = 'All_Beauty_5.json.gz'
UCSD_DATA_FASHION = 'AMAZON_FASHION_5.json.gz'
UCSD_DATA_APPLIANCES = 'Appliances_5.json.gz'

# Read in JSON data from UCSD's Amazon data files -- Beauty data.
ucsd_beauty_data = []
with gzip.open(path.join(UCSD_DATA_DIR, UCSD_DATA_BEAUTY)) as json_gzip:
    for line in json_gzip:
        ucsd_beauty_data.append(json.loads(line))

# -- Beauty data.
ucsd_fashion_data = []
with gzip.open(path.join(UCSD_DATA_DIR, UCSD_DATA_FASHION)) as json_gzip:
    for line in json_gzip:
        ucsd_fashion_data.append(json.loads(line))

# -- Appliance data.
ucsd_appliances_data = []
with gzip.open(path.join(UCSD_DATA_DIR, UCSD_DATA_APPLIANCES)) as json_gzip:
    for line in json_gzip:
        ucsd_appliances_data.append(json.loads(line))
        
print('Read-in UCSD Amazon data. Beauty count: {}, Fashion count: {}, Appliances count: {}.'.format(len(ucsd_beauty_data), len(ucsd_fashion_data), len(ucsd_appliances_data)))

Read-in UCSD Amazon data. Beauty count: 5269, Fashion count: 3176, Appliances count: 2272.


In [6]:
# Import the brown corpus.
from nltk.corpus import brown as brown_corpus
# Collections imports related to this cell.
from collections import defaultdict, Counter

# Process the Brown corpus, storing it into a defaultDict. This allows for easy finding of a specific POS tag.
# Note: These keys are case sensitive so the key 'movie' differs from 'Movie'.
word_tags = defaultdict(Counter)
for word, pos in brown_corpus.tagged_words(tagset='universal'):
    word_tags[word][pos] += 1

In [7]:
# Imports related to this cell.
import random
import string

# Test on a random list of the UCSD Amazon data.
punct_table = str.maketrans(dict.fromkeys(string.punctuation))
for review_obj in ucsd_appliances_data[0:15]:
    # Save off the review text with no punctuation.
    review_text = review_obj['reviewText'].translate(punct_table)
    
    # If the review is less than 8 words or more than 200 words, skip it. 
    # This is an attempt to skip over reviews difficult to analyze.
    review_word_list = review_text.split(' ')
    if len(review_word_list) < 8 or len(review_word_list) > 200:
        continue
    
    # Create a map tracking the NN tag count for each word in the review.
    print('review_text: {}'.format(review_text))
    subject_map = defaultdict(Counter)
    for word in review_word_list:
        # Collect the POS tag counts for this word.
        word_tag_counts = word_tags[word]
        #print('   word: {} --> POS tag(s): {}'.format(word, word_tag_counts))
        
        # If this word has no POS tags, no more work is needed for this word.
        if 0 == len(word_tag_counts):
            #print('      No POS tags present for this word, skipping.')
            continue
        
        # Collect some data and update the subject_map.
        word_tag_sum = sum(word_tag_counts.values())
        percentage_noun = float(word_tag_counts['NOUN'] / word_tag_sum)
        subject_map[word] = word_tag_counts['NOUN']
        
        # Print some information about this word.
        #print('      NOUN count: {}, percentage: {}'.format(word_tag_counts['NOUN'], percentage_noun))
        
    # Do some initial guesswork on what the subject of the review may be.
    print('   subject_map: {}'.format(subject_map))
    print('   max word: {}\n\n\n'.format(max(subject_map)))

review_text: I like this as a vent as well as something that will keep house warmer in winter  I sanded it and then painted it the same color as the house  Looks great
   subject_map: defaultdict(<class 'collections.Counter'>, {'I': 5, 'like': 0, 'this': 0, 'as': 0, 'a': 1, 'vent': 7, 'well': 17, 'something': 420, 'that': 0, 'will': 99, 'keep': 3, 'house': 391, 'warmer': 0, 'in': 1, 'winter': 76, 'it': 0, 'and': 0, 'then': 0, 'painted': 0, 'the': 0, 'same': 0, 'color': 132, 'Looks': 1, 'great': 0})
   max word: winter



review_text: I purchasaed a new dryer and did not want to reuse the cord from my old unit This unit installed in a pretty straight forward manor Quality was as expected No Complaints
   subject_map: defaultdict(<class 'collections.Counter'>, {'I': 5, 'a': 1, 'new': 0, 'dryer': 3, 'and': 0, 'did': 0, 'not': 0, 'want': 9, 'to': 1, 'the': 0, 'cord': 6, 'from': 0, 'my': 0, 'old': 0, 'unit': 94, 'This': 0, 'installed': 0, 'in': 1, 'pretty': 0, 'straight': 0, 'forward': 0, '

In [31]:
# Cell related imports.
from nltk import FreqDist
import pandas as pd

# Get the most common adjectives in the brown corpus based on adjective tag counts.
adjectives_dist = FreqDist(word for (word, tag) in brown_corpus.tagged_words(tagset="universal") if tag == 'ADJ')
common_adjectives = adjectives_dist.most_common(160)
common_adjectives_no_count = [pair[0] for pair in common_adjectives]

# Create a frequency distribution of words occurring in positive/negative reviews.
negative_review_adjs = defaultdict(int)
positive_review_adjs = defaultdict(int)
for review_text, sentiment in uci_sentiment_data:
    # Split the review into a list of words.
    review_word_list = review_text.lower().split(' ')
    
    # Negative sentiment reviews.
    if sentiment == '0':
        for word in review_word_list:
            if word in common_adjectives_no_count:
                negative_review_adjs[word] += 1
    # Positive sentiment reviews.
    elif sentiment == '1':
        for word in review_word_list:
            if word in common_adjectives_no_count:
                positive_review_adjs[word] += 1
    # Uncaught issue.
    else:
        print('WARNING: Unrecognized sentiment label: {}'.format(sentiment))
        continue
        
# Create sorted lists showing the most common words for both review types.
negative_review_adjs_list = FreqDist(negative_review_adjs).most_common(100)
positive_review_adjs_list = FreqDist(positive_review_adjs).most_common(100)

# Remove words that are common in both lists.
negative_review_adjs_no_count = [pair[0] for pair in negative_review_adjs_list]
positive_review_adjs_no_count = [pair[0] for pair in positive_review_adjs_list]
joint_adjs_list = set(negative_review_adjs_no_count) & set(positive_review_adjs_no_count)

# Convert these lists to dictionary for ease of access.
negative_review_adjs = defaultdict(int)
positive_review_adjs = defaultdict(int)
for word, count in negative_review_adjs_list:
    negative_review_adjs[word] = count
for word, count in positive_review_adjs_list:
    positive_review_adjs[word] = count

# Remove the joint adjectives list from whichever list features this word less often.
for joint_adj in joint_adjs_list:
    # If the counts in both positive and negative are close, remove from both.
    if negative_review_adjs[joint_adj] > positive_review_adjs[joint_adj]:
        positive_review_adjs.pop(joint_adj)
    elif negative_review_adjs[joint_adj] < positive_review_adjs[joint_adj]:
        negative_review_adjs.pop(joint_adj)
    else:
        positive_review_adjs.pop(joint_adj)
        negative_review_adjs.pop(joint_adj)

# Print out the common adjectives for both reivew sentiments.
print('Adjectives common in positive reviews: {}\n'.format(positive_review_adjs))
print('Adjectives common in negative reviews: {}\n'.format(negative_review_adjs))

Adjectives common in positive reviews: defaultdict(<class 'int'>, {'very': 69, 'great': 62, 'good': 53, 'best': 19, 'happy': 13, 'easy': 12, 'better': 11, 'new': 10, 'clear': 7, 'much': 7, 'fine': 6, 'other': 6, 'long': 6, 'little': 5, 'original': 4, 'several': 4, 'most': 4, 'free': 4, 'small': 4, 'sure': 3, 'able': 3, 'many': 3, 'simple': 3, 'own': 3, 'beautiful': 2, 'whole': 2, 'basic': 1, 'available': 1, 'entire': 1, 'open': 1})

Adjectives common in negative reviews: defaultdict(<class 'int'>, {'only': 17, 'first': 13, 'bad': 11, 'same': 11, 'enough': 10, 'poor': 9, 'few': 9, 'more': 8, 'right': 6, 'difficult': 6, 'last': 5, 'black': 4, 'old': 4, 'real': 4, 'different': 3, 'big': 3, 'dead': 3, 'important': 3, 'hard': 3, 'particular': 2, 'such': 2, 'strong': 2, 'wrong': 2, 'greater': 1, 'white': 1, 'due': 1, 'industrial': 1, 'ready': 1, 'certain': 1})



In [32]:
# Analyze the sentiment-labelled data and predict sentiment.
prediction_list = []
sentiment_list = []
for review_text, sentiment in uci_sentiment_data:
    # Track sentiment prediction.
    prediction = 'No data'
    
    # Loop through each word in the review_text.
    review_word_list = review_text.lower().split(' ')
    for word in review_word_list:
        if word in negative_review_adjs:
            if isinstance(prediction, str):
                prediction = 0
            prediction -= 1
        elif word in positive_review_adjs:
            if isinstance(prediction, str):
                prediction = 0
            prediction += 1
    
    # Convert the prediction scheme to match the sentiment 0/1 labels.
    if isinstance(prediction, str):
        pass
    elif prediction > 0:
        prediction = '1'
    elif prediction < 0:
        prediction = '0'
    elif prediction == 0:
        prediction = 'No prediction'
        
    # Save off the prediction and sentiment into lists.
    prediction_list.append(prediction)
    sentiment_list.append(sentiment)

# Print the confusion matrix.
sentiment_list = pd.Series(sentiment_list, name='Actual')
prediction_list = pd.Series(prediction_list, name='Predicted')
pd_confusion = pd.crosstab(sentiment_list, prediction_list)
print('Confusion matrix: \n{}\n'.format(pd_confusion))

# Print the misclassification rate (0/1 loss) and cannot classify rate.
missclassification_count = 0
unclassification_count = 0
for index in range(len(prediction_list)):
    if prediction_list[index] == 'No data':
        unclassification_count += 1
    elif prediction_list[index] != sentiment_list[index]:
        missclassification_count += 1

missclassified_rate = float(1/(len(uci_sentiment_data) - unclassification_count) * missclassification_count)
missclassified_rate_with_na = float(1/len(uci_sentiment_data) * (missclassification_count + unclassification_count))
unclassified_rate = float(1/len(uci_sentiment_data) * unclassification_count)

print('Missclassification rate (N/d not counted): {:.3f}'.format(missclassified_rate))
print('Missclassification rate (N/d counted)    : {:.3f}'.format(missclassified_rate_with_na))
print('No data rate                             : {:.3f}'.format(unclassified_rate))
print('NOTE: N/d --> No data')
print()

Confusion matrix: 
Predicted    0    1  No data  No prediction
Actual                                     
0          106   68      310             16
1           15  245      229             11

Missclassification rate (N/d not counted): 0.239
Missclassification rate (N/d counted)    : 0.649
No data rate                             : 0.539
NOTE: N/d --> No data



In [33]:
# Create a list of the appliances Amazon review dataset. First, create a list of sentiment labelled reviews.
processed_data = list()
neg_count = 0
pos_count = 0
for review in ucsd_fashion_data:
    # Skip over incomplete reviews.
    if 'reviewText' not in review:
        continue
    
    # Once we have an evenly distributed review list we can exit the processing.
    if neg_count == 300 and pos_count == 300:
        break
    
    # Process the review, saving into a (review_text, sentiment 0/1) tuple.
    sentiment_label = -1
    if (review['overall'] == 1.0 or review['overall'] == 2.0) and neg_count < 300:
        sentiment_label = '0'
        neg_count += 1
    elif (review['overall'] == 4.0 or review['overall'] == 5.0) and pos_count < 300:
        sentiment_label = '1'
        pos_count += 1
    else:
        continue
    
    # Append this review to our list.
    processed_data.append((review['reviewText'].lower(), sentiment_label))

print('Finished processing {} ucsd_appliances_data.'.format(len(processed_data)))
    
# Analyze the Amazon appliance data and predict sentiment.
prediction_list = []
sentiment_list = []
for review_text, sentiment in processed_data:
    # Track sentiment prediction.
    prediction = 'No data'
    
    # Loop through each word in the review_text.
    review_word_list = review_text.lower().split(' ')
    for word in review_word_list:
        if word in negative_review_adjs:
            if isinstance(prediction, str):
                prediction = 0
            prediction -= 1
        elif word in positive_review_adjs:
            if isinstance(prediction, str):
                prediction = 0
            prediction += 1
    
    # Convert the prediction scheme to match the sentiment 0/1 labels.
    if isinstance(prediction, str):
        pass
    elif prediction > 0:
        prediction = '1'
    elif prediction < 0:
        prediction = '0'
    elif prediction == 0:
        prediction = 'No prediction'
        
    # Save off the prediction and sentiment into lists.
    prediction_list.append(prediction)
    sentiment_list.append(sentiment)
    
# Print the confusion matrix.
sentiment_list = pd.Series(sentiment_list, name='Actual')
prediction_list = pd.Series(prediction_list, name='Predicted')
pd_confusion = pd.crosstab(sentiment_list, prediction_list)
print('Confusion matrix: \n{}\n'.format(pd_confusion))

# Print the misclassification rate (0/1 loss) and cannot classify rate.
missclassification_count = 0
unclassification_count = 0
for index in range(len(prediction_list)):
    if prediction_list[index] == 'No data':
        unclassification_count += 1
    elif prediction_list[index] != sentiment_list[index]:
        missclassification_count += 1

missclassified_rate = float(1/(len(processed_data) - unclassification_count) * missclassification_count)
missclassified_rate_with_na = float(1/len(processed_data) * (missclassification_count + unclassification_count))
unclassified_rate = float(1/len(processed_data) * unclassification_count)

print('Missclassification rate (N/d not counted): {:.3f}'.format(missclassified_rate))
print('Missclassification rate (N/d counted)    : {:.3f}'.format(missclassified_rate_with_na))
print('No data rate                             : {:.3f}'.format(unclassified_rate))
print('NOTE: N/d --> No data')
print()

Finished processing 510 ucsd_appliances_data.
Confusion matrix: 
Predicted   0    1  No data  No prediction
Actual                                    
0          84   43       73             10
1          14  155      121             10

Missclassification rate (N/d not counted): 0.244
Missclassification rate (N/d counted)    : 0.531
No data rate                             : 0.380
NOTE: N/d --> No data

