In [1]:
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
from scipy.stats import rankdata, hmean, norm
import spacy.en
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
# Inject a CSS rule that sets the notebook's `.container` element to 70% width.
notebook_css = "<style>.container { width:70% !important; }</style>"
display(HTML(notebook_css))

Using TensorFlow backend.


In [2]:
# `nlp` is the callable applied to every review's text further down to produce
# the parsed documents the corpus is built from.
nlp = spacy.en.English()
# If the spaCy English model cannot be loaded, uncomment the following line to
# fall back to scattertext's regex-based parser instead:
# nlp = st.whitespace_nlp_with_sentences

In [3]:
# The CSV has no header row, so column names and dtypes are supplied explicitly.
review_columns = ['productName', 'title', 'fullText', 'rating']
review_dtypes = {
    'productName': object,
    'title': object,
    'fullText': object,
    'rating': np.int32,
}
df = pd.read_csv('amazon_mp3.csv', header=None, names=review_columns, dtype=review_dtypes)

In [4]:
# Ratings of 2 or lower are labelled "bad-review"; everything else is "good-review".
df['category'] = np.where(df['rating'] <= 2, "bad-review", "good-review")

In [5]:
# Run the `nlp` parser over every review body and keep the result alongside the text.
df['parsed'] = df['fullText'].map(nlp)

In [6]:
# Build the scattertext corpus from the pre-parsed reviews, using the
# 'category' column as the class label and 'parsed' as the document column.
corpus_factory = st.CorpusFromParsedDocuments(
    df,
    category_col='category',
    parsed_col='parsed',
)
corpus = corpus_factory.build()

In [7]:
# Term-frequency table: one row per term, with a '<category> freq' count column
# for each of the two review categories.
term_freq_df = corpus.get_term_freq_df()
bad_freq = term_freq_df['bad-review freq']
good_freq = term_freq_df['good-review freq']

# Precision: the share of a term's total occurrences that fall in bad reviews.
term_freq_df['bad-review_precision'] = bad_freq * 1. / (bad_freq + good_freq)
# Frequency pct: the term's share of all term occurrences counted in bad reviews.
term_freq_df['bad-review_freq_pct'] = bad_freq * 1. / bad_freq.sum()

# Harmonic mean of the two statistics, computed for the whole column at once
# (2ab / (a + b)) instead of calling scipy's hmean once per row via
# DataFrame.apply(axis=1). Rows where either statistic is not strictly
# positive (including NaN from a 0/0 precision) score 0, as before.
precision = term_freq_df['bad-review_precision']
freq_pct = term_freq_df['bad-review_freq_pct']
both_positive = (precision > 0) & (freq_pct > 0)
term_freq_df['bad-review_hmean'] = (
    2. * precision * freq_pct / (precision + freq_pct)
).where(both_positive, 0.)

# Show the ten highest-scoring terms.
term_freq_df.sort_values(by='bad-review_hmean', ascending=False).head(10)

Unnamed: 0_level_0,bad-review freq,good-review freq,bad-review_precision,bad-review_freq_pct,bad-review_hmean
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
the,41210,162514,0.202283,0.025651,0.045529
i,29888,112727,0.209571,0.018604,0.034174
to,25096,86564,0.224754,0.015621,0.029212
it,24551,88762,0.216665,0.015282,0.02855
and,20704,85337,0.195245,0.012887,0.024179
a,18642,74371,0.200424,0.011604,0.021937
is,11017,59806,0.155557,0.006858,0.013136
of,10726,41624,0.20489,0.006676,0.012932
this,9995,35129,0.221501,0.006221,0.012103
for,9473,39025,0.195328,0.005897,0.011447


In [8]:
def normcdf(x):
    """Evaluate a fitted normal CDF at every value of ``x``.

    The normal distribution is parameterised by ``x``'s own ``mean()`` and
    ``std()`` (whatever defaults those methods have for the input type), and
    its CDF is then evaluated at each element of ``x``.

    Args:
        x: array-like exposing ``mean()`` and ``std()`` (e.g. a pandas Series).

    Returns:
        The value of ``scipy.stats.norm.cdf`` at each element of ``x``.
    """
    center = x.mean()
    spread = x.std()
    return norm.cdf(x, loc=center, scale=spread)
# Pass both raw statistics through normcdf before combining them.
precision_cdf = normcdf(term_freq_df['bad-review_precision'])
freq_pct_cdf = normcdf(term_freq_df['bad-review_freq_pct'])
term_freq_df['bad-review_precision_normcdf'] = precision_cdf
term_freq_df['bad-review_freq_pct_normcdf'] = freq_pct_cdf

# Scaled F-score: element-wise harmonic mean of the two normcdf-transformed columns.
term_freq_df['bad-review_scaled_f_score'] = hmean([precision_cdf, freq_pct_cdf])

# Show the ten highest-scoring terms.
term_freq_df.sort_values(by='bad-review_scaled_f_score', ascending=False).head(10)

Unnamed: 0_level_0,bad-review freq,good-review freq,bad-review_precision,bad-review_freq_pct,bad-review_hmean,bad-review_precision_normcdf,bad-review_freq_pct_normcdf,bad-review_scaled_f_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
refund,329,65,0.835025,0.000205,0.000409,0.95027,0.99917,0.974107
stopped working,349,109,0.762009,0.000217,0.000434,0.926461,0.999578,0.961632
repair,304,103,0.746929,0.000189,0.000378,0.920602,0.998159,0.957813
junk,257,86,0.749271,0.00016,0.00032,0.921534,0.99288,0.955878
worst,258,99,0.722689,0.000161,0.000321,0.910453,0.993068,0.949968
a refund,167,30,0.847716,0.000104,0.000208,0.953697,0.943351,0.948496
returning,259,105,0.711538,0.000161,0.000322,0.905473,0.993252,0.947334
not buy,227,86,0.72524,0.000141,0.000283,0.911564,0.984689,0.946717
stopped,528,237,0.690196,0.000329,0.000657,0.895377,1.0,0.944801
waste,335,150,0.690722,0.000209,0.000417,0.895635,0.99932,0.944641


In [9]:
# Attach scattertext's corner score for the bad-review category and list
# the ten terms that rank highest on it.
corner_col = 'bad-review_corner_score'
term_freq_df[corner_col] = corpus.get_corner_scores('bad-review')
term_freq_df.sort_values(by=corner_col, ascending=False).head(10)

Unnamed: 0_level_0,bad-review freq,good-review freq,bad-review_precision,bad-review_freq_pct,bad-review_hmean,bad-review_precision_normcdf,bad-review_freq_pct_normcdf,bad-review_scaled_f_score,bad-review_corner_score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
lol lol,22,0,1.0,1.4e-05,2.7e-05,0.981862,0.573674,0.724211,0.971721
update complete,18,0,1.0,1.1e-05,2.2e-05,0.981862,0.558503,0.712004,0.971593
updating ipod,18,0,1.0,1.1e-05,2.2e-05,0.981862,0.558503,0.712004,0.971593
complete waste,16,0,1.0,1e-05,2e-05,0.981862,0.550884,0.705781,0.971495
complete updating,16,0,1.0,1e-05,2e-05,0.981862,0.550884,0.705781,0.971495
were unable,13,0,1.0,8e-06,1.6e-05,0.981862,0.539421,0.696303,0.971272
gb stone,12,0,1.0,7e-06,1.5e-05,0.981862,0.535592,0.693105,0.971162
problem came,12,0,1.0,7e-06,1.5e-05,0.981862,0.535592,0.693105,0.971162
a fatal,11,0,1.0,7e-06,1.4e-05,0.981862,0.53176,0.689888,0.97102
mailed the,11,0,1.0,7e-06,1.4e-05,0.981862,0.53176,0.689888,0.97102


In [10]:
# Start again from a fresh term-frequency table and attach scattertext's own
# scaled F-scores for each category.
term_freq_df = corpus.get_term_freq_df()
score_columns = (
    ('Good Review Score', 'good-review'),
    ('Bad Review Score', 'bad-review'),
)
for score_col, category_label in score_columns:
    term_freq_df[score_col] = corpus.get_scaled_f_scores(category_label)

# Print the ten highest-scoring terms for each category, bad reviews first.
report_sections = (
    ("Top 10 Bad Review terms", 'Bad Review Score'),
    ("Top 10 Good Review terms", 'Good Review Score'),
)
for heading, score_col in report_sections:
    ranked = term_freq_df.sort_values(by=score_col, ascending=False)
    print(heading)
    pprint(list(ranked.index[:10]))

Top 10 Bad Review terms
['refund',
 'a refund',
 'of junk',
 'stopped working',
 'repair',
 'junk',
 'sent it',
 'worst',
 'not buy',
 'not recommend']
Top 10 Good Review terms
['love it',
 'very easy',
 'is perfect',
 'love this',
 'loves it',
 'loves',
 'only complaint',
 'is easy',
 'is amazing',
 'highly recommend']


In [11]:
# Render the interactive explorer for bad vs. good reviews using the
# `st.Scalers.scale` axis transform.
html = produce_scattertext_explorer(corpus,
                                    category='bad-review',
                                    category_name='Bad Review',
                                    not_category_name='Good Review',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    pmi_threshold_coefficient=10,
                                    transform=st.Scalers.scale)
file_name = 'output/ReviewsScattertextScale.html'
# Make sure the target directory exists so a fresh checkout can run this cell.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# Write through a context manager so the handle is flushed and closed before
# the IFrame below tries to load the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [12]:
# Render the interactive explorer for bad vs. good reviews using the
# `st.Scalers.log_scale_standardize` axis transform.
# `category` must name the class that `category_name` labels: the original
# passed 'good-review' while labelling it 'Bad Review', which inverted the
# labels relative to the data. It now matches the other explorer cells.
html = produce_scattertext_explorer(corpus,
                                    category='bad-review',
                                    category_name='Bad Review',
                                    not_category_name='Good Review',
                                    minimum_term_frequency=5,
                                    pmi_threshold_coefficient=10,
                                    width_in_pixels=1000,
                                    transform=st.Scalers.log_scale_standardize)
file_name = 'output/ReviewsScattertextLog.html'
# Make sure the target directory exists so a fresh checkout can run this cell.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# Write through a context manager so the handle is flushed and closed before
# the IFrame below tries to load the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

In [14]:
# Render the interactive explorer for bad vs. good reviews using the
# `st.Scalers.percentile` axis transform.
html = produce_scattertext_explorer(corpus,
                                    category='bad-review',
                                    category_name='Bad Review',
                                    not_category_name='Good Review',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    pmi_threshold_coefficient=10,
                                    transform=st.Scalers.percentile)
file_name = 'output/ReviewsScattertextRankData.html'
# Make sure the target directory exists so a fresh checkout can run this cell.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# Write through a context manager so the handle is flushed and closed before
# the IFrame below tries to load the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)