In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
from textblob import TextBlob

%matplotlib inline

# Read the review records from disk and preview the first rows.
reviews_path = './data/temp_pre/temple__12.json'
df = pd.read_json(reviews_path)
df.head()

Unnamed: 0,cleaned_text,date,review,stars,title,userid,userurl,version
0,many ads many ads delete app review,2019-03-11,Too many ads. Will delete this app after review.,1 star,Too many ads.,125734925,https://itunes.apple.com/WebObjects/MZStore.wo...,Version 1.9.6
1,temple run fun grinning face smiling eyes,2019-03-03,It is fun😄,5 stars,Temple run,986532366,https://itunes.apple.com/WebObjects/MZStore.wo...,Version 1.9.5
2,temple run play,2019-02-25,Play,5 stars,Temple run,435105632,https://itunes.apple.com/WebObjects/MZStore.wo...,Version 1.9.5
3,game bad jump chasing cloths hate pouting face...,2018-11-17,Cloths I hate is😡😡😡😡😡😡😡😡,1 star,This game is so bad all you do is jump you can...,136138572,https://itunes.apple.com/WebObjects/MZStore.wo...,Version 1.9.4
4,good game bit doggy grimacing face grinning face,2018-09-25,It was a good game but it was a bit too doggy 😬😀,4 stars,Both,899941082,https://itunes.apple.com/WebObjects/MZStore.wo...,Version 1.9.2


In [2]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450522 entries, 0 to 450521
Data columns (total 8 columns):
cleaned_text    450522 non-null object
date            450522 non-null datetime64[ns]
review          450522 non-null object
stars           450522 non-null object
title           450522 non-null object
userid          450522 non-null int64
userurl         450522 non-null object
version         450522 non-null object
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 27.5+ MB


In [3]:
def sent(text):
    """Return the polarity component of TextBlob's sentiment for ``text``.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    The first field (polarity) of ``TextBlob(text).sentiment``.
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity

In [4]:
# Score the raw review, its title and the cleaned text with sent().
# Series.map feeds the column's values straight to the scorer, so no per-row
# Series is built as with DataFrame.apply(axis=1); the results are the same.
df['review_sentiment'] = df['review'].map(sent)
df['title_sentiment'] = df['title'].map(sent)
df['cleaned_review_sentiment'] = df['cleaned_text'].map(sent)

In [5]:
# Turn labels such as "1 star" / "5 stars" into integers. Casting to str first
# keeps the cell idempotent: re-running it after `stars` is already numeric
# used to fail because re.search() cannot scan an int.
# A label without any digit still fails loudly (NaN cannot be cast to int).
df['stars'] = df['stars'].astype(str).str.extract(r'(\d+)', expand=False).astype(int)

# Combined polarity: review-body polarity plus title polarity.
df["polsum"] = df['review_sentiment'] + df['title_sentiment']

# Calendar month of each review, used for the per-month aggregation.
df['month_year'] = df['date'].dt.to_period('M')

In [6]:
# Preview the frame again, now with the sentiment, polsum and month_year columns.
df.head()

Unnamed: 0,cleaned_text,date,review,stars,title,userid,userurl,version,review_sentiment,title_sentiment,cleaned_review_sentiment,polsum,month_year
0,many ads many ads delete app review,2019-03-11,Too many ads. Will delete this app after review.,1,Too many ads.,125734925,https://itunes.apple.com/WebObjects/MZStore.wo...,Version 1.9.6,0.5,0.5,0.5,1.0,2019-03
1,temple run fun grinning face smiling eyes,2019-03-03,It is fun😄,5,Temple run,986532366,https://itunes.apple.com/WebObjects/MZStore.wo...,Version 1.9.5,0.0,0.0,0.3,0.0,2019-03
2,temple run play,2019-02-25,Play,5,Temple run,435105632,https://itunes.apple.com/WebObjects/MZStore.wo...,Version 1.9.5,0.0,0.0,0.0,0.0,2019-02
3,game bad jump chasing cloths hate pouting face...,2018-11-17,Cloths I hate is😡😡😡😡😡😡😡😡,1,This game is so bad all you do is jump you can...,136138572,https://itunes.apple.com/WebObjects/MZStore.wo...,Version 1.9.4,-0.8,-0.55,-0.633333,-1.35,2018-11
4,good game bit doggy grimacing face grinning face,2018-09-25,It was a good game but it was a bit too doggy 😬😀,4,Both,899941082,https://itunes.apple.com/WebObjects/MZStore.wo...,Version 1.9.2,0.15,0.0,0.15,0.15,2018-09


In [7]:
# Count the reviews in every (month, star rating) pair and flatten the result
# into a table with columns month_year, stars and Counts.
star_counts = df.groupby(['month_year', 'stars']).size()
af = star_counts.reset_index(name='Counts')
af

Unnamed: 0,month_year,stars,Counts
0,2011-08,1,1
1,2011-08,2,1
2,2011-08,3,1
3,2011-08,4,7
4,2011-08,5,53
5,2011-09,1,7
6,2011-09,2,6
7,2011-09,3,17
8,2011-09,4,88
9,2011-09,5,700


In [8]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.models import ColumnDataSource, HoverTool, Div, BoxZoomTool, ResetTool, WheelZoomTool, PanTool
output_notebook()

# Nested categorical factors for the x axis: one (month, stars) pair of strings
# per row of `af`, in row order.
x = [(str(month)[:10], str(stars))
     for month, stars in zip(af['month_year'], af['stars'])]

# Bar heights must be plain numbers, one per factor. The previous code wrapped
# every count in a 1-tuple, which is not a valid `top` value for vbar.
counts = af['Counts'].tolist()
source = ColumnDataSource(data=dict(x=x, counts=counts))

# NOTE(review): plot_height/plot_width are the names used by the Bokeh release
# this notebook was written against; newer releases may expect height/width -
# confirm before upgrading.
t = figure(x_range=FactorRange(*x), plot_height=350, plot_width=900, title="Stars by Month")

t.vbar(x='x', top='counts', width=0.9, source=source)

# Bars grow from zero; slant the month labels so they do not overlap.
t.y_range.start = 0
t.x_range.range_padding = 0.1
t.xaxis.major_label_orientation = 1
t.xgrid.grid_line_color = None

show(t)

In [9]:
# Box plot of the combined polarity (polsum) for each star rating.

# Star ratings to draw, and the matching string factors for the categorical
# x axis. Glyphs must be positioned with the string factors: plain integers on
# a categorical range are read as raw axis coordinates and land on the
# boundary between two labels instead of on the label itself.
cats = [1, 2, 3, 4, 5]
cat_labels = [str(cat) for cat in cats]

# find the quartiles and IQR for each category
groups = df.groupby('stars')
# Only polsum is plotted; restricting the quantiles to it avoids asking pandas
# for quantiles of text columns.
polsum_groups = groups[['polsum']]


def polsum_quantile(q):
    """Return the q-quantile of polsum per star rating.

    The result is a one-column ('polsum') frame with exactly one row per entry
    of `cats`, in that order; a rating with no reviews yields NaN.
    """
    return polsum_groups.quantile(q=q).reindex(cats)


q1 = polsum_quantile(0.25)
q2 = polsum_quantile(0.5)
q3 = polsum_quantile(0.75)
iqr = q3 - q1
upper = q3 + 1.5*iqr
lower = q1 - 1.5*iqr


# find the outliers for each category
def outliers(group):
    """Return the polsum values of one star-rating group that lie outside
    the [lower, upper] fences computed for that rating."""
    cat = group.name
    polsum = group['polsum']
    beyond_fences = (polsum > upper.loc[cat, 'polsum']) | (polsum < lower.loc[cat, 'polsum'])
    return polsum[beyond_fences]


out = polsum_groups.apply(outliers).dropna()

# prepare outlier data for plotting, we need coordinates for every outlier.
outx = []
outy = []
if not out.empty:
    # Ratings that have at least one outlier; looking up any other rating in
    # `out` would raise KeyError.
    cats_with_outliers = set(out.index.get_level_values(0))
    for cat in cats:
        if cat in cats_with_outliers:
            values = out.loc[cat].tolist()
            outx.extend([str(cat)] * len(values))
            # The y coordinates were previously never collected, leaving the
            # outlier glyph with x values but no y values.
            outy.extend(values)

output_notebook()
# NOTE(review): plot_width/plot_height are the names used by the Bokeh release
# this notebook was written against; newer releases may expect width/height -
# confirm before upgrading.
p = figure(background_fill_color="#EFE8E2", x_range=cat_labels,
           x_axis_label="Stars", y_axis_label="Polarity",
           title="Star Ratings Vs Combined Polarity",
           plot_width=600, plot_height=600)

# Shrink the whiskers so they never extend past the observed min / max.
qmin = polsum_quantile(0.00)
qmax = polsum_quantile(1.00)
upper['polsum'] = upper['polsum'].clip(upper=qmax['polsum'])
lower['polsum'] = lower['polsum'].clip(lower=qmin['polsum'])

# stems
p.segment(cat_labels, upper.polsum, cat_labels, q3.polsum, line_color="black")
p.segment(cat_labels, lower.polsum, cat_labels, q1.polsum, line_color="black")

# boxes
p.vbar(cat_labels, 0.7, q2.polsum, q3.polsum, fill_color="#E08E79", line_color="black")
p.vbar(cat_labels, 0.7, q1.polsum, q2.polsum, fill_color="#3B8686", line_color="black")

# whiskers (almost-0 height rects simpler than segments)
p.rect(cat_labels, lower.polsum, 0.2, 0.01, line_color="black")
p.rect(cat_labels, upper.polsum, 0.2, 0.01, line_color="black")

# outliers
if outx:
    p.circle(outx, outy, size=6, color="#F38630", fill_alpha=0.6)

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = "white"
p.grid.grid_line_width = 2
p.xaxis.major_label_text_font_size="12pt"
p.yaxis.major_label_text_font_size="12pt"

show(p)

