In [14]:
# Read the review sample (yelp_academic_dataset_review_50K.json) into a list of rows.
# Each non-blank line of the file is decoded into a dictionary using json.loads().
import json
import pandas as pd

# Plain 'r' replaces 'rU': the 'U' flag is deprecated and rejected by Python 3.11+,
# and json.loads() already tolerates a trailing '\r' or '\n' on each line.
with open('yelp_academic_dataset_review_50K.json', 'r') as f:
    # Skip blank lines (e.g. an empty last line), which json.loads() would reject.
    data = [json.loads(row) for row in f if row.strip()]

In [15]:
# Display the first decoded review (a plain dictionary)
print(data[0])

{u'votes': {u'funny': 0, u'useful': 0, u'cool': 0}, u'user_id': u'PUFPaY9KxDAcGqfsorJp3Q', u'review_id': u'Ya85v4eqdd6k9Od8HbQjyA', u'text': u'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.', u'business_id': u'5UmKMjUEUNdYWqANhGckJw', u'stars': 4, u'date': u'2012-08-01', u'type': u'review'}


In [16]:
# Build a DataFrame from the list of review dictionaries (one row per review)
yelp = pd.DataFrame(data=data)
# Peek at the first row
yelp.head(n=1)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes
0,5UmKMjUEUNdYWqANhGckJw,2012-08-01,Ya85v4eqdd6k9Od8HbQjyA,4,"Mr Hoagie is an institution. Walking in, it do...",review,PUFPaY9KxDAcGqfsorJp3Q,"{u'funny': 0, u'useful': 0, u'cool': 0}"


In [17]:
# Print the (rows, columns) dimensions of the DataFrame
print(yelp.shape)

(50000, 8)


In [18]:
# Pull each vote count out of the nested 'votes' dictionary into its own column
for vote_type in ('cool', 'useful', 'funny'):
    yelp[vote_type] = [review['votes'][vote_type] for review in data]
# Dimensions after adding the three vote columns
yelp.shape

(50000, 11)

In [19]:
# Drop the votes column (its contents were copied to cool/useful/funny).
# The drop is guarded and non-in-place so the cell can be re-run safely:
# an unconditional in-place drop raises once 'votes' is already gone.
if 'votes' in yelp.columns:
    yelp = yelp.drop('votes', axis=1)
yelp.head(1)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,5UmKMjUEUNdYWqANhGckJw,2012-08-01,Ya85v4eqdd6k9Od8HbQjyA,4,"Mr Hoagie is an institution. Walking in, it do...",review,PUFPaY9KxDAcGqfsorJp3Q,0,0,0


In [20]:
# Explore the relationship between each of the vote types (cool/useful/funny) and the number of stars.
# Treat stars as a categorical variable and look for differences between groups.
# The vote columns are selected explicitly: newer pandas refuses to average the
# text columns, and numeric columns added by other cells would otherwise leak
# into this table when the cell is re-run.
yelp.groupby('stars')[['cool', 'useful', 'funny']].mean()


Unnamed: 0_level_0,cool,useful,funny
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.190649,1.018461,0.452208
2,0.299398,1.012249,0.465863
3,0.448723,0.86202,0.416506
4,0.573698,0.93441,0.390376
5,0.478018,0.822235,0.30142


In [21]:
# Review length: number of characters in each review's text
yelp['reviews_length'] = yelp['text'].map(len)


In [22]:
# Show the dimensions and first row after adding the new feature reviews_length
print(yelp.shape)
print(yelp.head(n=1))

(50000, 11)
              business_id        date               review_id  stars  \
0  5UmKMjUEUNdYWqANhGckJw  2012-08-01  Ya85v4eqdd6k9Od8HbQjyA      4   

                                                text    type  \
0  Mr Hoagie is an institution. Walking in, it do...  review   

                  user_id  cool  useful  funny  reviews_length  
0  PUFPaY9KxDAcGqfsorJp3Q     0       0      0             453  


In [28]:
# Split the reviews into one DataFrame per star rating (1 through 5),
# using a boolean mask on the 'stars' column for each rating.
data_review_1stars = yelp[yelp['stars'] == 1]
data_review_2stars = yelp[yelp['stars'] == 2]
data_review_3stars = yelp[yelp['stars'] == 3]
data_review_4stars = yelp[yelp['stars'] == 4]
data_review_5stars = yelp[yelp['stars'] == 5]
# Sanity check: show the first 1-star review
print(data_review_1stars.head(n=1))


              business_id        date               review_id  stars  \
5  UsFtqoBl7naz8AVUBZMjQQ  2014-10-29  7N9j5YbBHBW6qguE5DAeyA      1   

                                                text    type  \
5  Wing sauce is like water. Pretty much a lot of...  review   

                  user_id  cool  useful  funny  reviews_length  
5  PP_xoMSYlGr2pb67BbqBdA     0       0      0             307  
765.932194617
0.914791526747


In [24]:
# Look at rating (stars) distribution
# Distribution is obviously skewed. People tend to write positive reviews
import matplotlib.pyplot as plt

# Number of reviews (rows) per star rating. len() is used rather than
# DataFrame.size, which is rows * columns and would inflate every bar by the
# number of columns.
counts = (len(data_review_1stars), len(data_review_2stars), len(data_review_3stars),
          len(data_review_4stars), len(data_review_5stars))
N = len(counts)
# Place the bars at the star values 1..N so the x axis reads as actual ratings
# instead of 0..N-1.
x = list(range(1, N + 1))
width = 1.0/1.2
plt.xlabel('Star Rating')
plt.ylabel('Count')
plt.title('Number of Reviews with Star Ratings 1-5')
# Center each bar on its rating explicitly; the default alignment differs
# between matplotlib versions.
plt.bar(x, counts, width, color="blue", align='center')
plt.xticks(x)

plt.savefig('rating_distribution.png')
plt.show()

In [57]:
# Correlation between review length and star rating?
# For the 1-, 3- and 5-star reviews, plot a density histogram of review lengths
# and overlay a log-normal curve fitted to the logarithm of the lengths.

import numpy as np
import matplotlib.pyplot as plt  # imported here so the cell does not rely on another cell


def plot_review_length_distribution(reviews, star_rating, filename=None, fit_curve=True):
    """Plot a density histogram of review lengths, optionally with a log-normal fit.

    Parameters
    ----------
    reviews : DataFrame
        Reviews to plot; must contain a 'reviews_length' column.
    star_rating : int
        Star rating of the reviews, used only in the plot title.
    filename : str or None
        If given, the figure is saved to this path before it is shown.
    fit_curve : bool
        If True, print the fitted parameters and overlay the log-normal density.

    Returns
    -------
    (mu, sigma) : tuple of float
        Mean and standard deviation of log(review length).
    """
    lengths = np.asarray(reviews['reviews_length'].tolist())
    # log(0) is -inf and would turn the mean into -inf, so empty reviews are
    # left out of the fit (they are still counted in the histogram).
    log_lengths = np.log(lengths[lengths > 0])
    mu = np.mean(log_lengths)
    sigma = np.std(log_lengths)
    if fit_curve:
        print('maen: %s Standard Deviation %s' % (mu, sigma))

    plt.xlabel('Length')
    plt.ylabel('Probability')
    plt.title('Count of %d-Stars Review Lengths' % star_rating)
    # density=True replaces the removed `normed=True` argument (matplotlib >= 2.1).
    count, bins, ignored = plt.hist(lengths, bins=20, density=True)

    if fit_curve:
        x = np.linspace(bins[0], bins[-1], 10000)
        # The log-normal density is undefined at x <= 0; drop such points
        # instead of producing NaNs when the first bin edge is 0.
        x = x[x > 0]
        pdf = (np.exp(-(np.log(x) - mu)**2 / (2 * sigma**2))
               / (x * sigma * np.sqrt(2 * np.pi)))
        plt.plot(x, pdf, color='r', linewidth=2)

    if filename is not None:
        plt.savefig(filename)
    plt.show()
    return mu, sigma


# mu and sigma are kept as notebook-level names (holding the most recent fit)
# because the following cell prints them.
mu, sigma = plot_review_length_distribution(data_review_1stars, 1, 'review_length_1.png')

# Plain 1-star histogram without the fitted curve. It is shown but not saved:
# saving it to 'review_length_1.png' would overwrite the fitted version above.
plot_review_length_distribution(data_review_1stars, 1, fit_curve=False)

# nearly identical log-normal distributions
mu, sigma = plot_review_length_distribution(data_review_3stars, 3, 'review_length_3.png')

mu, sigma = plot_review_length_distribution(data_review_5stars, 5, 'review_length_5.png')

maen: 6.25741003983 Standard Deviation 0.914791526747
maen: 6.22045732056 Standard Deviation 0.861246946387
maen: 5.93054171097 Standard Deviation 0.882837740636


In [56]:
# Print the parameters of the most recent log-normal fit
print('maen: %s Standard Deviation %s' % (mu, sigma))



maen: 5.93054171097 Standard Deviation 0.882837740636
