In [1]:
%pwd

'/Users/SwarajPatankar/Desktop/Yelp'

In [2]:
%ls-a

[34m.[m[m/
[34m..[m[m/
.DS_Store
[34m.ipynb_checkpoints[m[m/
Dataset_Challenge_Dataset_Agreement.pdf
Untitled.ipynb
Untitled1.ipynb
Untitled2.ipynb
Weather Data.ipynb
Yelp.ipynb
Yelp_Dataset_Challenge_Round_11.pdf
Yelp_test.ipynb
business.json
business_test.json
checkin.json
photos.json
review.json
tip.json
user.json
yelp_dataset.tar


In [3]:
import ijson
import json
import pandas as pd
import numpy as np
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.sentiment.util import *




In [4]:
def get_alcohol(json_file):
    f = open(json_file)
    objs = ijson.items(f, '')
    business_id = []
    attributes = []
    for line in f:
        data = json.loads(line)
        business_id.append(data['business_id'])
        attributes.append(data['attributes'])

    alcohol = [None] * len(business_id)
    alc = pd.DataFrame({'business_id' : business_id, 'attributes' : attributes, 'alcohol' : alcohol})
    alc.set_index('business_id', inplace = True)
    pos = -1
    for a in alc['attributes']:
        pos += 1
        for k,v in a.items():
            # get restaurant data only
            if k == 'AcceptsInsurance':
                alc.at[alc.index[pos], 'alcohol'] = 'DROP'
            if k == 'Alcohol':
                alc.at[alc.index[pos], 'alcohol'] = v
    
    alc.drop('attributes', axis = 1, inplace = True)


    return alc

def get_reviews(json_file):
    f = open(json_file)
    objs = ijson.items(f, '')
    business_id = []
    review_id = []
    text = []
    date = []

    for line in f:
        data = json.loads(line)
        business_id.append(data['business_id'])
        review_id.append(data['review_id'])
        text.append(data['text'])
        date.append(data['date'])
    
    reviews = pd.DataFrame({'business_id' : business_id, 'review_id' : review_id,
                               'review_text' : text, 'date' : date})
    return reviews


def get_business(json_file):
    f = open(json_file)
    objs = ijson.items(f, '')
    business_id = []
    address = []
    city = []
    state = []
    postal_code = []
    latitude = []
    longitude = []
    for line in f:
        data = json.loads(line)
        business_id.append(data['business_id'])
        address.append(data['address'])
        city.append(data['city'])
        state.append(data['state'])
        postal_code.append(data['postal_code'])
        latitude.append(data['latitude'])
        longitude.append(data['longitude'])
        
    reviews = pd.DataFrame({'business_id' : business_id, 'address' : address, 
                                   'city' : city, 'state' : state, 'postal_code' : postal_code,
                                   'latitude' : latitude, 'longitude' : longitude})
    return reviews

In [5]:
alcohol = get_alcohol('business.json')

In [6]:
reviews = get_reviews('review.json')

In [7]:
business = get_business('business.json')

In [8]:
print(reviews['date'].min())
print(reviews['date'].max())

2004-07-22
2017-12-11


In [9]:
reviews.head()

Unnamed: 0,business_id,date,review_id,review_text
0,0W4lkclzZThpx3V65bVgig,2016-05-28,v0i_UHJMo_hPBq9bxWvW4w,"Love the staff, love the meat, love the place...."
1,AEx2SYEUJmTxVVB18LlCwA,2016-05-28,vkVSCC7xljjrAI4UGfnKEQ,Super simple place but amazing nonetheless. It...
2,VR6GpWIda3SfvPC-lg9H3w,2016-05-28,n6QzIUObkYshz4dz2QRJTw,Small unassuming place that changes their menu...
3,CKC0-MOWMqoeWf6s-szl8g,2016-05-28,MV3CcKScW05u5LVfF6ok0g,Lester's is located in a beautiful neighborhoo...
4,ACFtxLv8pGrrxMm6EgjreA,2016-05-28,IXvOzsEMYtiJI0CARmj77Q,Love coming here. Yes the place always needs t...


In [10]:
business.head()

Unnamed: 0,address,business_id,city,latitude,longitude,postal_code,state
0,"4855 E Warner Rd, Ste B9",FYWN1wneV18bWNgQjJ2GNg,Ahwatukee,33.33069,-111.978599,85044,AZ
1,3101 Washington Rd,He-G7vWjzVUysIKrfNbPUQ,McMurray,40.291685,-80.1049,15317,PA
2,"6025 N 27th Ave, Ste 1",KQPW8lFf1y5BT2MxiSZ3QA,Phoenix,33.524903,-112.11531,85017,AZ
3,"5000 Arizona Mills Cr, Ste 435",8DShNS-LuFqpEWIp0HxijA,Tempe,33.383147,-111.964725,85282,AZ
4,581 Howe Ave,PfOCPjBrlQAnz__NXj9h_w,Cuyahoga Falls,41.119535,-81.47569,44221,OH


In [11]:
final = reviews.merge(business, how = 'left')

In [12]:
# how many values are null after the merge?

null_data = final[final.isnull().any(axis=1)]
len(null_data)

25

In [13]:
sid = SentimentIntensityAnalyzer()
com = []
pos = []
neg = []
rev = []




In [14]:
for review in final['review_text']:
    rev.append(review)
    scores =list(sid.polarity_scores(review).values())
    com.append(scores[3])
    pos.append(scores[2])
    neg.append(scores[0])

    
    


In [15]:
df = pd.DataFrame({'review_text' : rev, 'Negative' : neg, 'Positive' : pos, 'Compound' : com})

In [16]:
final = final.merge(df, how = 'left')

In [17]:
final.head()

Unnamed: 0,business_id,date,review_id,review_text,address,city,latitude,longitude,postal_code,state,Compound,Negative,Positive
0,0W4lkclzZThpx3V65bVgig,2016-05-28,v0i_UHJMo_hPBq9bxWvW4w,"Love the staff, love the meat, love the place....",3895 Boulevard Saint-Laurent,Montréal,45.516373,-73.577537,H2W 1X9,QC,0.946,0.032,0.281
1,AEx2SYEUJmTxVVB18LlCwA,2016-05-28,vkVSCC7xljjrAI4UGfnKEQ,Super simple place but amazing nonetheless. It...,34 Avenue Fairmount Ouest,Montréal,45.523333,-73.594859,H2T 2M1,QC,0.9551,0.0,0.355
2,VR6GpWIda3SfvPC-lg9H3w,2016-05-28,n6QzIUObkYshz4dz2QRJTw,Small unassuming place that changes their menu...,4662 Rue Notre-Dame O,Montréal,45.472902,-73.588321,H4C 1S7,QC,0.8167,0.0,0.093
3,CKC0-MOWMqoeWf6s-szl8g,2016-05-28,MV3CcKScW05u5LVfF6ok0g,Lester's is located in a beautiful neighborhoo...,1057 Avenue Bernard,Outremont,45.522144,-73.607076,H2V 1V1,QC,0.6249,0.0,0.075
4,ACFtxLv8pGrrxMm6EgjreA,2016-05-28,IXvOzsEMYtiJI0CARmj77Q,Love coming here. Yes the place always needs t...,698 Rue Sainte-Catherine O,Montréal,45.50251,-73.570119,H3B 1B9,QC,0.856,0.036,0.123


In [18]:
final = final.sort_values(['business_id', 'date'])

In [19]:
print(len(final))


5269907


In [20]:
print(final.dtypes)

business_id     object
date            object
review_id       object
review_text     object
address         object
city            object
latitude       float64
longitude      float64
postal_code     object
state           object
Compound       float64
Negative       float64
Positive       float64
dtype: object


In [21]:
final['sent_spread'] = final['Positive'] - final['Negative']

In [22]:
final.head()

Unnamed: 0,business_id,date,review_id,review_text,address,city,latitude,longitude,postal_code,state,Compound,Negative,Positive,sent_spread
3414767,--6MefnULPED_I942VcFNA,2008-08-07,iT5SDn-i-L2NLMcdyERj0A,"Chinese name of this place, Canto pronounciati...","328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.9926,0.05,0.107,0.057
4015047,--6MefnULPED_I942VcFNA,2010-11-25,aKl1rH7HTkD0stonJP4YHg,If you want a quick fix for a scrumptious char...,"328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.966,0.015,0.181,0.166
4797875,--6MefnULPED_I942VcFNA,2010-12-06,hPsTqtjMSde_-v6ZZJiTVw,In the ultra competitive Chinese restaurant in...,"328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.9776,0.046,0.116,0.07
4145814,--6MefnULPED_I942VcFNA,2010-12-22,R8Q9mgL2cWgD5yn9_IA--g,John's Chinese BBQ Restaurant is one of those ...,"328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.9407,0.072,0.105,0.033
836323,--6MefnULPED_I942VcFNA,2011-01-08,s_DQ0OqFZ3a_8MgeU6Sh2w,i like humble-pie.\nbut the owner walks around...,"328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.9932,0.025,0.22,0.195


In [27]:
final.to_csv('final.csv')

In [29]:
final.head(3)

Unnamed: 0,business_id,date,review_id,review_text,address,city,latitude,longitude,postal_code,state,Compound,Negative,Positive,sent_spread
3414767,--6MefnULPED_I942VcFNA,2008-08-07,iT5SDn-i-L2NLMcdyERj0A,"Chinese name of this place, Canto pronounciati...","328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.9926,0.05,0.107,0.057
4015047,--6MefnULPED_I942VcFNA,2010-11-25,aKl1rH7HTkD0stonJP4YHg,If you want a quick fix for a scrumptious char...,"328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.966,0.015,0.181,0.166
4797875,--6MefnULPED_I942VcFNA,2010-12-06,hPsTqtjMSde_-v6ZZJiTVw,In the ultra competitive Chinese restaurant in...,"328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.9776,0.046,0.116,0.07


In [33]:
alcohol = alcohol.reset_index()
alcohol.head(3)

Unnamed: 0,business_id,alcohol
0,He-G7vWjzVUysIKrfNbPUQ,
1,KQPW8lFf1y5BT2MxiSZ3QA,
2,8DShNS-LuFqpEWIp0HxijA,


In [34]:
final = final.merge(alcohol)
final.head()


Unnamed: 0,business_id,date,review_id,review_text,address,city,latitude,longitude,postal_code,state,Compound,Negative,Positive,sent_spread,alcohol
0,--6MefnULPED_I942VcFNA,2008-08-07,iT5SDn-i-L2NLMcdyERj0A,"Chinese name of this place, Canto pronounciati...","328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.9926,0.05,0.107,0.057,beer_and_wine
1,--6MefnULPED_I942VcFNA,2010-11-25,aKl1rH7HTkD0stonJP4YHg,If you want a quick fix for a scrumptious char...,"328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.966,0.015,0.181,0.166,beer_and_wine
2,--6MefnULPED_I942VcFNA,2010-12-06,hPsTqtjMSde_-v6ZZJiTVw,In the ultra competitive Chinese restaurant in...,"328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.9776,0.046,0.116,0.07,beer_and_wine
3,--6MefnULPED_I942VcFNA,2010-12-22,R8Q9mgL2cWgD5yn9_IA--g,John's Chinese BBQ Restaurant is one of those ...,"328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.9407,0.072,0.105,0.033,beer_and_wine
4,--6MefnULPED_I942VcFNA,2011-01-08,s_DQ0OqFZ3a_8MgeU6Sh2w,i like humble-pie.\nbut the owner walks around...,"328 Highway 7 E, Chalmers Gate 11, Unit 10",Richmond Hill,43.840905,-79.399604,L4B 3P7,ON,0.9932,0.025,0.22,0.195,beer_and_wine


In [36]:
# gets only restaurants
print(len(final))
final = final[final['alcohol'] != 'DROP']
print(len(final))

5110391
5110391


In [None]:
# let's bring in weather data too

In [None]:
import requests, re
def get_weather():
    
    
    
    url = 'https://www.wunderground.com/history/airport/KHWD/2013/4/1/DailyHistory.html?req_city=Fremont&req_state=CA&req_statename=California&reqdb.zip=94555&reqdb.magic=1&reqdb.wmo=99999'
    
    df = pd.read_html(url, header = 0, index_col = 0)
    