In [41]:
import pandas as pd
import json

%matplotlib inline

# Display configuration for pandas output in this notebook.
# Fully-qualified option names are used for every setting: abbreviated keys such as
# 'max_colwidth' are resolved by pattern matching and can stop working if a later
# pandas release adds another option matching the same pattern.

# Show full cell contents instead of truncating long values with an ellipsis (...)
pd.set_option('display.max_colwidth', None)

# Display up to 90 columns / 90 rows before pandas abbreviates the frame
pd.set_option('display.max_columns', 90)
pd.set_option('display.max_rows', 90)

# Make every expression statement in a cell render its result, not only the last one,
# so a single cell can show both a `df.sample(3)` table and further output after it.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:

# Root folder of the extracted Yelp dataset. Change this one constant to point the
# notebook at a different copy of the data.
DATA_DIR = './yelp_dataset'

# One input file per Yelp entity; paths are kept as plain strings.
photos_json = DATA_DIR + '/photos.json'
business_json = DATA_DIR + '/yelp_academic_dataset_business.json'
tip_json = DATA_DIR + '/yelp_academic_dataset_tip.json'
checkin_json = DATA_DIR + '/yelp_academic_dataset_checkin.json'
user_json = DATA_DIR + '/yelp_academic_dataset_user.json'
review_json = DATA_DIR + '/yelp_academic_dataset_review.json'


In [24]:

def create_dataframe(file_path):
    '''
    Load a JSON-lines file (one JSON document per line) into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Path of a UTF-8 encoded text file in which every non-blank line is a
        complete JSON document.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, in file order. An empty file (or one that
        contains only blank lines) yields an empty DataFrame.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    '''
    with open(file_path, encoding="utf8") as f:
        # Skip blank / whitespace-only lines (e.g. a stray empty line at the end of
        # the file): json.loads would raise on them and abort the whole load.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)

def _hashable(value):
    '''Return `value` unchanged when it is hashable, otherwise its `repr()` string.'''
    try:
        hash(value)
    except TypeError:
        return repr(value)
    return value


def df_info(df):
    '''
    Print a compact overview of a DataFrame, similar to `df.info()` but combining
    several checks in one report.

    The report contains the frame's shape, the number of fully duplicated rows and,
    for every column: its position, name, dtype, whether it has any null value, the
    count of non-null values and the number of distinct non-null values.

    Columns holding unhashable objects (dicts, lists, ...) make `duplicated()` and
    `nunique()` raise `TypeError`. In that case the unhashable cells are replaced by
    their `repr()` for the purpose of counting only; `df` itself is never modified.
    Two equal dicts whose keys were inserted in a different order have different
    reprs and are therefore counted as distinct by this fallback.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    None
        The report is written to standard output.
    '''
    print('\nShape : {}'.format(df.shape))

    try:
        n_duplicates = df.duplicated().sum()
    except TypeError:
        # Only object columns can contain unhashable values; other columns are reused as-is.
        hashable_df = pd.concat(
            [col.map(_hashable) if col.dtype == object else col for _, col in df.items()],
            axis=1,
        )
        n_duplicates = hashable_df.duplicated().sum()
    print('Number of duplicates : {}\n'.format(n_duplicates))

    print('{:^35} {:^12} {:^12} {:^8} {:>10}\n'.format('COLUMNS', 'DATA TYPE', 'HAS NULL', 'COUNTS', '# UNIQUE'))
    # df.items() yields each column as a Series even when column labels repeat,
    # whereas df[label] would return a DataFrame for a duplicated label.
    for position, (name, col) in enumerate(df.items(), start=1):
        try:
            n_unique = col.nunique()
        except TypeError:
            n_unique = col.map(_hashable).nunique()
        print(' {:>2}.  {:<30} {:<12} {:^10} {:>8} {:>10}'.format(
            position, str(name), str(col.dtype), str(col.isnull().any()), col.count(), n_unique))

In [9]:

# Load every dataset file into its own DataFrame via create_dataframe(), which parses
# the file line by line and keeps all parsed records in memory before building the frame.
# The assignments are kept as separate statements so that frames loaded before a
# failure (e.g. running out of memory on a later file) remain available.
photos_df = create_dataframe(photos_json)
business_df = create_dataframe(business_json)
tip_df = create_dataframe(tip_json)
checkin_df = create_dataframe(checkin_json)
user_df = create_dataframe(user_json)
review_df = create_dataframe(review_json)

In [35]:
# Spot-check three random business rows (unseeded, so the rows differ on every run).
# NOTE(review): `attributes` and `hours` appear to hold nested dicts (see the output);
# dicts are unhashable, which `duplicated()` / `nunique()` cannot handle — presumably
# why df_info() is not called for this frame. Confirm.
business_df.sample(3)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
16983,rxQalNB7uMY6Cj9aBmfA1Q,Arahova Souvlaki,6150 Taschereau Boul,Brossard,QC,J4W 1M7,45.475127,-73.467219,2.5,7,1,{'HasTV': 'True'},"Restaurants, Greek",
73052,xS00PcO4pFxhqg_F3tzeVw,Enercare,7400 Birchmount Road,Markham,ON,L3R 5V4,43.835064,-79.320459,1.0,3,1,,"Heating & Air Conditioning/HVAC, Water Heater ...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W..."
57843,Cgkcz9XbvZsMm54vKTS_cg,York Crossing Restaurant,"2301 Westinghouse Blvd, Ste 8",Charlotte,NC,28273,35.129612,-80.944702,4.0,17,1,"{'GoodForKids': 'True', 'HasTV': 'True', 'Outd...","Restaurants, American (Traditional), Seafood, ...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ..."


In [36]:
# Three random photo rows plus the per-column report.
# NOTE(review): the report shows no nulls in `caption`, yet the sampled captions are
# blank — presumably empty strings rather than missing values; verify before treating
# the column as fully populated.
photos_df.sample(3)
df_info(photos_df)


Unnamed: 0,photo_id,business_id,caption,label
43460,bS7Z4Ath7iswOyyP1gFnNg,8glTYTp-ZzR_mlUe7p7HHQ,,food
76985,_gzPXwjGNdbahjDXap_BAw,XZbuPXdyA0ZtTu3AzqtQhg,,food
11654,3pf9ekLOYX6_eCwpzWnjbA,T7MemoFgPy6iNk4456vKWw,,food



Shape : (200000, 4)
Number of duplicates : 0

              COLUMNS                DATA TYPE     HAS NULL    COUNTS    # UNIQUE

  1.  photo_id                       object         False      200000     200000
  2.  business_id                    object         False      200000      39830
  3.  caption                        object         False      200000      72475
  4.  label                          object         False      200000          5


In [37]:
# Three random tip rows plus the per-column report.
# The saved report counts 156 rows that duplicate an earlier row in every column,
# and `date` is loaded as object (string) dtype, not as a datetime.
tip_df.sample(3)
df_info(tip_df)

Unnamed: 0,user_id,business_id,text,date,compliment_count
188932,rFeuV4U0vfls21N4ROgPKw,rMrymOj6RcBBddGuOjv_IQ,What a night! This place was packed with fans ...,2012-04-02 05:45:14,0
966738,MpZcIR-Pt69g1rVP8E9oNg,h5qBxa_L-pIdNSOBQxTNGQ,The Manager is kool,2013-12-08 06:27:11,0
318950,QtuzG4-zmGTdB0jSMC54tg,DC3Uyai-vCCUgMmMvLESlw,Great pizza. Great burgers. Friendly staff.,2013-03-28 01:13:29,0



Shape : (1320761, 5)
Number of duplicates : 156

              COLUMNS                DATA TYPE     HAS NULL    COUNTS    # UNIQUE

  1.  user_id                        object         False     1320761     365869
  2.  business_id                    object         False     1320761     132700
  3.  text                           object         False     1320761    1233082
  4.  date                           object         False     1320761    1315938
  5.  compliment_count               int64          False     1320761         13


In [38]:
# Three random check-in rows plus the per-column report.
# Per the saved output there is exactly one row per business_id, and `date` is a single
# string that can contain several comma-separated timestamps for that business.
checkin_df.sample(3)
df_info(checkin_df)

Unnamed: 0,business_id,date
41755,ECfv2JWsPU1oiFiqSUGUhA,2018-07-28 17:32:04
15936,4r53hrTfVMY0cYxGWprpiQ,"2012-12-15 16:37:44, 2012-12-15 16:38:02, 2012..."
81342,SgDJUI-IefmB1kB0Po22Ow,"2018-06-23 20:34:29, 2018-07-06 16:34:21, 2019..."



Shape : (175187, 2)
Number of duplicates : 0

              COLUMNS                DATA TYPE     HAS NULL    COUNTS    # UNIQUE

  1.  business_id                    object         False      175187     175187
  2.  date                           object         False      175187     175187


In [39]:
# Three random user rows plus the per-column report.
# NOTE(review): the saved output elides the middle columns with '...' and the column
# report stops partway through — presumably it was produced before the display-options
# cell was executed (execution counts are out of order); re-run the notebook to refresh.
user_df.sample(3)
df_info(user_df)

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
799131,A75rzxtXc8nB8A-5dEFAzQ,John,41,2008-01-27 12:18:07,62,8,29,,"nnImk681KaRqUVHlSfZjGQ, Nyc1DYeFJtIgpqV76LPJwQ...",0,...,1,0,0,0,0,3,2,2,2,2
914228,7epHpI0Db7axcqD-GJwaVg,Brittany,3,2014-07-06 22:05:59,9,0,0,,,0,...,0,0,0,0,0,0,0,0,0,0
1777458,WV_bfZkHCemqhLk3SxFVOA,dj kero,11,2009-08-09 05:29:43,9,1,2,,"VjmW6SrnIsp3dSPrGukSQw, pYl4QkO7NQR5Hm30DKhaLw...",1,...,0,0,0,0,0,0,0,0,0,0



Shape : (1968703, 22)
Number of duplicates : 0

              COLUMNS                DATA TYPE     HAS NULL    COUNTS    # UNIQUE

  1.  user_id                        object         False     1968703    1968703
  2.  name                           object         False     1968703     144820
  3.  review_count                   int64          False     1968703       1877
  4.  yelping_since                  object         False     1968703    1960575
  5.  useful                         int64          False     1968703       5112
  6.  funny                          int64          False     1968703       3730
  7.  cool                           int64          False     1968703       4314
  8.  elite                          object         False     1968703        760
  9.  friends                        object         False     1968703    1116786
 10.  fans                           int64          False     1968703        659
 11.  average_stars                  float64        False 

In [42]:
# Three random review rows plus the per-column report.
# Per the saved output: 8,021,122 reviews, and the number of distinct `user_id` values
# (1,968,703) equals the number of rows reported for user_df.
review_df.sample(3)
df_info(review_df)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
3938274,0RKz6Owq3KUviT57v89rEg,6ucvniO1q1ckHLc97YcWnw,mB_cI1tCrNxxvr2e2myNrQ,5.0,1,0,0,"Heaven! That is pretty much all I have to say! Def want to come back during the night to experience the clubbing aspect of it. I am a huge fan of Taco Bell so when I planned my Vegas trip, I knew I had to make a stop here. \n\nThe decor is so cool and the atmosphere is great too. They have these screens where you can order your food and did I mention they have a bar!!!! Yes, a bar!!!! You can order those cool collector plastic bottles where you can fill it with your favorite margarita. The beer is put into a cup with a magnet underneath that you can keep as a souvenir. \n\nThe menu is pretty much the same as back home so I hope they incorporate some secret menu items for this location if they don't already. We will def be back!",2019-03-09 16:36:56
1201078,PebI9jIhOYTAkM8TV0e7JQ,Vtleb8imZabLahkdURbUUg,FRkNpDphmapskQ6s97QbGQ,5.0,1,0,0,So glad my husband and I found this company! Rose was so understanding and professional. She eased all my worries after needing a deep clean on our house after a renovation. The house was cleaned to perfection. Everything sparkled! We will definitely be using this company more frequently! Thank you for all your hard work on our house!,2017-09-08 17:20:53
2354840,jzR2wPoOuR-nZuQ1xiAPNg,nxWrhF_hyX0wwjrEkQX8uQ,LYNKKnl4jAiU1-U-97gR5Q,4.0,36,24,37,"We were looking for things to do in Las Vegas with my niece since she's only 12 and gambling, drinking and other debauchery weren't on the agenda this time around. While walking around Fremont Street, she was enamored by the ziplining that was happening above us and we thought this would be a cool activity.\n\nIn order to purchase tickets, you have to go to the ticket area located at the entrance of Fremont near Walgreens. If you walk all the way to the end, you'll be sent back here. There should be better signage, especially if you just fought a bunch of drunk people to get to the wrong area.\n\nOnce you're inside, they have a bunch of kiosks where you can choose which activity you'd like to do and designated time slots that are available. Because everything was booked up on a Friday night, we opted to do the simple version which was flying halfway through Fremont in a seated position. We chose our time, paid our $25, signed a digital waiver and off we went until it was time.\n\nWhen it was our turn, we walked up a set of stairs thinking it was gonna be a breeze. We were wrong. Once you get upstairs, you're weighed and then pointed to another line. They only let a set amount of people up the elevators at once, so it can take a while. Our time was 1140PM and we didn't actually get to the actual zipline until 1230AM. You don't see the queue from the street, so it can be a little deceptive. Just give yourself that extra breathing room, especially if you have other people waiting around for you.\n\nOne thing I really liked was that they gave you some heavy duty bags with zippers to put your belongings in. That way you're not losing them 100+ feet up in the air. I thought that was super convenient and well thought out.\n\nThe ride itself was fun and my niece had a blast. 
They take a few pictures during the experience that are available for purchase afterwards in case you wanted to commemorate the occasion. The staff were all very nice and friendly and I would recommend this activity for anyone looking for something to do for a few hours to kill time while in old Vegas.",2016-11-07 19:16:16



Shape : (8021122, 9)
Number of duplicates : 0

              COLUMNS                DATA TYPE     HAS NULL    COUNTS    # UNIQUE

  1.  review_id                      object         False     8021122    8021122
  2.  user_id                        object         False     8021122    1968703
  3.  business_id                    object         False     8021122     209393
  4.  stars                          float64        False     8021122          5
  5.  useful                         int64          False     8021122        266
  6.  funny                          int64          False     8021122        209
  7.  cool                           int64          False     8021122        200
  8.  text                           object         False     8021122    7999488
  9.  date                           object         False     8021122    7853102


In [46]:
# Look up which user wrote one specific review (a review_id taken from the sample shown
# earlier); the double brackets keep the result a one-column DataFrame.
review_df.query('review_id == "jzR2wPoOuR-nZuQ1xiAPNg"')[['user_id']]

Unnamed: 0,user_id
2354840,nxWrhF_hyX0wwjrEkQX8uQ


In [58]:
# Count the non-null values of every column per user and show the first ten users.
# groupby() returns its groups sorted by key (sort=True is the default), so no extra
# sort_values / sort_index pass over the full aggregated result is needed here.
review_df.groupby(['user_id']).agg(['count']).head(10)

Unnamed: 0_level_0,review_id,business_id,stars,useful,funny,cool,text,date
Unnamed: 0_level_1,count,count,count,count,count,count,count,count
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
---1lKK3aKOuomHnwAkAow,131,131,131,131,131,131,131,131
---3o4ZsKYoBYBe7H6xG8A,1,1,1,1,1,1,1,1
---89pEy_h9PvHwcHNbpyg,1,1,1,1,1,1,1,1
---94vtJ_5o_nikEs6hUjg,5,5,5,5,5,5,5,5
---PLwSf5gKdIoVnyRHgBA,3,3,3,3,3,3,3,3
---RfKzBwQ8t3wu-LXvx3w,1,1,1,1,1,1,1,1
---cu1hq55BP9DWVXXKHZg,3,3,3,3,3,3,3,3
---fhiwiwBYrvqhpXgcWDQ,1,1,1,1,1,1,1,1
---tGbMnMitD_7srW6Nfzg,1,1,1,1,1,1,1,1
---udAKDsn0yQXmzbWQNSw,2,2,2,2,2,2,2,2


In [57]:
# Sample three reviews written by the first user of the per-user count table
# (131 reviews in the saved output); unseeded, so the rows differ on every run.
review_df.query('user_id == "---1lKK3aKOuomHnwAkAow"').sample(3)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
4992044,L9xkbmtosit5TeJF0nmudw,---1lKK3aKOuomHnwAkAow,5rxJpTkeJa5rxMvL2NbSnQ,5.0,0,1,0,"A spa for Preggos that is owned by Doctors can you get better than that? Oh you can, because they have classes in yoga and lactation. All around a sliver of bliss.",2011-05-01 15:49:28
4624824,8rTiUXrFD9J10RI4Q2O6Qw,---1lKK3aKOuomHnwAkAow,y8d90Pt16Nip-B5UXWBP-w,4.0,1,0,1,"They serve fun beers and happy hour is nice. Good beer, lots of tables and late. \n\nThe food is really high in calories I am not sure they have any healthy options. Every thing seemed to be 1000 calories half your intake and honestly their food is not work half your daily calories.",2011-01-05 23:49:40
1627943,7NBDa1IWqZiLVv-4W0Qa9w,---1lKK3aKOuomHnwAkAow,XjP43Fso4cL9pFrYl738sg,5.0,2,1,0,"He has an amazing bedside manner and is pretty fast. He has this vibrating needle because it is hard for the mind to process vibration and pain at the same time. He does reminded me a bit of John Lithgow, and after last season of Dexter it was slightly strange. However you could not ask for a nicer staff.",2010-11-27 05:21:16


In [60]:
# Fetch the business record behind one of that user's sampled reviews
# (business_id copied from the review sample output).
business_df.query('business_id == "XjP43Fso4cL9pFrYl738sg"')

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
60510,XjP43Fso4cL9pFrYl738sg,"Sam Licata, DDS","6415 S Ft Apache, Ste 190",Las Vegas,NV,89148,36.071907,-115.298254,3.5,13,1,"{'ByAppointmentOnly': 'True', 'BusinessAcceptsCreditCards': 'True'}","Dentists, Oral Surgeons, Health & Medical, General Dentistry","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', 'Wednesday': '8:0-17:0', 'Thursday': '8:0-17:0', 'Friday': '8:0-17:0'}"
