In [2]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline 
import pandas as pd
import json
from tqdm import tqdm

import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kirthanpakki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kirthanpakki/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kirthanpakki/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Preprocessing Businesses Dataset

In [5]:
biz_df = pd.read_json('yelp_dataset/yelp_academic_dataset_business.json', lines = True)
biz_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [6]:
biz_df.shape

(150346, 14)

In [7]:
biz_df['is_open'].value_counts()

1    119698
0     30648
Name: is_open, dtype: int64

In [8]:
# Dropping any businesses which are closed
biz_df = biz_df[biz_df['is_open'] == 1]

In [9]:
biz_df.shape

(119698, 14)

### Filtering out Businesses which are not Food

In [20]:
# Getting all the various categories in the dataset

all_categories = biz_df['categories'].str.split(', ').explode()
all_categories = set(all_categories)

In [23]:
len(all_categories)

1303

In [24]:
print(all_categories)

{'Consumer Law', 'Department Stores', 'Casinos', 'General Litigation', 'Bird Shops', 'Gastropubs', 'Digitizing Services', 'Television Service Providers', 'Home Decor', 'Head Shops', 'Framing', 'Beverage Store', 'Bicycle Paths', 'Bespoke Clothing', 'Senior Centers', 'Marinas', 'Process Servers', 'Physical Therapy', 'Swimming Pools', 'Smokehouse', 'Valet Services', 'Service Stations', 'Business Law', 'Estheticians', 'Pediatric Dentists', 'Radiologists', 'Limos', 'Ice Cream & Frozen Yogurt', 'Optometrists', 'Elder Law', 'Delis', 'Candle Stores', 'Beer Bar', 'Shopping Centers', 'Lebanese', 'Sushi Bars', 'Ethnic Grocery', 'Shoe Stores', 'Cannabis Clinics', 'Interval Training Gyms', 'Sandblasting', 'Fuzhou', 'Irrigation', 'Vape Shops', 'Diners', 'Cannabis Collective', 'Batting Cages', 'Professional Services', 'Skating Rinks', 'Cosmetic Dentists', 'Wigs', 'Urologists', 'Home & Garden', 'Indian', 'Food', 'Salad', 'Rodeo', 'Golf Cart Rentals', 'Bikes', 'Courthouses', 'Estate Planning Law', 'Chi

In [30]:
food_categories = {
    'Beverage Store',
    'Department Stores',
    'Smokehouse',
    'Valet Services',
    'Ice Cream & Frozen Yogurt',
    'Delis',
    'Beer Bar',
    'Sushi Bars',
    'Diners',
    'Indian',
    'Food',
    'Salad',
    'Southern',
    'Pasta Shops',
    'Patisserie/Cake Shop',
    'Pakistani',
    'Scandinavian',
    'Hotels',
    'American (Traditional)',
    'Fondue',
    'Gluten-Free',
    'Candy Stores',
    'Nightlife',
    'Taiwanese',
    'Food Trucks',
    'Greek',
    'Cocktail Bars',
    'Burgers',
    'Donairs',
    'Tex-Mex',
    'Guamanian',
    'Latin American',
    'Eastern European',
    'Japanese Curry',
    'Poutineries',
    'Canadian (New)',
    'Steakhouses',
    'Noodles',
    'Coffee & Tea Supplies',
    'Themed Cafes',
    'Uzbek',
    'Breakfast & Brunch',
    'Soup',
    'Mongolian',
    'Food Tours',
    'Shanghainese',
    'Drive-In Theater',
    'Cambodian',
    'Delicatessen',
    'Wineries',
    'Spanish',
    'Speakeasies',
    'Meaderies',
    'Turkish',
    'Comfort Food',
    'Whiskey Bars',
    'Dance Clubs',
    'Vegetarian',
    'Food Delivery Services',
    'Live/Raw Food',
    'Tacos',
    'Afghan',
    'Sandwiches',
    'Coffeeshops',
    'Food Court',
    'Hungarian',
    'Persian/Iranian',
    'Japanese',
    'Szechuan',
    'American (New)',
    'Pita',
    'Brazilian',
    'Custom Cakes',
    'Buffets',
    'Cabaret',
    'Bagels',
    'Bowling',
    'Sri Lankan',
    'Bakeries',
    'Restaurants',
    'Belgian',
    'Peruvian',
    'Breweries',
    'Bars',
    'Coffee Roasteries',
    'Food Banks',
    'Pubs',
    'Shaved Ice',
    'Conveyor Belt Sushi',
    'Wraps',
    'Malaysian',
    'Syrian',
    'Olive Oil',
    'Ethiopian',
    'Calabrian',
    'Italian',
    'Cucina campana',
    'Cheese Shops',
    'Puerto Rican',
    'New Mexican Cuisine',
    'Beer Gardens',
    'Wine Tasting Room',
    'Pan Asian',
    'Cuban',
    'Colombian',
    'Venezuelan',
    'Chicken Wings',
    'Himalayan/Nepalese',
    'Donuts',
    'Karaoke',
    'Fast Food',
    'Ukrainian',
    'Senegalese',
    'Donburi',
    'Popcorn Shops',
    'Somali',
    'Burmese',
    'Tai Chi',
    'Juice Bars & Smoothies',
    'Vietnamese',
    'Mediterranean',
    'Hawaiian',
    'South African',
    'Hakka',
    'Modern European',
    'Creperies',
    'Moroccan',
    'Egyptian',
    'Coffee & Tea',
    'Izakaya',
    'Tapas/Small Plates',
    'Tapas Bars',
    'Bartenders',
    'Internet Cafes',
    'Serbo Croatian',
    'Iberian',
    'Sicilian',
    'Acai Bowls',
    'Chinese',
    'Beer Gardens',
    'Scottish',
    'Middle Eastern',
    'Caribbean',
    'Bistros',
    'Pizza',
    'Arabic',
    'Hot Dogs',
    'Fish & Chips',
    'Drive-Thru Bars',
    'African',
    'Distilleries',
    'Asian Fusion',
    'Thai',
    'Wine Bars',
    'Ramen',
    'Brasseries',
    'Russian',
    'Irish Pub',
    'Vegan',
    'Tonkatsu',
    'Hot Pot',
    'Georgian',
    'Cafeteria',
    'Cajun/Creole',
    'Desserts',
    'Bed & Breakfast',
    'Bubble Tea',
    'Wine & Spirits',
    'Bangladeshi',
    'Cupcakes',
    'Cafes',
    'Dominican',
    'Austrian',
    'Cheesesteaks',
    'Gelato',
    'Mexican',
    'Food Stands',
    'Haitian',
    'Halal',
    'Portuguese',
    'Irish',
    'Salvadoran',
    'Seafood',
    'Trinidadian',
    'Dim Sum',
    'French',
    'Tuscan',
    'Teppanyaki',
    'British',
    'Kebab',
    'Korean',
    'Hainan',
    'Specialty Food',
    'Laotian',
    'Polish',
    'Czech',
    'Pop-Up Restaurants',
    'Singaporean',
    'Tiki Bars',
    'Soul Food',
    'Armenian',
    'Dinner Theater',
    'Hookah Bars',
    'Hong Kong Style Cafe',
    'Sardinian',
    'Local Flavor',
    'Waffles',
    'Pancakes',
    'Israeli',
    'Pretzels',
    'Cantonese',
    'Falafel',
    'Empanadas',
    'Chocolatiers & Shops',
    'Champagne Bars',
    'German',
    'Indonesian',
    'Barbeque',
    'Honduran',
    'Dumplings'
}

In [36]:
biz_df.isnull().sum()

business_id         0
name                0
address             0
city                0
state               0
postal_code         0
latitude            0
longitude           0
stars               0
review_count        0
is_open             0
attributes      12348
categories         95
hours           16095
dtype: int64

In [37]:
# Dropping any rows where the categories column is empty 

biz_df.dropna(subset=['categories'], inplace = True)

In [38]:
biz_df.isnull().sum()

business_id         0
name                0
address             0
city                0
state               0
postal_code         0
latitude            0
longitude           0
stars               0
review_count        0
is_open             0
attributes      12254
categories          0
hours           16000
dtype: int64

In [39]:
food_biz = biz_df[biz_df['categories'].str.contains('|'.join(food_categories))]

  food_biz = biz_df[biz_df['categories'].str.contains('|'.join(food_categories))]


In [41]:
food_biz.shape

(53052, 14)

In [42]:
food_biz.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,1,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '..."
10,UJsufbvfyfONHeWdvAHKjA,Marshalls,21705 Village Lakes Sc Dr,Land O' Lakes,FL,34639,28.190459,-82.45738,3.5,6,1,"{'RestaurantsPriceRange2': '2', 'BikeParking':...","Department Stores, Shopping, Fashion","{'Monday': '9:30-21:30', 'Tuesday': '9:30-21:3..."


### Expanding the Attributes Column

In [46]:
# Extracting the various Attributes in the Dataset 

attr_df = food_biz['attributes'].apply(pd.Series)

In [50]:
attr_df.head()

Unnamed: 0,RestaurantsDelivery,OutdoorSeating,BusinessAcceptsCreditCards,BusinessParking,BikeParking,RestaurantsPriceRange2,RestaurantsTakeOut,ByAppointmentOnly,WiFi,Alcohol,...,BestNights,BYOB,Corkage,BYOBCorkage,AcceptsInsurance,RestaurantsCounterService,Open24Hours,AgesAllowed,DietaryRestrictions,HairSpecializesIn
3,False,False,False,"{'garage': False, 'street': True, 'validated':...",True,1.0,True,False,u'free',u'none',...,,,,,,,,,,
4,,,True,"{'garage': None, 'street': None, 'validated': ...",True,,True,,,,...,,,,,,,,,,
5,True,True,True,,False,1.0,True,False,u'no',u'none',...,,,,,,,,,,
9,True,True,True,"{'garage': False, 'street': False, 'validated'...",,1.0,True,False,u'no',u'none',...,,,,,,,,,,
10,,,True,"{'garage': False, 'street': False, 'validated'...",True,2.0,,,,,...,,,,,,,,,,


In [51]:
attr_df.columns

Index(['RestaurantsDelivery', 'OutdoorSeating', 'BusinessAcceptsCreditCards',
       'BusinessParking', 'BikeParking', 'RestaurantsPriceRange2',
       'RestaurantsTakeOut', 'ByAppointmentOnly', 'WiFi', 'Alcohol', 'Caters',
       'WheelchairAccessible', 'GoodForKids', 'RestaurantsAttire',
       'RestaurantsReservations', 'Ambience', 'CoatCheck', 'DogsAllowed',
       'RestaurantsTableService', 'RestaurantsGoodForGroups', 'HasTV',
       'HappyHour', 'DriveThru', 'GoodForMeal', 'NoiseLevel',
       'BusinessAcceptsBitcoin', 'Smoking', 'Music', 'GoodForDancing',
       'BestNights', 'BYOB', 'Corkage', 'BYOBCorkage', 'AcceptsInsurance',
       'RestaurantsCounterService', 'Open24Hours', 'AgesAllowed',
       'DietaryRestrictions', 'HairSpecializesIn'],
      dtype='object')

In [54]:
attr_df.drop('HairSpecializesIn', inplace = True, axis = 1)

In [56]:
food_biz.drop('attributes', inplace = True, axis = 1)
food_biz = pd.concat([food_biz, attr_df], axis = 1)

In [57]:
food_biz.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,GoodForDancing,BestNights,BYOB,Corkage,BYOBCorkage,AcceptsInsurance,RestaurantsCounterService,Open24Hours,AgesAllowed,DietaryRestrictions
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,,,,,,,,,,
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,...,,,,,,,,,,
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,...,,,,,,,,,,
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,...,,,,,,,,,,
10,UJsufbvfyfONHeWdvAHKjA,Marshalls,21705 Village Lakes Sc Dr,Land O' Lakes,FL,34639,28.190459,-82.45738,3.5,6,...,,,,,,,,,,


In [58]:
# Number of businesses in each state
food_biz['state'].value_counts()

PA     11567
FL      9142
TN      4561
IN      4219
LA      4154
MO      3958
NJ      3420
AZ      2851
AB      2412
NV      1971
ID      1480
CA      1329
IL      1014
DE       972
HI         1
XMS        1
Name: state, dtype: int64

In [65]:
# Checking the number of reviews and the average rating according to the state

group_by_state = food_biz.groupby('state').agg({'stars':'mean', 'review_count':'sum'}).reset_index()
group_by_state.sort_values('review_count', ascending = False).reset_index()

Unnamed: 0,index,state,stars,review_count
0,13,PA,3.540849,933691
1,4,FL,3.572522,731186
2,9,LA,3.6578,557668
3,14,TN,3.472484,408565
4,10,MO,3.480798,313708
5,8,IN,3.509836,301801
6,1,AZ,3.484216,241934
7,12,NV,3.576611,235567
8,2,CA,3.881114,209237
9,11,NJ,3.465351,156617


In [68]:
# Checking the total reviews and the average rating according to the city in top 3 states

group_by_PA = food_biz[food_biz['state']=='PA'].groupby('city').agg({'stars':'mean', 'review_count':'sum'}).reset_index()
group_by_PA.sort_values('review_count', ascending = False).reset_index().head(30)

Unnamed: 0,index,city,stars,review_count
0,194,Philadelphia,3.569234,574595
1,275,West Chester,3.577068,14656
2,127,King of Prussia,3.428058,14628
3,157,Media,3.692913,9730
4,49,Conshohocken,3.538462,9065
5,273,Wayne,3.705,8858
6,174,New Hope,3.699029,8612
7,56,Doylestown,3.684211,8519
8,10,Bensalem,3.255747,8322
9,176,Newtown,3.695238,7204


In [69]:
group_by_FL = food_biz[food_biz['state']=='FL'].groupby('city').agg({'stars':'mean', 'review_count':'sum'}).reset_index()
group_by_FL.sort_values('review_count', ascending = False).reset_index().head(30)

Unnamed: 0,index,city,stars,review_count
0,114,Tampa,3.56671,280833
1,16,Clearwater,3.569945,53257
2,86,Saint Petersburg,3.638132,47089
3,105,St. Petersburg,3.840967,32978
4,11,Brandon,3.448052,26881
5,17,Clearwater Beach,3.673684,18173
6,21,Dunedin,3.915,17443
7,104,St. Pete Beach,3.724719,15344
8,101,St Petersburg,3.599174,15199
9,42,Largo,3.5,14589


In [70]:
group_by_LA = food_biz[food_biz['state']=='LA'].groupby('city').agg({'stars':'mean', 'review_count':'sum'}).reset_index()
group_by_LA.sort_values('review_count', ascending = False).reset_index().head(30)

Unnamed: 0,index,city,stars,review_count
0,26,New Orleans,3.82487,479413
1,20,Metairie,3.382075,38361
2,16,Kenner,3.140927,13190
3,12,Gretna,3.509615,6741
4,14,Harvey,3.208333,4898
5,18,Marrero,3.191358,2866
6,13,Harahan,3.45614,2855
7,8,Chalmette,3.321429,1660
8,15,Jefferson,3.451613,1511
9,36,Westwego,3.568966,1385


In [79]:
food_biz.to_csv('processed_datasets/restaurants.csv', index = False)
food_biz.to_json('processed_datasets/restaurants.json', orient='records', lines = True)

# Preprocessing Reviews Dataset

In [86]:
reviews_path = 'yelp_dataset/yelp_academic_dataset_review.json'
chunksize = 500000
chunks = []

for chunk in pd.read_json(reviews_path, lines = True, chunksize = chunksize):
    rel_data = chunk[['review_id', 'user_id', 'business_id', 'stars', 'text']]
    chunks.append(rel_data)

reviews_df = pd.concat(chunks, axis = 0, ignore_index = True)

In [87]:
reviews_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,text
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,"If you decide to eat here, just be aware it is..."
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,I've taken a lot of spin classes over the year...
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,Family diner. Had the buffet. Eclectic assortm...
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,"Wow! Yummy, different, delicious. Our favo..."
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,Cute interior and owner (?) gave us tour of up...


In [88]:
reviews_df.shape

(6990280, 5)

### Filtering out Reviews which are not in our Restaurant list 

In [89]:
food_reviews = pd.merge(reviews_df, food_biz[['business_id']], on = 'business_id', how = 'inner')
food_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,text
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,"If you decide to eat here, just be aware it is..."
1,VJxlBnJmCDIy8DFG0kjSow,Iaee7y6zdSB3B-kRCo4z1w,XQfwVwDr-v0ZS3_CbbE5Xw,2,This is the second time we tried turning point...
2,S6pQZQocMB1WHMjTRbt77A,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4,The place is cute and the staff was very frien...
3,WqgTKVqWVHDHjnjEsBvUgg,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3,We came on a Saturday morning after waiting a ...
4,M0wzFFb7pefOPcxeRVbLag,dCooFVCk8M1nVaQqcfTL3Q,XQfwVwDr-v0ZS3_CbbE5Xw,2,"Mediocre at best. The decor is very nice, and ..."


In [91]:
food_reviews.shape

(4480346, 5)

### Preprocessing the Text Column 

In [96]:
def preprocess_review_batch(review_batch):
    # Converting to lowercase
    review_batch = review_batch.str.lower()
    # Removing punctuations and specials characters
    review_batch = review_batch.str.replace(r'[^a-zA-Z\s]', '', regex = True)
    # Tokenizing the Text 
    tokens = review_batch.apply(word_tokenize)
    # Removing the stop words 
    tokens = tokens.apply(lambda x: [word for word in x if word not in set(stopwords.words('english'))])
    # Lemmatizing the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = tokens.apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    processed_review = tokens.apply(lambda x: ' '.join(x))
    return processed_review

batch_size = 100000

processed_chunks = []

# Process the text column in batches
for i in range(0, len(food_reviews), batch_size):
    review_batch = food_reviews.iloc[i:i+batch_size]['text']
    processed_chunk = preprocess_review_batch(review_batch)
    processed_chunks.append(processed_chunk)

processed_text_series = pd.concat(processed_chunks, ignore_index=True)

# Add processed text back to reviews_df
food_reviews['processed_text'] = processed_text_series

In [97]:
food_reviews

Unnamed: 0,review_id,user_id,business_id,stars,text,processed_text
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,"If you decide to eat here, just be aware it is...",decide eat aware going take hour beginning end...
1,VJxlBnJmCDIy8DFG0kjSow,Iaee7y6zdSB3B-kRCo4z1w,XQfwVwDr-v0ZS3_CbbE5Xw,2,This is the second time we tried turning point...,second time tried turning point location first...
2,S6pQZQocMB1WHMjTRbt77A,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4,The place is cute and the staff was very frien...,place cute staff friendly nice menu good brunc...
3,WqgTKVqWVHDHjnjEsBvUgg,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3,We came on a Saturday morning after waiting a ...,came saturday morning waiting month opening ho...
4,M0wzFFb7pefOPcxeRVbLag,dCooFVCk8M1nVaQqcfTL3Q,XQfwVwDr-v0ZS3_CbbE5Xw,2,"Mediocre at best. The decor is very nice, and ...",mediocre best decor nice like restaurant tryin...
...,...,...,...,...,...,...
4480341,qMMSJ2se0-G-V9K7y96EbQ,a4wr0eMRHjFqI66JoGpXog,Tj9FmBCHd84kjAE9vcoBnw,1,I've always gone to this location for two reas...,ive always gone location two reason dont water...
4480342,WooR8MihE_bJ3qnYbDjizw,-MZSr4VErbuvzm8c5z2gbQ,Tj9FmBCHd84kjAE9vcoBnw,2,This particular location is so painfully slow....,particular location painfully slowalways chick...
4480343,QLVR6EcBV2lD2JpJrP9N_Q,k5RiHhMsd7zT6gO-sa8CiQ,Tj9FmBCHd84kjAE9vcoBnw,1,The owner is the biggest a-hole ever! He gave ...,owner biggest ahole ever gave much attitude in...
4480344,cf3ft8457oGoiu9rW49cDQ,cS5J5Lw1xECApPjmk6dyJg,Tj9FmBCHd84kjAE9vcoBnw,1,Ask for Breasts and I'm always told there are ...,ask breast im always told none chicken chicken...


In [98]:
food_reviews.drop('text', axis = 1, inplace = True)

In [99]:
food_reviews.shape

(4480346, 5)

In [100]:
food_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,processed_text
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,decide eat aware going take hour beginning end...
1,VJxlBnJmCDIy8DFG0kjSow,Iaee7y6zdSB3B-kRCo4z1w,XQfwVwDr-v0ZS3_CbbE5Xw,2,second time tried turning point location first...
2,S6pQZQocMB1WHMjTRbt77A,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4,place cute staff friendly nice menu good brunc...
3,WqgTKVqWVHDHjnjEsBvUgg,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3,came saturday morning waiting month opening ho...
4,M0wzFFb7pefOPcxeRVbLag,dCooFVCk8M1nVaQqcfTL3Q,XQfwVwDr-v0ZS3_CbbE5Xw,2,mediocre best decor nice like restaurant tryin...


In [101]:
food_reviews.to_csv('processed_datasets/restaurant_reviews.csv', index = False)

In [139]:
food_biz.shape

(53052, 51)

In [103]:
food_reviews.shape

(4480346, 5)

# Preprocessing Users Dataset

In [125]:
users_path = 'yelp_dataset/yelp_academic_dataset_user.json'
chunksize = 500000
chunks = []

with tqdm() as pbar:
    for chunk in pd.read_json(users_path, lines = True, chunksize = chunksize):
        rel_data = chunk[['user_id', 'name', 'review_count', 'friends', 'fans', 'average_stars']]
        chunks.append(rel_data)
    
        pbar.update(1)

users_df = pd.concat(chunks, axis = 0, ignore_index = True)

python(46732) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
4it [03:06, 46.75s/it]


In [127]:
users_df.head()

Unnamed: 0,user_id,name,review_count,friends,fans,average_stars
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,3.91
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,"ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,3.74
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,3.32
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,4.27
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,3.54


In [128]:
users_df.shape

(1987897, 6)

### Filtering out users who have reviews for Restaurants only

In [130]:
merged_df = pd.merge(users_df, food_reviews[['user_id']], on='user_id', how='inner')
restaurant_users = merged_df.drop_duplicates(subset='user_id', keep='first')
restaurant_users.reset_index(drop=True, inplace=True)

restaurant_users.head()

Unnamed: 0,user_id,name,review_count,friends,fans,average_stars
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,3.91
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,"ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,3.74
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,3.32
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,4.27
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,3.54


In [138]:
restaurant_users.shape

(1450144, 6)

In [140]:
restaurant_users.to_csv('processed_datasets/restaurant_users.csv', index = False)

# Philadelphia

In [105]:
biz_philli = food_biz[food_biz['city'] == 'Philadelphia']
biz_philli.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,GoodForDancing,BestNights,BYOB,Corkage,BYOBCorkage,AcceptsInsurance,RestaurantsCounterService,Open24Hours,AgesAllowed,DietaryRestrictions
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,,,,,,,,,,
15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.953949,-75.143226,4.0,245,...,,,,,,,,,,
19,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,PA,19147,39.943223,-75.162568,4.5,205,...,,,,,,,,,,
35,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,Philadelphia,PA,19123,39.962582,-75.135657,3.5,65,...,,,,,,,,,,
82,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,PA,19104,39.954573,-75.194894,3.0,56,...,,,,,,,,,,


In [106]:
biz_philli.shape

(5078, 51)

In [107]:
reviews_philli = pd.merge(food_reviews, biz_philli[['business_id']], on = 'business_id', how = 'inner')
reviews_philli.head()

Unnamed: 0,review_id,user_id,business_id,stars,processed_text
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,wow yummy different delicious favorite lamb cu...
1,HME_ksGph3se7Aze5hxa-Q,kSMOJwJXuEUqzfmuFncK4A,kxX2SOes4o-D3ZQBkiMRfA,2,dinein get star disappointing service venue fa...
2,EJWyA5wpdVMji1j4TwSZqQ,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5,long hiatus reviewing awaken mouth yelp hibern...
3,T_kAb2NeylB-JdNDKphryw,Z-xgVb4nM42943m2wbBkFw,kxX2SOes4o-D3ZQBkiMRfA,5,weve eaten time seems time get even better shr...
4,NENaCqb6TNj5CyY1LOdI6Q,2SEoXb6r6hPKrl9V9VzBgA,kxX2SOes4o-D3ZQBkiMRfA,5,came philly family event stayed little dinner ...


In [109]:
reviews_philli.shape

(592565, 5)

In [16]:
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
def get_sentiment_scores(text):
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/kirthanpakki/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [113]:
reviews_philli['sentiment_scores'] = reviews_philli['processed_text'].apply(get_sentiment_scores)

In [119]:
reviews_philli.head()

Unnamed: 0,review_id,user_id,business_id,stars,processed_text,sentiment_scores
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,wow yummy different delicious favorite lamb cu...,0.9652
1,HME_ksGph3se7Aze5hxa-Q,kSMOJwJXuEUqzfmuFncK4A,kxX2SOes4o-D3ZQBkiMRfA,2,dinein get star disappointing service venue fa...,0.0516
2,EJWyA5wpdVMji1j4TwSZqQ,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5,long hiatus reviewing awaken mouth yelp hibern...,0.9932
3,T_kAb2NeylB-JdNDKphryw,Z-xgVb4nM42943m2wbBkFw,kxX2SOes4o-D3ZQBkiMRfA,5,weve eaten time seems time get even better shr...,0.9263
4,NENaCqb6TNj5CyY1LOdI6Q,2SEoXb6r6hPKrl9V9VzBgA,kxX2SOes4o-D3ZQBkiMRfA,5,came philly family event stayed little dinner ...,0.7533


In [121]:
reviews_philli['final_rating'] = reviews_philli['stars'] + reviews_philli['sentiment_scores'] 

In [122]:
reviews_philli.head()

Unnamed: 0,review_id,user_id,business_id,stars,processed_text,sentiment_scores,final_rating
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,wow yummy different delicious favorite lamb cu...,0.9652,5.9652
1,HME_ksGph3se7Aze5hxa-Q,kSMOJwJXuEUqzfmuFncK4A,kxX2SOes4o-D3ZQBkiMRfA,2,dinein get star disappointing service venue fa...,0.0516,2.0516
2,EJWyA5wpdVMji1j4TwSZqQ,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5,long hiatus reviewing awaken mouth yelp hibern...,0.9932,5.9932
3,T_kAb2NeylB-JdNDKphryw,Z-xgVb4nM42943m2wbBkFw,kxX2SOes4o-D3ZQBkiMRfA,5,weve eaten time seems time get even better shr...,0.9263,5.9263
4,NENaCqb6TNj5CyY1LOdI6Q,2SEoXb6r6hPKrl9V9VzBgA,kxX2SOes4o-D3ZQBkiMRfA,5,came philly family event stayed little dinner ...,0.7533,5.7533


In [123]:
reviews_philli.to_csv('processed_datasets/philadelphia/reviews_philli.csv', index = False)
biz_philli.to_csv('processed_datasets/philadelphia/restuarants_philli.csv', index = False)

In [141]:
users_philli = pd.merge(restaurant_users, reviews_philli[['user_id']], on='user_id', how='inner')
users_philli = users_philli.drop_duplicates(subset='user_id', keep='first')
users_philli.reset_index(drop=True, inplace=True)

In [142]:
users_philli.shape

(199676, 6)

In [143]:
users_philli.to_csv('processed_datasets/philadelphia/users_philli.csv', index = False)

### Creating the Item-User Matrix 

In [144]:
reviews_philli.head()

Unnamed: 0,review_id,user_id,business_id,stars,processed_text,sentiment_scores,final_rating
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,wow yummy different delicious favorite lamb cu...,0.9652,5.9652
1,HME_ksGph3se7Aze5hxa-Q,kSMOJwJXuEUqzfmuFncK4A,kxX2SOes4o-D3ZQBkiMRfA,2,dinein get star disappointing service venue fa...,0.0516,2.0516
2,EJWyA5wpdVMji1j4TwSZqQ,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5,long hiatus reviewing awaken mouth yelp hibern...,0.9932,5.9932
3,T_kAb2NeylB-JdNDKphryw,Z-xgVb4nM42943m2wbBkFw,kxX2SOes4o-D3ZQBkiMRfA,5,weve eaten time seems time get even better shr...,0.9263,5.9263
4,NENaCqb6TNj5CyY1LOdI6Q,2SEoXb6r6hPKrl9V9VzBgA,kxX2SOes4o-D3ZQBkiMRfA,5,came philly family event stayed little dinner ...,0.7533,5.7533


In [148]:
reviews_philli.drop(columns = ['stars', 'processed_text','sentiment_scores'], axis = 1, inplace=True)

In [149]:
reviews_philli.head()

Unnamed: 0,review_id,user_id,business_id,final_rating
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.9652
1,HME_ksGph3se7Aze5hxa-Q,kSMOJwJXuEUqzfmuFncK4A,kxX2SOes4o-D3ZQBkiMRfA,2.0516
2,EJWyA5wpdVMji1j4TwSZqQ,mqBWACmaHflW4eh_Ofp16Q,kxX2SOes4o-D3ZQBkiMRfA,5.9932
3,T_kAb2NeylB-JdNDKphryw,Z-xgVb4nM42943m2wbBkFw,kxX2SOes4o-D3ZQBkiMRfA,5.9263
4,NENaCqb6TNj5CyY1LOdI6Q,2SEoXb6r6hPKrl9V9VzBgA,kxX2SOes4o-D3ZQBkiMRfA,5.7533


In [145]:
biz_philli.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,GoodForDancing,BestNights,BYOB,Corkage,BYOBCorkage,AcceptsInsurance,RestaurantsCounterService,Open24Hours,AgesAllowed,DietaryRestrictions
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,,,,,,,,,,
15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.953949,-75.143226,4.0,245,...,,,,,,,,,,
19,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,PA,19147,39.943223,-75.162568,4.5,205,...,,,,,,,,,,
35,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,Philadelphia,PA,19123,39.962582,-75.135657,3.5,65,...,,,,,,,,,,
82,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,PA,19104,39.954573,-75.194894,3.0,56,...,,,,,,,,,,


In [155]:
res_philli = biz_philli[['business_id', 'name', 'stars']]

In [156]:
res_philli.head()

Unnamed: 0,business_id,name,stars
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,4.0
15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,4.0
19,ROeacJQwBeh05Rqg7F6TCg,BAP,4.5
35,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,3.5
82,ppFCk9aQkM338Rgwpl2F5A,Wawa,3.0


In [146]:
users_philli.head()

Unnamed: 0,user_id,name,review_count,friends,fans,average_stars
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,3.91
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,"ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,3.74
2,q_QQ5kBBwlCcbL1s4NVK3g,Jane,1221,"xBDpTUbai0DXrvxCe3X16Q, 7GPNBO496aecrjJfW6UWtg...",1357,3.85
3,AUi8MPWJ0mLkMfwbui27lg,John,109,"gy5fWeSv3Gamuq9Ox4MV4g, lMr3LWU6kPFLTmCpDkACxg...",4,3.4
4,1McG5Rn_UDkmlkZOrsdptg,Teresa,7,"piejMEdRkGB7-1aL4lL5NQ, X0zFOU6iG95-feQKOXkgrA...",1,4.29


In [150]:
users_philli.drop(columns = ['review_count', 'friends', 'fans'], axis = 1, inplace = True)

In [151]:
users_philli.head()

Unnamed: 0,user_id,name,average_stars
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,3.91
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,3.74
2,q_QQ5kBBwlCcbL1s4NVK3g,Jane,3.85
3,AUi8MPWJ0mLkMfwbui27lg,John,3.4
4,1McG5Rn_UDkmlkZOrsdptg,Teresa,4.29


In [158]:
# Merge reviews_philli with users_philli to get user information
merged_df = pd.merge(reviews_philli, users_philli, on='user_id', how='left')

# Merge merged_df with res_philli to get restaurant information
merged_df = pd.merge(merged_df, res_philli, on='business_id', how='left')

# Select relevant columns for the User-Item Ratings matrix
ratings_df = merged_df[['user_id', 'business_id', 'final_rating']]

# Aggregate duplicate entries by taking the mean of the ratings for each combination of user_id and business_id.
ratings_df = ratings_df.groupby(['user_id', 'business_id']).mean().reset_index()

# Pivot the DataFrame to create the User-Item Ratings matrix
ratings_matrix = ratings_df.pivot(index='user_id', columns='business_id', values='final_rating')

# Display the resulting ratings matrix
print(ratings_matrix)

business_id             --OS_I7dnABrXvRCCuWOGQ  --sXnWH9Xm6_NvIjyuA99w  \
user_id                                                                  
--2tyArRmSoyKx5r-FVG0A                     NaN                     NaN   
--2vR0DIsmQ6WfcSzKWigw                     NaN                     NaN   
--4AjktZiHowEIBCMd4CZA                     NaN                     NaN   
--4_p6Z3tKadJcr9Non_Vw                     NaN                     NaN   
--6GckBYtTa4hj8pT09oAg                     NaN                     NaN   
...                                        ...                     ...   
zzvCl_egPyWpxO7EvWc2IA                     NaN                     NaN   
zzvLsOhm7gBMQDygMBDrHg                     NaN                     NaN   
zzwYLnmIvj8C7wJmRjtkRA                     NaN                     NaN   
zzx7J3zheFF3zf5YYfDAMg                     NaN                     NaN   
zzzMBVS73g3ZJ7qL8JyhiA                     NaN                     NaN   

business_id             -0TffRSXXIlBY

In [160]:
ratings_matrix.head()

business_id,--OS_I7dnABrXvRCCuWOGQ,--sXnWH9Xm6_NvIjyuA99w,-0TffRSXXIlBYVbb5AwfTg,-1B9pP_CrRBJYPICE5WbRA,-1DxQ1CxSc_JCxzKm9bglg,-3ArWZfDjfab8qVHf3WVtg,-3e3CP3FFc-rvJj_-_airw,-3m_nXlyvdKAVNNmVirpGQ,-5Rah4ZvWsDu4oilUZxhtw,-63ytt5vkWof-M9NDGTkng,...,zuKnCtZQKZqnvEaKVnwVVQ,zucC7rHpXPYBu7aEqj0NUw,zujdPV3HT-Y-CKE1GgkMHQ,zvvl3c1FO3O3BZdhusficA,zwTmOj4B_OVPMTMYijQiKg,zwd4dyQ5ovnjVojWfAuhMw,zxRmQ_FWVowh8rlzLCSURQ,zxY4DgtXsVHihSUpsmwamg,zy2p8yfx_fgXMCCUo8nWsA,zz3E7kmJI2r2JseE6LAnrw
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--2tyArRmSoyKx5r-FVG0A,,,,,,,,,,,...,,,,,,,,,,
--2vR0DIsmQ6WfcSzKWigw,,,,,,,3.1481,,,,...,,,,,,,,,,
--4AjktZiHowEIBCMd4CZA,,,,,,,,,,,...,,,,,,,,,,
--4_p6Z3tKadJcr9Non_Vw,,,,,,,,,,,...,,,,,,,,,,
--6GckBYtTa4hj8pT09oAg,,,,,,,,,,,...,,,,,,,,,,


In [161]:
user_item_ratings = ratings_matrix.values

In [164]:
ratings_matrix.to_csv('processed_datasets/philadelphia/ratings_matrix_philli.csv')

# Tampa 

In [3]:
food_biz = pd.read_csv('processed_datasets/restaurants.csv')

  food_biz = pd.read_csv('processed_datasets/restaurants.csv')


In [6]:
reviews_path = 'processed_datasets/restaurant_reviews.csv'
chunksize = 500000
chunks = []

for chunk in pd.read_csv(reviews_path, chunksize = chunksize):
    chunks.append(chunk)

food_reviews = pd.concat(chunks, axis = 0, ignore_index = True)

In [7]:
food_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,processed_text
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,decide eat aware going take hour beginning end...
1,VJxlBnJmCDIy8DFG0kjSow,Iaee7y6zdSB3B-kRCo4z1w,XQfwVwDr-v0ZS3_CbbE5Xw,2,second time tried turning point location first...
2,S6pQZQocMB1WHMjTRbt77A,ejFxLGqQcWNLdNByJlIhnQ,XQfwVwDr-v0ZS3_CbbE5Xw,4,place cute staff friendly nice menu good brunc...
3,WqgTKVqWVHDHjnjEsBvUgg,f7xa0p_1V9lx53iIGN5Sug,XQfwVwDr-v0ZS3_CbbE5Xw,3,came saturday morning waiting month opening ho...
4,M0wzFFb7pefOPcxeRVbLag,dCooFVCk8M1nVaQqcfTL3Q,XQfwVwDr-v0ZS3_CbbE5Xw,2,mediocre best decor nice like restaurant tryin...


In [8]:
food_reviews.shape

(4480346, 5)

In [9]:
users_path = 'processed_datasets/restaurant_users.csv'
chunksize = 500000
chunks = []

for chunk in pd.read_csv(users_path, chunksize = chunksize):
    chunks.append(chunk)

res_users = pd.concat(chunks, axis = 0, ignore_index = True)

In [11]:
res_users.head()

Unnamed: 0,user_id,name,review_count,friends,fans,average_stars
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,3.91
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,"ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,3.74
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,3.32
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,4.27
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,3.54


In [10]:
res_users.shape

(1450144, 6)

In [12]:
biz_tampa = food_biz[food_biz['city'] == 'Tampa']
biz_tampa.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,GoodForDancing,BestNights,BYOB,Corkage,BYOBCorkage,AcceptsInsurance,RestaurantsCounterService,Open24Hours,AgesAllowed,DietaryRestrictions
21,0qNpTGTcqPwOLi2hADx4Xw,Charlie's Market,2815 E Sligh Ave,Tampa,FL,33610,28.01036,-82.430042,3.0,9,...,,,,,,,,,,
38,RK6-cJ9hj53RzOlCBmpT-g,Impasto,,Tampa,FL,33611,27.890814,-82.502346,5.0,5,...,,,,,,,,,,
59,zFvqulgAYOpSG2t1v8AZ-w,The Cake Drip,1625 W Snow Cir,Tampa,FL,33606,27.935753,-82.47647,4.0,40,...,,,,,,,,,,
74,QjV4v7q_pt7tt3K1US7IHg,PDQ Temple Terrace,5112 E Fowler Ave,Tampa,FL,33617,28.054888,-82.399548,3.0,5,...,,,,,,,,,,
93,Ucl9Vo5lwrUmYbV8Dv8X5g,O'Briens Irish Pub,15435 N Dale Mabry Hwy,Tampa,FL,33618,28.09236,-82.500588,4.0,108,...,True,"{'monday': False, 'tuesday': False, 'friday': ...",,,,,,,,


In [13]:
biz_tampa.shape

(3043, 51)

In [14]:
reviews_tampa = pd.merge(food_reviews, biz_tampa[['business_id']], on = 'business_id', how = 'inner')
reviews_tampa.head()

Unnamed: 0,review_id,user_id,business_id,stars,processed_text
0,OAhBYw8IQ6wlfw1owXWRWw,1C2lxzUo1Hyye4RFIXly3g,BVndHaLihEYbr76Z0CMEGw,5,great place breakfast waffle fluffy perfect ho...
1,R3TNDNoRUiVfRgvvczy0mg,-Dt5o6GpQcXQfVeWpHNtDg,BVndHaLihEYbr76Z0CMEGw,5,came based recommendation received street does...
2,OZpHUjMx5vyK0Hn2Uim_AQ,kiTsCsc_vtGXnzVz738w2g,BVndHaLihEYbr76Z0CMEGw,5,found place searching yelp definitely good pla...
3,dmjtUSlyc-3EA1Tv26AWJw,NKFBcrL56W7eHxPXxyPTxA,BVndHaLihEYbr76Z0CMEGw,2,supper pm disappointed ordered waitress nice c...
4,kcV2upXjWLWuJPAt9QICbw,ZuF1R91KH924zJwPTmFi4g,BVndHaLihEYbr76Z0CMEGw,4,good little mom pop breakfast place food reall...


In [15]:
reviews_tampa.shape

(290145, 5)

In [23]:
reviews_tampa['processed_text'] = reviews_tampa['processed_text'].astype(str)
reviews_tampa['sentiment_scores'] = reviews_tampa['processed_text'].apply(get_sentiment_scores)
reviews_tampa['final_rating'] = reviews_tampa['stars'] + reviews_tampa['sentiment_scores'] 

In [27]:
reviews_tampa.head()

Unnamed: 0,review_id,user_id,business_id,stars,processed_text,sentiment_scores,final_rating
0,OAhBYw8IQ6wlfw1owXWRWw,1C2lxzUo1Hyye4RFIXly3g,BVndHaLihEYbr76Z0CMEGw,5,great place breakfast waffle fluffy perfect ho...,0.9477,5.9477
1,R3TNDNoRUiVfRgvvczy0mg,-Dt5o6GpQcXQfVeWpHNtDg,BVndHaLihEYbr76Z0CMEGw,5,came based recommendation received street does...,0.931,5.931
2,OZpHUjMx5vyK0Hn2Uim_AQ,kiTsCsc_vtGXnzVz738w2g,BVndHaLihEYbr76Z0CMEGw,5,found place searching yelp definitely good pla...,0.9823,5.9823
3,dmjtUSlyc-3EA1Tv26AWJw,NKFBcrL56W7eHxPXxyPTxA,BVndHaLihEYbr76Z0CMEGw,2,supper pm disappointed ordered waitress nice c...,0.9175,2.9175
4,kcV2upXjWLWuJPAt9QICbw,ZuF1R91KH924zJwPTmFi4g,BVndHaLihEYbr76Z0CMEGw,4,good little mom pop breakfast place food reall...,0.9432,4.9432


In [28]:
users_tampa = pd.merge(res_users, reviews_tampa[['user_id']], on='user_id', how='inner')
users_tampa = users_tampa.drop_duplicates(subset='user_id', keep='first')
users_tampa.reset_index(drop=True, inplace=True)

In [29]:
users_tampa.head()

Unnamed: 0,user_id,name,review_count,friends,fans,average_stars
0,FT9CFS39sjZxVjCTrDHmdg,Stephanie,201,"ElaFwOTdEpkiYPkHMpOwXQ, nbnftigv3fj8oS6k-waLYQ...",5,3.52
1,OlJ9vcVFB1iEKcZO-MS3cQ,Joz Joz Joz,348,"iBeN7TtSJrq-dLwj-EevFw, oSN3M4_WKdlTsnpgqPDiBg...",116,3.93
2,NIhcRW6DWvk1JQhDhXwgOQ,Lia,2288,"T1upaPMzuW7pNj74fO1rjA, CP28puvAEimt4ziuGTDaHA...",345,3.69
3,rppTTi-kfF8-qyiArNemag,Helen,460,"HzoQKKHDq9BI37dyJAAtGA, Inh7WS8hpmiUmNDhtLK-ZQ...",49,3.33
4,QJI9OSEn6ujRCtrX06vs1w,J,1982,"RyPeT_ICAtX8ah9dhDpEFw, W8r4aKPZFT3GPIQQDbqB6Q...",316,3.61


In [30]:
users_tampa.shape

(115755, 6)

In [32]:
reviews_tampa.to_csv('processed_datasets/tampa/reviews_tampa.csv', index = False)
biz_tampa.to_csv('processed_datasets/tampa/restuarants_tampa.csv', index = False)
users_tampa.to_csv('processed_datasets/tampa/users_tampa.csv', index = False)

### Creating the Item-User Matrix 

In [33]:
reviews_tampa.drop(columns = ['stars', 'processed_text','sentiment_scores'], axis = 1, inplace=True)
res_tampa = biz_tampa[['business_id', 'name', 'stars']]
users_tampa.drop(columns = ['review_count', 'friends', 'fans'], axis = 1, inplace = True)

In [36]:
# Merge reviews_philli with users_philli to get user information
merged_df = pd.merge(reviews_tampa, users_tampa, on='user_id', how='left')

# Merge merged_df with res_philli to get restaurant information
merged_df = pd.merge(merged_df, res_tampa, on='business_id', how='left')

# Select relevant columns for the User-Item Ratings matrix
ratings_df = merged_df[['user_id', 'business_id', 'final_rating']]

# Aggregate duplicate entries by taking the mean of the ratings for each combination of user_id and business_id.
ratings_df = ratings_df.groupby(['user_id', 'business_id']).mean().reset_index()

# Pivot the DataFrame to create the User-Item Ratings matrix
ratings_matrix = ratings_df.pivot(index='user_id', columns='business_id', values='final_rating')

ratings_matrix.head()

business_id,--eBbs3HpZYIym5pEw8Qdw,--pDYWb4DzqKdAdrPcxuaA,--rS-rnOIZxoiDA8yctWpQ,-0oPt7sSKtJG1ysLwV_E9g,-1oygVebK81K8JEPI6H6Lw,-2CPhK6ik9ZBgFX_F-dkxQ,-2dvQxx3cYXd5XmFdDDsDA,-2wh7NTLkWEgsrLJvilnFQ,-34c4hcDPIInTROr8Xtxtw,-361Hc0tlxSYdrH_C3OgzA,...,zjqh_qoBS1BWVSbC51BNjw,znsHfZ2seiCAiO8NGQm8-Q,zo5Ha3Julfnqy3nUKJYimA,zqdwAXPPIu3pRo0kHA6tVQ,zr0g9qn3bf_8c4WTeaI9iw,ztnYPn2w0R4lEJL266apzg,ztppjLmFE25wZOQdJKXAuA,zuObDJ16rxyt5ciCbhKnlA,zwBvdDWFQpYJE1UMDTlSyw,zxtkbSchfaqY1TpZ7KnmKA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---2PmXbF47D870stH1jqA,,,,,,,,,,,...,,,,,,,,,,
--1oZcRo9-QKOtTqREKB6g,,,,,,,,,,,...,,,,,,,,,,
--238OiSJBDIhfqcSOQyMA,,,,,,,,,,,...,,,,,,,,,,
--2bpE5vyR-2hAP7sZZ4lA,,,,,,,,,,,...,,,,,,,,,,
--338aogPBCUUKGRHrB14w,,,,,,,,,,,...,,,,,,,,,,


In [37]:
ratings_matrix.shape

(115757, 3043)

In [38]:
ratings_matrix.to_csv('processed_datasets/tampa/ratings_matrix_tampa.csv')

In [39]:
group_by_state = food_biz.groupby('state').agg({'stars':'mean', 'review_count':'sum'}).reset_index()
group_by_state.sort_values('review_count', ascending = False).reset_index()

Unnamed: 0,index,state,stars,review_count
0,13,PA,3.540849,933691
1,4,FL,3.572522,731186
2,9,LA,3.6578,557668
3,14,TN,3.472484,408565
4,10,MO,3.480798,313708
5,8,IN,3.509836,301801
6,1,AZ,3.484216,241934
7,12,NV,3.576611,235567
8,2,CA,3.881114,209237
9,11,NJ,3.465351,156617


In [59]:
group_by_FL = food_biz[food_biz['state']=='TN'].groupby('city').agg({'stars':'mean', 'review_count':'sum'}).reset_index()
group_by_FL.sort_values('review_count', ascending = False).reset_index().head(30)

Unnamed: 0,index,city,stars,review_count
0,34,Nashville,3.564888,309102
1,12,Franklin,3.539387,34742
2,6,Brentwood,3.441989,13007
3,18,Hendersonville,3.377907,9006
4,42,Smyrna,3.261364,6596
5,19,Hermitage,3.09901,5669
6,0,Antioch,3.005051,4792
7,28,Mount Juliet,3.429348,4647
8,27,Madison,3.005952,3243
9,15,Goodlettsville,3.325843,2988


# New Orleans

In [44]:
biz_new_orleans = food_biz[food_biz['city'] == 'New Orleans']
biz_new_orleans.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,GoodForDancing,BestNights,BYOB,Corkage,BYOBCorkage,AcceptsInsurance,RestaurantsCounterService,Open24Hours,AgesAllowed,DietaryRestrictions
14,w_AMNoI1iG9eay7ncmc67w,River 127,100 Iberville St,New Orleans,LA,70130,29.951359,-90.064672,3.0,12,...,,,,,,,,,,
25,uczmbBk5O3tYhGue13dCDg,New Orleans Spirit Tours,723 St Peter St,New Orleans,LA,70130,29.958431,-90.065173,4.0,38,...,,,,,,,,,,
35,YNjyv0gfOr2g8lbmUpTnKg,Copper Vine,1001 Poydras St,New Orleans,LA,70112,29.950647,-90.074427,4.5,350,...,False,"{'monday': False, 'tuesday': False, 'friday': ...",,,,,,,,
43,TLZ3-eDPLhUzfsWO4ad6Ug,Mahony's Po-Boys & Seafood,901 Iberville St,New Orleans,LA,70112,29.955415,-90.070062,4.0,382,...,,,False,,,,,,,
65,hUQ9Z7kQeabvhPOAQOVV1A,Rathbone Mansions,1244 Esplanade Ave,New Orleans,LA,70116,29.967055,-90.065828,3.5,67,...,,,,,,,,,,


In [45]:
biz_new_orleans.shape

(2699, 51)

In [46]:
reviews_new_orleans = pd.merge(food_reviews, biz_new_orleans[['business_id']], on = 'business_id', how = 'inner')
reviews_new_orleans.head()

Unnamed: 0,review_id,user_id,business_id,stars,processed_text
0,6AxgBCNX_PNTOxmbRSwcKQ,r3zeYsv1XFBRA4dJpL78cw,gmjsEdUsKpj9Xxu6pdjH0g,5,loved tour grabbed groupon price great perfect...
1,GdI6HZ34nn3hM2BSHa8X1Q,bMGflZYLUzyOkbOuqKisQA,gmjsEdUsKpj9Xxu6pdjH0g,5,informative took u haunted place planning see ...
2,GOmxzUrGfC5XGIHfFa0B6Q,ohfjgmu9TiOAur-pVn-zmw,gmjsEdUsKpj9Xxu6pdjH0g,5,tour awesome friend scheduled tour advance gro...
3,cEgUTUvkfUUK7hVCbkxU5Q,YWtt5TPZqGfTKA52cVDa-w,gmjsEdUsKpj9Xxu6pdjH0g,5,chose tour raving review online voodoo bone la...
4,mREBtMW3qRH0vQu23DwNpA,HOzG-IJWXs6C2Lp_Q_RcCw,gmjsEdUsKpj9Xxu6pdjH0g,5,friend went voodoo bone lady infamous city dea...


In [47]:
reviews_new_orleans.shape

(489741, 5)

In [48]:
reviews_new_orleans['processed_text'] = reviews_new_orleans['processed_text'].astype(str)
reviews_new_orleans['sentiment_scores'] = reviews_new_orleans['processed_text'].apply(get_sentiment_scores)
reviews_new_orleans['final_rating'] = reviews_new_orleans['stars'] + reviews_new_orleans['sentiment_scores'] 

In [50]:
users_new_orleans = pd.merge(res_users, reviews_new_orleans[['user_id']], on='user_id', how='inner')
users_new_orleans = users_new_orleans.drop_duplicates(subset='user_id', keep='first')
users_new_orleans.reset_index(drop=True, inplace=True)

In [51]:
users_new_orleans.shape

(208795, 6)

In [52]:
reviews_new_orleans.to_csv('processed_datasets/new_orleans/reviews_tampa.csv', index = False)
biz_new_orleans.to_csv('processed_datasets/new_orleans/restuarants_tampa.csv', index = False)
users_new_orleans.to_csv('processed_datasets/new_orleans/users_tampa.csv', index = False)

### Creating the Item-User Matrix 

In [53]:
reviews_new_orleans.drop(columns = ['stars', 'processed_text','sentiment_scores'], axis = 1, inplace=True)
res_new_orleans = biz_new_orleans[['business_id', 'name', 'stars']]
users_new_orleans.drop(columns = ['review_count', 'friends', 'fans'], axis = 1, inplace = True)

In [54]:
# Merge reviews_philli with users_philli to get user information
merged_df = pd.merge(reviews_new_orleans, users_new_orleans, on='user_id', how='left')

# Merge merged_df with res_philli to get restaurant information
merged_df = pd.merge(merged_df, res_new_orleans, on='business_id', how='left')

# Select relevant columns for the User-Item Ratings matrix
ratings_df = merged_df[['user_id', 'business_id', 'final_rating']]

# Aggregate duplicate entries by taking the mean of the ratings for each combination of user_id and business_id.
ratings_df = ratings_df.groupby(['user_id', 'business_id']).mean().reset_index()

# Pivot the DataFrame to create the User-Item Ratings matrix
ratings_matrix = ratings_df.pivot(index='user_id', columns='business_id', values='final_rating')

ratings_matrix.head()

business_id,--x_BmZbxzK_nx_GHBaRVw,--zb12mw2YK-7j6UaHzm8w,-0__F9fnKt8uioCKztF5Ww,-0ltw8--HLuulPyOSspqAQ,-1XSzguS6XLN-V6MVZMg2A,-4x3pVUUsfWmKEilWKsOZQ,-86Z04IBKxhEQ17rCOkn8g,-9ZNA22YhFlvTdLDYjvNdA,-9yzQQ0d_rcOD2CzdTNO_Q,-A2OLubXDsMRPNN7LqohPA,...,znmW7djuEJjlMI0hw-utEw,zqGEI72ihSB6uipAlE9opQ,zqMftGuxwL8mwQvy2nq9Zg,zr7vspTJPNgxP-j90bc_wg,ztcOE7NN0WdleNOCbje8ng,zvGNZF827KyzLupKiG4Xtw,zwe9H6Xxqb1_E09A20Ptgg,zxIF-bnaJ-eKIsznB7yu7A,zzbZtgPYZS8sTIWQH6DwEw,zznZqH9CiAznbkV6fXyHWA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---UgP94gokyCDuB5zUssA,,,,,,,,,,,...,,,,,,,,,,
--1FFSF8N5TtSfzH7We4NA,,,,,,,,,,,...,,,,,,,,,,
--1orhUoGFSdHXsoxqQc8g,,,,,,,,,,,...,,,,,,,,,,
--3PldvWEZ_bhLNsyrAtgA,,,,,,,,,,,...,,,,,,,,,,
--3WaS23LcIXtxyFULJHTA,,,,,,,,,,,...,,,,,,,,,,


In [55]:
ratings_matrix.shape

(208796, 2699)

In [56]:
ratings_matrix.to_csv('processed_datasets/new_orleans/ratings_matrix_new_orleans.csv')

# Nashville