In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [3]:
data_file = open("yelp_academic_dataset_checkin.json")
data = []
for line in data_file:
  data.append(json.loads(line))
checkin_df = pd.DataFrame(data)
data_file.close()

In [4]:
data_file = open("yelp_academic_dataset_business.json")
data = []
for line in data_file:
  data.append(json.loads(line))
business_df = pd.DataFrame(data)
data_file.close()

In [5]:
data_file = open("yelp_academic_dataset_review.json")
data = []
for line in data_file:
  data.append(json.loads(line))
review_df = pd.DataFrame(data)
data_file.close()

In [6]:
data_file = open("yelp_academic_dataset_tip.json")
data = []
for line in data_file:
  data.append(json.loads(line))
tip_df = pd.DataFrame(data)
data_file.close()

In [7]:
data_file = open("yelp_academic_dataset_user.json")
data = []
for line in data_file:
  data.append(json.loads(line))
user_df = pd.DataFrame(data)
data_file.close()

In [8]:
print(business_df["categories"])

0         Doctors, Traditional Chinese Medicine, Naturop...
1         Shipping Centers, Local Services, Notaries, Ma...
2         Department Stores, Shopping, Fashion, Home & G...
3         Restaurants, Food, Bubble Tea, Coffee & Tea, B...
4                                 Brewpubs, Breweries, Food
                                ...                        
150341                           Nail Salons, Beauty & Spas
150342    Pets, Nurseries & Gardening, Pet Stores, Hobby...
150343    Shopping, Jewelry, Piercing, Toy Stores, Beaut...
150344    Fitness/Exercise Equipment, Eyewear & Optician...
150345    Beauty & Spas, Permanent Makeup, Piercing, Tattoo
Name: categories, Length: 150346, dtype: object


In [9]:
def split_and_explode(row):
    categories = row['categories']
    if categories is not None and categories.strip() != '':
        categories = categories.split(', ')
        return pd.Series({'business_id': row['business_id'], 'business_category': categories})
    else:
        return pd.Series({'business_id': row['business_id'], 'business_category': []})

# Apply the function to split and explode categories
business_categories = business_df.apply(split_and_explode, axis=1)

# Explode the list of business categories
business_categories = business_categories.explode('business_category')

# Display the resulting DataFrame
print(business_categories)

                   business_id             business_category
0       Pns2l4eNsfO8kk83dixA6A                       Doctors
0       Pns2l4eNsfO8kk83dixA6A  Traditional Chinese Medicine
0       Pns2l4eNsfO8kk83dixA6A         Naturopathic/Holistic
0       Pns2l4eNsfO8kk83dixA6A                   Acupuncture
0       Pns2l4eNsfO8kk83dixA6A              Health & Medical
...                        ...                           ...
150344  mtGm22y5c2UHNXDFAjaPNw                         Bikes
150345  jV_XOycEzSlTx-65W906pg                 Beauty & Spas
150345  jV_XOycEzSlTx-65W906pg              Permanent Makeup
150345  jV_XOycEzSlTx-65W906pg                      Piercing
150345  jV_XOycEzSlTx-65W906pg                        Tattoo

[668695 rows x 2 columns]


In [10]:
def extract_and_explode(row):
    attributes = row['attributes']
    if isinstance(attributes, dict):
        attribute_list = [key for key, value in attributes.items() if value]
        return pd.Series({'business_id': row['business_id'], 'business_attribute': attribute_list})
    else:
        return pd.Series({'business_id': row['business_id'], 'business_attribute': []})

# Apply the function to extract and explode attributes
business_attributes = business_df.apply(extract_and_explode, axis=1)

# Explode the list of business attributes
business_attributes = business_attributes.explode('business_attribute')

# Display the resulting DataFrame
print(business_attributes)

                   business_id          business_attribute
0       Pns2l4eNsfO8kk83dixA6A           ByAppointmentOnly
1       mpf3x-BjTdTEA3yCZrAYPw  BusinessAcceptsCreditCards
2       tUFrWirKiKi_TAnsVWINQQ                 BikeParking
2       tUFrWirKiKi_TAnsVWINQQ  BusinessAcceptsCreditCards
2       tUFrWirKiKi_TAnsVWINQQ      RestaurantsPriceRange2
...                        ...                         ...
150345  jV_XOycEzSlTx-65W906pg  BusinessAcceptsCreditCards
150345  jV_XOycEzSlTx-65W906pg             BusinessParking
150345  jV_XOycEzSlTx-65W906pg                 BikeParking
150345  jV_XOycEzSlTx-65W906pg                        WiFi
150345  jV_XOycEzSlTx-65W906pg           ByAppointmentOnly

[1220564 rows x 2 columns]


In [11]:
rows = []

# Define the days of the week
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Iterate over rows in the DataFrame
for _, row in business_df.iterrows():
    business_id = row['business_id']
    hours = row['hours']
    
    if hours is not None:
        for day_of_week in days_of_week:
            time_range = hours.get(day_of_week)
            if time_range:
                open_time, close_time = time_range.split('-')
                rows.append({
                    'business_id': business_id,
                    'day_of_week': day_of_week,
                    'open_time': open_time,
                    'close_time': close_time
                })

# Create a DataFrame from the list of rows
business_hours = pd.DataFrame(rows)

# Display the resulting DataFrame
print(business_hours)

                   business_id day_of_week open_time close_time
0       mpf3x-BjTdTEA3yCZrAYPw      Monday       0:0        0:0
1       mpf3x-BjTdTEA3yCZrAYPw     Tuesday       8:0      18:30
2       mpf3x-BjTdTEA3yCZrAYPw   Wednesday       8:0      18:30
3       mpf3x-BjTdTEA3yCZrAYPw    Thursday       8:0      18:30
4       mpf3x-BjTdTEA3yCZrAYPw      Friday       8:0      18:30
...                        ...         ...       ...        ...
801010  jV_XOycEzSlTx-65W906pg     Tuesday      12:0       19:0
801011  jV_XOycEzSlTx-65W906pg   Wednesday      12:0       19:0
801012  jV_XOycEzSlTx-65W906pg    Thursday      12:0       19:0
801013  jV_XOycEzSlTx-65W906pg      Friday      12:0       19:0
801014  jV_XOycEzSlTx-65W906pg    Saturday      12:0       19:0

[801015 rows x 4 columns]


In [12]:
# QUERY 1
filtered_business = business_df[(business_df['city'] == 'Philadelphia') & (business_df['business_id'].isin(business_categories[business_categories['business_category'] == 'Restaurants']['business_id']))]

# Group and count business categories
result = filtered_business[filtered_business['business_id'].isin(business_categories['business_id'])].groupby('categories').size().reset_index(name='count')

# Sort by count in descending order and limit to top 10
result = result.sort_values(by='count', ascending=False).head(10)

print(result)

                       categories  count
3479           Restaurants, Pizza    154
2644           Pizza, Restaurants    124
3006         Restaurants, Chinese     89
976          Chinese, Restaurants     84
3324         Restaurants, Italian     43
3402         Restaurants, Mexican     42
2306         Mexican, Restaurants     39
2076         Italian, Restaurants     39
2746  Restaurants, American (New)     36
87    American (New), Restaurants     34


In [13]:
#QUERY 2
merged_df = pd.merge(business_df, business_categories, on='business_id')

# Filter businesses in Philadelphia with the 'Restaurants' category
filtered_df = merged_df[(merged_df['city'] == 'Philadelphia') & (merged_df['business_category'] == 'Restaurants')]

# Group by restaurant names and calculate average rating and total review count
result = filtered_df.groupby('name').agg({
    'stars': 'mean',
    'review_count': 'sum'
}).reset_index()

# Sort by total reviews and average rating in descending order
result = result.sort_values(by=['review_count', 'stars'], ascending=[False, False])

# Limit to the top 10 restaurants
result = result.head(10)

print(result)

                         name   stars  review_count
3632  Reading Terminal Market  4.5000          5721
3317     Pat's King of Steaks  3.0000          4250
3801           Sabrina's Café  4.0000          3730
1718          Green Eggs Café  3.7500          3531
1614            Geno's Steaks  1.7500          3406
1327                   El Vez  4.0000          3187
5032                    Zahav  4.5000          3065
1166           Dim Sum Garden  4.0000          3049
353                  Barbuzzo  4.5000          2893
1420           Federal Donuts  4.1875          2811


In [14]:
#QUERY 3
merged_df = pd.merge(business_df, business_categories, on='business_id')

# Filter businesses in Philadelphia with the 'Restaurants' category
filtered_df = merged_df[(merged_df['city'] == 'Philadelphia') & (merged_df['business_category'] == 'Restaurants')]

# Select restaurant name and total review count
result = filtered_df[['name', 'review_count']]

# Sort by total reviews in descending order
result = result.sort_values(by='review_count', ascending=False)

# Limit to the top 10 restaurants
result = result.head(10)

print(result)

                                  name  review_count
636969         Reading Terminal Market          5721
506489            Pat's King of Steaks          4250
254368                   Geno's Steaks          3401
585425                          El Vez          3187
138595                           Zahav          3065
288179                        Barbuzzo          2893
89200                             Parc          2761
237140                  Jim's South St          2736
163293  Dalessandro’s Steaks & Hoagies          2686
664178                 Green Eggs Café          2679


In [15]:
#QUERY 4
merged_df = pd.merge(business_df, business_categories, on='business_id')

# Filter businesses in Philadelphia with the 'Restaurants' category
filtered_df = merged_df[(merged_df['city'] == 'Philadelphia') & (merged_df['business_category'] == 'Restaurants')]

# Create a new column 'Restaurant_Status' based on 'is_open'
filtered_df['Restaurant_Status'] = filtered_df['is_open'].apply(lambda x: 'Open' if x == 1 else 'Closed')

# Group by 'Restaurant_Status' and count total occurrences
result = filtered_df.groupby('Restaurant_Status').size().reset_index(name='Total_Count')

# Rename the columns to match the SQL query
result = result.rename(columns={'Restaurant_Status': 'Restaurant_Status', 'Total_Count': 'Total_Count'})

print(result)

  Restaurant_Status  Total_Count
0            Closed         2327
1              Open         3525


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Restaurant_Status'] = filtered_df['is_open'].apply(lambda x: 'Open' if x == 1 else 'Closed')


In [16]:
#QUERY 5
from datetime import datetime
merged_df = pd.merge(business_df, business_hours, on='business_id')
merged_df = pd.merge(merged_df, business_categories, on='business_id')

# Filter businesses in Philadelphia with the 'Restaurants' category
filtered_df = merged_df[(merged_df['city'] == 'Philadelphia') & (filtered_df['business_category'] == 'Restaurants')]

# Function to calculate hours open for each business
def calculate_hours_open(row):
    open_time = datetime.strptime(row['open_time'], '%H:%M')
    close_time = datetime.strptime(row['close_time'], '%H:%M')

    if close_time >= open_time:
        time_diff = (close_time - open_time).seconds / 3600  # Convert seconds to hours
    else:
        end_of_day = datetime.strptime('23:59', '%H:%M')
        time_diff = ((end_of_day - open_time).seconds + (close_time - datetime.strptime('00:00', '%H:%M')).seconds) / 3600  # Convert seconds to hours
    return time_diff

# Calculate hours open for each business
filtered_df['hours_open'] = filtered_df.apply(calculate_hours_open, axis=1)

# Calculate the average hours open for all businesses
average_hours_open = filtered_df['hours_open'].mean()

print("Average Hours Open:", average_hours_open)

Average Hours Open: 9.131488203266787


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['hours_open'] = filtered_df.apply(calculate_hours_open, axis=1)


In [17]:
#QUERY 6
merged_df = pd.merge(business_df, checkin_df, on='business_id')
merged_df = pd.merge(merged_df, business_categories, on='business_id')

# Filter businesses in Philadelphia with the 'Restaurants' category
filtered_df = merged_df[(merged_df['city'] == 'Philadelphia') & (filtered_df['business_category'] == 'Restaurants')]

# Function to calculate total check-ins
def calculate_total_checkins(row):
    return len(row['date'].split(','))

# Calculate total check-ins for each restaurant
filtered_df['Total_Checkins'] = filtered_df.apply(calculate_total_checkins, axis=1)

# Sort the DataFrame by 'Total_Checkins' in descending order
sorted_df = filtered_df.sort_values(by='Total_Checkins', ascending=False)

# Select the top 10 restaurants with the highest total check-ins
top_10_restaurants = sorted_df.head(10)

print(top_10_restaurants[['name', 'Total_Checkins']])

                                name  Total_Checkins
163293       Sweet Lucy's Smokehouse            1175
87                               BAP             221
72                          Tuna Bar             172
227812    World Wide Aquarium & Pets              30
263501            CrossFit Fairmount              29
52146   New Lee's Chinese Restaurant              20
570322         Alif Brew & Mini Mart              11
260698   O Rei Da Picanha Steakhouse               8
206756             Callowhill Greens               7


  filtered_df = merged_df[(merged_df['city'] == 'Philadelphia') & (filtered_df['business_category'] == 'Restaurants')]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Total_Checkins'] = filtered_df.apply(calculate_total_checkins, axis=1)


In [18]:
print(tip_df["text"])

0                            Avengers time with the ladies.
1         They have lots of good deserts and tasty cuban...
2                    It's open even when you think it isn't
3                                 Very decent fried chicken
4                    Appetizers.. platter special for lunch
                                ...                        
908910                Disappointed in one of your managers.
908911                              Great food and service.
908912                                  Love their Cubans!!
908913                              Great pizza great price
908914                    Food is good value but a bit hot!
Name: text, Length: 908915, dtype: object


In [19]:
#QUERY 7
user_avg_ratings = review_df.groupby('user_id')['stars'].mean().reset_index()

# Sort users based on their average review rating in descending order
sorted_users = user_avg_ratings.sort_values(by='stars', ascending=False)

# Select the top reviewers with the highest average ratings
top_reviewers = sorted_users.head(10)

# Print the top reviewers
print("Top Reviewers:")
print(top_reviewers)

Top Reviewers:
                        user_id  stars
0        ---1lKK3aKOuomHnwAkAow    5.0
1027402  W17zbNSLF0DW2uu0Oy4IXA    5.0
1027373  W13JwV5DJCYds57kiu4h-A    5.0
1027375  W13OsqHpHSAtbMK1De9XXQ    5.0
1027380  W14KPPpnz3xi5Eqn0Q38SA    5.0
1027382  W14SFFHOCZa6GQB5s8Gh5Q    5.0
1027383  W14ZeF164SCqZ6QRniclmQ    5.0
1027385  W14kmIm0kGWa9eDPE_JFjQ    5.0
1027387  W15-0kDbkFCYxNpDcaDEuQ    5.0
1027389  W152UVHDpb237E71KMiArA    5.0


In [20]:
#QUERY 8
filtered_business = business_df[(business_df['city'] == 'Philadelphia') & (business_df['categories'].str.contains('restaurants', case=False, regex=True))]

# Merge the 'filtered_business' DataFrame with the 'tip' DataFrame
merged_df = pd.merge(tip_df, filtered_business, on='business_id', how='inner')

# Group by restaurant categories and count the number of tips
category_engagement = merged_df.groupby('categories')['business_id'].count().reset_index()
category_engagement.rename(columns={'categories': 'Restaurant_Category', 'business_id': 'Number_ofTips'}, inplace=True)

# Sort by 'Number_ofTips' in descending order
category_engagement = category_engagement.sort_values(by='Number_ofTips', ascending=False)

# Print the restaurant categories with the highest number of tips
print("Restaurant Categories with the Most Tips:")
print(category_engagement)

Restaurant Categories with the Most Tips:
                                    Restaurant_Category  Number_ofTips
761   Candy Stores, Shopping, Department Stores, Fas...            827
2422                                 Pizza, Restaurants            703
3187                                 Restaurants, Pizza            700
3041                               Restaurants, Italian            573
3117                               Restaurants, Mexican            567
...                                                 ...            ...
3325  Restaurants, Southern, Fast Food, Chicken Wing...              1
2752                 Restaurants, Chinese, Asian Fusion              1
2500  Puerto Rican, Spanish, Cuban, Caribbean, Resta...              1
3328          Restaurants, Spanish, Seafood, Portuguese              1
3556  Sandwiches, Restaurants, Coffee & Tea, Food, V...              1

[3960 rows x 2 columns]


In [21]:
#QUERY 9
filtered_business = business_df[business_df['city'] == 'Philadelphia']

# Merge the 'filtered_business' DataFrame with the 'tip' DataFrame using an inner join
merged_df = filtered_business.merge(tip_df, on='business_id', how='inner')

# Group by restaurant names and count the number of tips
result = merged_df.groupby('name')['business_id'].count().reset_index()
result.rename(columns={'name': 'Restaurant_Name', 'business_id': 'NumberOfTips'}, inplace=True)

# Sort by 'NumberOfTips' in descending order and limit to the top 10
result = result.sort_values(by='NumberOfTips', ascending=False).head(10)

print(result)

                               Restaurant_Name  NumberOfTips
6095  Philadelphia International Airport - PHL          1011
6605                   Reading Terminal Market           827
7573                                 Starbucks           688
6963                            Sabrina's Café           499
2024            Dalessandro’s Steaks & Hoagies           460
3207                           Green Eggs Café           421
5889                      Pat's King of Steaks           400
2307                                   Dunkin'           393
2179                            Dim Sum Garden           382
3328                               Han Dynasty           380


In [29]:
filtered_business = business_df[(business_df['city'] == 'Philadelphia') & (business_df['categories'].str.contains('restaurants', case=False, regex=True))]

# Calculate the average rating for each restaurant
restaurant_avg_ratings = review_df.groupby('business_id')['stars'].mean().reset_index()

# Sort the restaurants by average rating in ascending order
sorted_restaurants = restaurant_avg_ratings.sort_values(by='stars')

# Select the 10 lowest-rated restaurants
lowest_rated_restaurants = sorted_restaurants.head(10)

# Filter reviews for the 10 lowest-rated restaurants
reviews_for_lowest_rated = pd.merge(review_df, lowest_rated_restaurants, on='business_id', how='inner')

# Display reviews for the 10 lowest-rated restaurants
print(reviews_for_lowest_rated)

                 review_id                 user_id             business_id  \
0   Hr_-94dbvGOulwXoVC0iXQ  rVo3owIvz-iJhBb9A9TXLg  TGuMpvsCKuAcGER8cGXhWg   
1   gR0cs1X8aBKCMowmIEdopw  mzUTfAvbxEfc9lqjl7BtOg  TGuMpvsCKuAcGER8cGXhWg   
2   xuI0hFE6WPPwQFrJSfQ2AA  uBtcU_YD2rMYawa_Rc-fpw  TGuMpvsCKuAcGER8cGXhWg   
3   mGyOuES3YOSFHoHkCJgDMw  j5YS-J6kytVelaMYUKFNlg  TGuMpvsCKuAcGER8cGXhWg   
4   FD2zBDL98jFhoM087rCFzw  yPM6YRXJ36yPej_XKjtlBw  TGuMpvsCKuAcGER8cGXhWg   
..                     ...                     ...                     ...   
93  41xWoFIy5ltVAZfEe8meqA  VFR3Q5Uk3CfEQXAW6nqxUw  TGts2jHdAF0MXllTBuUaIg   
94  ASLg93KQesMCtV04GFlD5Q  m8HWHV3wFkYe85I4K-b6Gw  TGts2jHdAF0MXllTBuUaIg   
95  xbL9I_2RkQv1hp4Vcctxug  LREYVbQ2KkzmpJhQOfMrgQ  TGts2jHdAF0MXllTBuUaIg   
96  sVhfL_sOcENQEkTGi-zE0w  eJDcMASAcKB_fYXug5SH0g  TGts2jHdAF0MXllTBuUaIg   
97  gU48OdJyZY38J3EK6Cdq2Q  wVSz-wislRKONqDCWcMnhQ  TGts2jHdAF0MXllTBuUaIg   

    stars_x  useful  funny  cool  \
0       1.0       0      0 