In [3]:
import numpy as np
import pandas as pd
from pathlib import Path

In [4]:
# Root folder of the extracted Yelp dataset, relative to the notebook.
base_path = Path("../data/Yelp-JSON/Yelp-JSON/yelp_dataset")

# Business table -> `df`.
# NOTE(review): the review file further down is read with lines=True; if this
# file is newline-delimited JSON as well, this call needs lines=True too — confirm.
business_json = base_path / "yelp_academic_dataset_business.json"
df = pd.read_json(business_json)

In [5]:
# Distinct values of the `city` column, in ascending order.
# np.sort returns a sorted copy, so no separate in-place sort step is needed.
cities = np.sort(df['city'].unique())

In [6]:
# Write each distinct city name on its own line so spelling, case and
# whitespace variants can be inspected by eye.
# The length guard keeps an empty collection from emitting a blank line.
if len(cities) > 0:
    print(*cities, sep="\n")

AB Edmonton
AMBLER
ARDMORE
AVON
Abington
Abington Township
Affton
Afton
Alberta Park Industrial
Aldan
Algiers
Aliso Viejo
Alloway
Almonesson
Alton
Ambler
Andalusia
Antioch
Apollo Beach
Apollo beach
Apopka
Arabi
Arden
Ardmore
Arizona
Arnold
Arrington
Ashland
Ashland City
Aston
Atco
Audubon
Audubon 
Austin
Avon
Avondale
BOISE
BOISE AP
BRANDON
Bala Cynwyd
Ballwin
Balm
Bargersville
Barnhart
Barrington
Barto
Bayonet Point
Bear
Beaumont
Beech Grove
Beech Grove,
Bel Ridge
Bellair
Belle Chase
Belle Chasse
Belle Meade
Belleair
Belleair Beach
Belleair Blf
Belleair Bluffs
Belleair Blufs
Bellefontaine
Bellefontaine Neighbors
Bellefonte
Belleville
Belleville 
Bellevue
Bellmawr
Bellville
Belmont Hills
Bennington
Bensalem
Bensalem Township
Bensalem. Pa
Berkeley
Berlin
Berlin Boro
Berlin Township
Berry Hill
Berwyn
Bethalto
Bethel
Bethel Township
Beverly
Birchrunville
Black Jack
Blackwood
Blackwood 
Blooming Glen
Blue Bell
Blvd
Boise
Boise 
Boise (Meridian)
Boise City
Boone
Boothwyn
Bordentown
Bordento

In [7]:
# City names to keep. This rebinds `cities`, which until now held the sorted
# array of every distinct city in the business table.
cities = ['Berkeley', 'Santa Clara']

# Rows whose `city` exactly matches one of the names above. The list is read
# from `cities` (single source of truth) rather than repeated as a literal.
# NOTE: only the city column is filtered here — no state or category filter
# is applied, and spelling/case/whitespace variants are not matched.
bay_area_restaurants = df[df['city'].isin(cities)]

In [8]:
# Summarize the filtered frame: row count, per-column non-null counts, dtypes
# and memory footprint.
bay_area_restaurants.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11 entries, 2072 to 130858
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   11 non-null     object 
 1   name          11 non-null     object 
 2   address       11 non-null     object 
 3   city          11 non-null     object 
 4   state         11 non-null     object 
 5   postal_code   11 non-null     object 
 6   latitude      11 non-null     float64
 7   longitude     11 non-null     float64
 8   stars         11 non-null     float64
 9   review_count  11 non-null     int64  
 10  is_open       11 non-null     int64  
 11  attributes    11 non-null     object 
 12  categories    11 non-null     object 
 13  hours         9 non-null      object 
dtypes: float64(3), int64(2), object(9)
memory usage: 1.3+ KB


In [9]:
# Summarize the full business table (`df` as loaded from the business JSON):
# row count, per-column non-null counts, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float64
 7   longitude     150346 non-null  float64
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int64  
 10  is_open       150346 non-null  int64  
 11  attributes    136602 non-null  object 
 12  categories    150243 non-null  object 
 13  hours         127123 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 16.1+ MB


In [10]:
chunk_size = 10000  # Rows parsed per chunk; adjust based on available memory

# Stream the newline-delimited review file in chunks and stitch the chunks
# into one frame with a fresh 0..n-1 index.
#
# The reader is used as a context manager so the file handle is released even
# if parsing fails part-way, and the chunks are fed to pd.concat directly so no
# module-level list keeps a second copy of the data alive after concatenation.
#
# NOTE: this rebinds `df`, which previously held the business table; cells
# above that read business columns from `df` will not work if re-run after
# this one.
with pd.read_json(
    base_path / "yelp_academic_dataset_review.json",
    lines=True,
    chunksize=chunk_size,
) as reader:
    df = pd.concat(reader, ignore_index=True)

In [11]:
# Total footprint of `df` in bytes; deep=True also measures the contents of
# object columns instead of just their pointers.
total_bytes = df.memory_usage(deep=True).sum()
print(total_bytes / 1e9, "GB")  # Report in GB (1e9 bytes)

6.316121498 GB


In [14]:
# Number of distinct values in `user_id`; dropna=False counts a missing value
# (if any) as its own entry, matching the length of .unique().
df['user_id'].nunique(dropna=False)

1987929

In [15]:
# Summarize the frame currently bound to `df` (the review table at this point
# in the notebook): row count, column dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        int64         
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 480.0+ MB


In [21]:
# Coerce `date` to a datetime dtype, then work from a single handle on the column.
df['date'] = pd.to_datetime(df['date'])
review_dates = df['date']

# Range of the timestamps.
min_date, max_date = review_dates.min(), review_dates.max()

# Central tendency.
median_date = review_dates.median()
mean_date = review_dates.mean()

# First and third quartiles (25th and 75th percentiles).
q1_date, q3_date = (review_dates.quantile(q) for q in (0.25, 0.75))

In [22]:
# Report the date statistics, one "label value" pair per line, in a fixed order.
date_summary = (
    ("Min Date:", min_date),
    ("Max Date:", max_date),
    ("Median Date:", median_date),
    ("Mean Date:", mean_date),
    ("1st Quartile (Q1):", q1_date),
    ("3rd Quartile (Q3):", q3_date),
)
for label, value in date_summary:
    print(label, value)

Min Date: 2005-02-16 03:23:22
Max Date: 2022-01-19 19:48:45
Median Date: 2017-06-03 01:26:07
Mean Date: 2017-01-11 11:22:33.441780992
1st Quartile (Q1): 2015-01-25 04:53:50.249999872
3rd Quartile (Q3): 2019-05-23 00:02:46.249999872
