In [1]:
import psycopg

# Connect to your PostgreSQL database.
# Make sure it is already running on your computer.
# (This is done by default on JupyterHub)
#
# The connection and the cursor are both used as context managers. In
# psycopg 3, leaving the connection block commits the open transaction (or
# rolls it back if an exception escaped) and then closes the connection, so
# nothing is left open when one of the statements below fails.
with psycopg.connect("postgresql://postgres:[password]@localhost/yelp_dataset") as conn:
    with conn.cursor() as cur:
        # Create the tables. IF NOT EXISTS makes this cell safe to re-run.
        cur.execute("""
            CREATE TABLE IF NOT EXISTS businesses (
                business_id VARCHAR PRIMARY KEY,
                name VARCHAR,
                city VARCHAR,
                stars DECIMAL,
                review_count INT,
                categories TEXT
            )
        """)

        # reviews.business_id references businesses, so businesses must be
        # created first.
        cur.execute("""
            CREATE TABLE IF NOT EXISTS reviews (
                review_id VARCHAR PRIMARY KEY,
                business_id VARCHAR REFERENCES businesses(business_id),
                user_id VARCHAR,
                stars DECIMAL,
                text TEXT,
                date DATE
            )
        """)

        cur.execute("""
            CREATE TABLE IF NOT EXISTS users (
                user_id VARCHAR PRIMARY KEY,
                name VARCHAR,
                review_count INT,
                avg_stars DECIMAL
            )
        """)

    # Commit the transaction explicitly; the connection itself is closed when
    # the enclosing `with` block ends.
    conn.commit()

In [2]:
import psycopg
import json
import random


business_file = "/Users/aidanajakulina/Downloads/yelp_dataset/yelp_academic_dataset_business.json"
review_file = "/Users/aidanajakulina/Downloads/yelp_dataset/yelp_academic_dataset_review.json"
user_file = "/Users/aidanajakulina/Downloads/yelp_dataset/yelp_academic_dataset_user.json"

# Upper bound on how many businesses are sampled.
SAMPLE_SIZE = 10000


def _iter_json_lines(path):
    """Lazily yield one parsed JSON object per line of the file at ``path``.

    The file is decoded as UTF-8 rather than with the platform's default
    encoding. Lines are parsed one at a time, so the whole file never has to
    be held in memory. A line that is not valid JSON raises
    ``json.JSONDecodeError``.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            yield json.loads(line)


# Fixed seed so that re-running the cell draws the same sample.
random.seed(42)

# load and sample businesses
businesses = list(_iter_json_lines(business_file))

# random.sample raises ValueError when asked for more items than exist,
# so the request is capped at the number of businesses actually read.
sampled_businesses = random.sample(businesses, min(len(businesses), SAMPLE_SIZE))

business_data = [
    (
        business["business_id"],
        business.get("name", None),
        business.get("city", None),
        business.get("stars", None),
        business.get("review_count", None),
        business.get("categories", None)
    )
    for business in sampled_businesses
]

# stream the reviews and keep only those of sampled businesses
sampled_business_ids = {b["business_id"] for b in sampled_businesses}
filtered_reviews = [
    review for review in _iter_json_lines(review_file)
    if review["business_id"] in sampled_business_ids
]

# stream the users and keep only the authors of the kept reviews
sampled_user_ids = {review["user_id"] for review in filtered_reviews}
filtered_users = [
    user for user in _iter_json_lines(user_file)
    if user["user_id"] in sampled_user_ids
]

user_data = [
    (
        user["user_id"],
        user.get("name", None),
        user.get("review_count", None),
        user.get("average_stars", None)
    )
    for user in filtered_users
]

# filter reviews again so every kept review has a user that was found
valid_user_ids = {user["user_id"] for user in filtered_users}
filtered_reviews = [
    review for review in filtered_reviews if review["user_id"] in valid_user_ids
]

review_data = [
    (
        review["review_id"],
        review["business_id"],
        review["user_id"],
        review.get("stars", None),
        review.get("text", None),
        review.get("date", None)
    )
    for review in filtered_reviews
]

# All file parsing is finished before the database is touched, so the
# transaction that truncates and refills the tables stays short. The
# connection is a context manager: on an exception the transaction is rolled
# back and the connection closed instead of being left open.
with psycopg.connect("postgresql://postgres:[password]@localhost/yelp_dataset") as conn:
    with conn.cursor() as cur:
        # start fresh every time this code runs
        cur.execute("TRUNCATE TABLE businesses CASCADE;")
        cur.execute("TRUNCATE TABLE reviews CASCADE;")
        cur.execute("TRUNCATE TABLE users CASCADE;")

        # put sampled businesses into the database
        cur.executemany("""
            INSERT INTO businesses (business_id, name, city, stars, review_count, categories)
            VALUES (%s, %s, %s, %s, %s, %s)
        """, business_data)

        # insert filtered users
        cur.executemany("""
            INSERT INTO users (user_id, name, review_count, avg_stars)
            VALUES (%s, %s, %s, %s)
        """, user_data)

        # insert filtered reviews
        cur.executemany("""
            INSERT INTO reviews (review_id, business_id, user_id, stars, text, date)
            VALUES (%s, %s, %s, %s, %s, %s)
        """, review_data)

    conn.commit()

print("Data loading complete")


Data loading complete


In [4]:
import psycopg
import json
import random

# Input files for the three tables.
business_file = "/Users/aidanajakulina/Downloads/yelp_dataset/yelp_academic_dataset_business.json"
review_file = "/Users/aidanajakulina/Downloads/yelp_dataset/yelp_academic_dataset_review.json"
user_file = "/Users/aidanajakulina/Downloads/yelp_dataset/yelp_academic_dataset_user.json"

# Open the database session used by the rest of this cell.
conn = psycopg.connect("postgresql://postgres:[password]@localhost/yelp_dataset")
cur = conn.cursor()

# Empty all three tables, in the same order as before, so every run starts
# from a clean slate.
for _table in ("businesses", "reviews", "users"):
    cur.execute(f"TRUNCATE TABLE {_table} CASCADE;")

def stream_json(file_path):
    """Lazily yield the object decoded from each JSON line of ``file_path``.

    Args:
        file_path: Path of a text file holding one JSON document per line.

    Yields:
        The value returned by ``json.loads`` for each line that parses.

    Lines that fail to parse are not fatal: a message with the 1-based line
    number and the decode error is printed and the line is skipped.
    """
    # Pin the encoding to UTF-8 instead of relying on the platform default,
    # which is not UTF-8 on every system and would make decoding of
    # non-ASCII text depend on the machine the notebook runs on.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, start=1):
            try:
                yield json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping malformed line {line_num}: {e}")

# Everything that can fail (file parsing, inserts, commit) runs inside
# try/finally so the cursor and connection are always closed. Closing a
# connection without committing discards the open transaction, so a failed
# run does not leave the TRUNCATE statements issued above pending on an open
# session.
try:
    # Fixed seed so that re-running the cell draws the same sample.
    random.seed(42)
    businesses = list(stream_json(business_file))
    # Cap the sample size: random.sample raises ValueError when asked for
    # more items than the population holds.
    sampled_businesses = random.sample(businesses, min(len(businesses), 10000))

    business_data = [
        (
            business["business_id"],
            business.get("name"),
            business.get("city"),
            business.get("stars"),
            business.get("review_count"),
            business.get("categories")
        )
        for business in sampled_businesses
    ]

    cur.executemany("""
        INSERT INTO businesses (business_id, name, city, stars, review_count, categories)
        VALUES (%s, %s, %s, %s, %s, %s)
    """, business_data)

    # Keep only reviews whose business was sampled; the review file is
    # consumed lazily, so only the matching reviews are retained.
    sampled_business_ids = {b["business_id"] for b in sampled_businesses}
    filtered_reviews = [
        review for review in stream_json(review_file)
        if review["business_id"] in sampled_business_ids
    ]

    review_data = [
        (
            review["review_id"],
            review["business_id"],
            review["user_id"],
            review.get("stars"),
            review.get("text"),
            review.get("date")
        )
        for review in filtered_reviews
    ]

    cur.executemany("""
        INSERT INTO reviews (review_id, business_id, user_id, stars, text, date)
        VALUES (%s, %s, %s, %s, %s, %s)
    """, review_data)

    # Keep only the users who wrote at least one of the retained reviews.
    sampled_user_ids = {review["user_id"] for review in filtered_reviews}
    filtered_users = [
        user for user in stream_json(user_file)
        if user["user_id"] in sampled_user_ids
    ]
    user_data = [
        (
            user["user_id"],
            user.get("name"),
            user.get("review_count"),
            user.get("average_stars")
        )
        for user in filtered_users
    ]

    cur.executemany("""
        INSERT INTO users (user_id, name, review_count, avg_stars)
        VALUES (%s, %s, %s, %s)
    """, user_data)

    conn.commit()
finally:
    cur.close()
    conn.close()

# Only reached when the load and the commit succeeded.
print("Data loading complete")

Data loading complete


In [5]:
import psycopg


def _preview(cur, heading, query):
    """Run ``query`` on ``cur``, print ``heading`` and each row, and return the rows.

    Args:
        cur: An open database cursor.
        heading: Text printed on its own line before the rows.
        query: SQL statement to execute.

    Returns:
        The list of rows returned by ``cur.fetchall()``.
    """
    cur.execute(query)
    rows = cur.fetchall()
    print(heading)
    for row in rows:
        print(row)
    return rows


# The connection and cursor are context managers, so both are closed even
# when one of the queries fails.
with psycopg.connect("postgresql://postgres:[password]@localhost/yelp_dataset") as conn:
    with conn.cursor() as cur:
        businesses = _preview(cur, "Businesses:", "SELECT * FROM businesses LIMIT 10;")
        reviews = _preview(cur, "Reviews:", "SELECT * FROM reviews LIMIT 10;")
        users = _preview(cur, "Users:", "SELECT * FROM users LIMIT 10;")


Businesses:
('gRAVJyM4FmXfAGRPouRDIQ', 'Michaels', 'Pinellas Park', Decimal('3'), 15, 'Shopping, Arts & Crafts, Hobby Shops, Knitting Supplies, Art Supplies')
('5D-Gjagh385iopO3xqjjIQ', "McDonald's", 'Caseyville', Decimal('1.5'), 37, 'Burgers, Coffee & Tea, Fast Food, Restaurants, Food')
('An2JUt207oyqhI4mkTZGWA', 'Dentopia Dental', 'Tucson', Decimal('5'), 6, 'Cosmetic Dentists, Health & Medical, Pediatric Dentists, Dentists, General Dentistry')
('GQiYkSH68NUKTKmU-c3Xjw', "Wendy's", 'Mooresville', Decimal('2'), 11, 'Restaurants, Fast Food, Burgers')
('XxgEVMFOc7nlZkZ6BYCtuQ', 'National Property Inspections', 'Blue Bell', Decimal('4.5'), 6, 'Home Services, Home Inspectors')
('o-qSKPjoMhQZVsvn9wtDlQ', 'Daylight Donuts', 'Boise', Decimal('3.5'), 23, 'Donuts, Food')
('zSDqW5cZq9Bg7O9vMpJ6Tg', 'Casa Blanca Cuban Grill', 'Boise', Decimal('4.5'), 331, 'Cuban, Food, Latin American, Restaurants, Spanish, Caribbean, Food Trucks')
('_lxqF9oFp_g07IHfq-Z7dQ', 'Quality Inn Montgomeryville-Philadelph