# Pan

## (1) Database Creation

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import explode, split, col, concat_ws, count
import pyspark.sql.functions as F
import os
import shutil

In [2]:
ss = SparkSession.builder \
    .appName("Pan") \
    .config("spark.driver.bindAddress", "localhost") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/16 12:38:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read data from Yelp business JSON
businesses_df = ss.read.json("../yelp_dataset/yelp_academic_dataset_business.json")

# Create a temporary view
businesses_df.createOrReplaceTempView("businesses")

24/04/16 12:39:09 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [4]:
# Read data from Yelp photos metadata JSON
photos_meta = ss.read.json('../yelp_photos/photos.json')

# Create a temporary view
photos_meta.createOrReplaceTempView("photos_metadata")

                                                                                

In [5]:
# Read data from Yelp review JSON
reviews_df = ss.read.json("../yelp_dataset/yelp_academic_dataset_review.json")

# Create a temporary view
reviews_df.createOrReplaceTempView("reviews")

                                                                                

We will subset the data to just restaurants and bars in Philadelphia, the city with highest count of businesses.

In [6]:
query = """
        SELECT *
        FROM businesses
        WHERE city = 'Philadelphia' 
         AND (categories LIKE '%Restaurant%' OR categories LIKE '%Bar%') AND categories NOT LIKE '%Barbers%'
        """

philly_restaurant_bar_df = ss.sql(query)

# Create a temporary view
philly_restaurant_bar_df.createOrReplaceTempView("philly_restaurants_bars")

Let's get just restaurants and bars in Philadelphia with at least one photo.

In [7]:
query = """
        SELECT business_id, count(photo_id) AS photos_count
        FROM photos_metadata
        GROUP BY business_id HAVING count(photo_id) > 1
        ORDER BY photos_count DESC
        """

df2 = ss.sql(query)

df2.createOrReplaceTempView("temp_df_photos")

In [11]:
query = """
        SELECT businesss.*, photo.photos_count
        FROM philly_restaurants_bars AS businesss
        JOIN temp_df_photos AS photo 
         ON businesss.business_id = photo.business_id
        """

philly_restaurants_bars_1_photo = ss.sql(query)
philly_restaurants_bars_1_photo.count()

philly_restaurants_bars_1_photo.createOrReplaceTempView("philly_restaurants_bars_1_photo")

                                                                                

Now to take a look at the nested fields inside 'attributes'.

In [13]:
# Extract Ambience field

query = """
        SELECT *,
               get_json_object(replace(replace(replace(attributes.Ambience, "'", '"'), 'False', 'false'), 'True', 'true'), '$.touristy') AS Touristy,
               get_json_object(replace(replace(replace(attributes.Ambience, "'", '"'), 'False', 'false'), 'True', 'true'), '$.hipster') AS Hipster,
               get_json_object(replace(replace(replace(attributes.Ambience, "'", '"'), 'False', 'false'), 'True', 'true'), '$.romantic') AS Romantic,
               get_json_object(replace(replace(replace(attributes.Ambience, "'", '"'), 'False', 'false'), 'True', 'true'), '$.divey') AS Divey,
               get_json_object(replace(replace(replace(attributes.Ambience, "'", '"'), 'False', 'false'), 'True', 'true'), '$.intimate') AS Intimate,
               get_json_object(replace(replace(replace(attributes.Ambience, "'", '"'), 'False', 'false'), 'True', 'true'), '$.trendy') AS Trendy,
               get_json_object(replace(replace(replace(attributes.Ambience, "'", '"'), 'False', 'false'), 'True', 'true'), '$.upscale') AS Upscale,
               get_json_object(replace(replace(replace(attributes.Ambience, "'", '"'), 'False', 'false'), 'True', 'true'), '$.classy') AS Classy,
               get_json_object(replace(replace(replace(attributes.Ambience, "'", '"'), 'False', 'false'), 'True', 'true'), '$.casual') AS Casual
        FROM philly_restaurants_bars_1_photo
        """

ambience_expanded = ss.sql(query)

ambience_expanded.createOrReplaceTempView("ambience_expanded")

In [14]:
# Look into Good4Meal field

query = """
        SELECT 
        get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.lunch') AS Good4Lunch,
        get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.dinner') AS Good4Dinner,
        get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.brunch') AS Good4Brunch,
        get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.breakfast') AS Good4Breakfast
        FROM ambience_expanded
        """

ss.sql(query).show(5,truncate=False)

                                                                                

+----------+-----------+-----------+--------------+
|Good4Lunch|Good4Dinner|Good4Brunch|Good4Breakfast|
+----------+-----------+-----------+--------------+
|null      |null       |null       |null          |
|null      |null       |null       |null          |
|true      |false      |true       |true          |
|true      |true       |false      |false         |
|false     |false      |false      |false         |
+----------+-----------+-----------+--------------+
only showing top 5 rows



Let's add reviews in reviews data. We want to extract number of reviews containing text like 'romantic', 'good for kids', 'rooftop', 'skyline', etc. to improve our suggestions for each vibe category.

In [15]:
query = """
        SELECT *
        FROM reviews
        WHERE business_id IN (SELECT business_id FROM philly_restaurants_bars_1_photo)
        """

reviews_philly_1_photo = ss.sql(query)

In [16]:
query = """
        SELECT ambience_expanded.*, sub.romantic_reviews
        FROM (
            SELECT business_id, count(*) AS romantic_reviews
            FROM reviews
            WHERE business_id IN (SELECT business_id FROM philly_restaurants_bars_1_photo)
            AND text LIKE '%romantic%'
            GROUP BY business_id             
            ) AS sub
        RIGHT JOIN ambience_expanded ON sub.business_id = ambience_expanded.business_id
        """

romantic = ss.sql(query)

romantic.createOrReplaceTempView('added_romantic')

In [17]:
query = """
        SELECT added_romantic.*, sub.date_night_reviews
        FROM (
            SELECT business_id, count(*) AS date_night_reviews
            FROM reviews
            WHERE business_id IN (SELECT business_id FROM philly_restaurants_bars_1_photo)
            AND text LIKE '%date night%'
            GROUP BY business_id             
            ) AS sub
        RIGHT JOIN added_romantic ON sub.business_id = added_romantic.business_id
        """

date_night = ss.sql(query)
date_night.createOrReplaceTempView('added_date_night')

In [18]:
query = """
        SELECT added_date_night.*, sub.family_friendly_reviews
        FROM (
            SELECT business_id, count(*) AS family_friendly_reviews
            FROM reviews
            WHERE business_id IN (SELECT business_id FROM philly_restaurants_bars_1_photo)
            AND (text LIKE '%good for kids%' OR text LIKE '%family friendly%' OR text LIKE '%kids menu%')
            GROUP BY business_id             
            ) AS sub
        RIGHT JOIN added_date_night ON sub.business_id = added_date_night.business_id
        ORDER BY sub.family_friendly_reviews DESC
        """

family_friendly = ss.sql(query)
family_friendly.createOrReplaceTempView('added_family_friendly')

In [19]:
query = """
        SELECT added_family_friendly.*, sub.rooftop_skyline_reviews
        FROM (
            SELECT business_id, count(*) AS rooftop_skyline_reviews
            FROM reviews
            WHERE business_id IN (SELECT business_id FROM philly_restaurants_bars_1_photo)
            AND (text LIKE '%rooftop%' OR text LIKE '%skyline%' OR text LIKE '%Rooftop%' OR text LIKE '%Skyline%')
            GROUP BY business_id             
            ) AS sub
        RIGHT JOIN added_family_friendly ON sub.business_id = added_family_friendly.business_id
        ORDER BY sub.rooftop_skyline_reviews DESC
        """

rooftop_skyline = ss.sql(query)
rooftop_skyline.createOrReplaceTempView('added_rooftop_skyline')

In [20]:
rooftop_skyline.write.saveAsTable('philly_df_extracted_keywords')

                                                                                

## (2) Filtering Database for Vibe Matches

**Vibes**:
- Coworking Cafe
- Brunch
- Green
- Local Delicacies
- Romantic/Date Night
- Upscale/Special Occassion
- Family-Friendly
- Rooftop
- Budget

Want to integrate zero-shot-classification on the image data:
- "productive cafe with people working" (Coworking Cafe)
- "healthy green food" (Green)
- "formal dinner" (Upscale/Special Occassion)
- "rooftop views" (Rooftop)


In [146]:
# Coworking Cafe
# want to add zero-shot-score (eg "productive cafe with people working")

query = """
        SELECT name, business_id, stars
        FROM philly_df_extracted_keywords
        WHERE Casual = 'true' AND attributes.WiFi LIKE '%free%'
         AND (categories LIKE '%Cafes%' OR categories LIKE '%Coffee%')
         AND get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.dinner') = 'false'
        """

coworking_df = ss.sql(query).orderBy(F.desc('stars'), F.desc('review_count'))
print("Coworking")
coworking_df.show(10, truncate=False)
coworking_df.count()

Coworking
+------------------------+----------------------+-----+
|name                    |business_id           |stars|
+------------------------+----------------------+-----+
|Cafe La Maude           |K7KHmHzxNwzqiijSJeKe_A|4.5  |
|Miles Table             |lWedWkinrM5j13pyimbpbA|4.5  |
|Middle Child            |OAWa1WML2V1ZLJGD6V3nBQ|4.5  |
|Café y Chocolate        |qaDImxPguQz0jToNYvB1Eg|4.5  |
|Talula's Daily          |EreYgrQPuR7Sk_FKeZZg9g|4.5  |
|Grindcore House         |9A5Gw0At6so0x-vWM0_JZw|4.5  |
|Nook Bakery & Coffee Bar|e4MoozYGqe_rb4_ZC1rYMQ|4.5  |
|Soy Cafe                |MM_v-KIUJiXLCxnkIqPKWA|4.5  |
|Hinge Cafe              |zjTBfbvbN2Ps6_Ar0w-fuQ|4.5  |
|So Crepe                |Z5b1Me1YBZGQAxIx62vmqg|4.5  |
+------------------------+----------------------+-----+
only showing top 10 rows



73

In [147]:
# Brunch

query = """
        SELECT name, business_id, stars
        FROM philly_df_extracted_keywords
        WHERE Trendy = 'true'
         AND (categories LIKE '%Brunch%' OR get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.brunch') = 'true')
        """

brunch_df = ss.sql(query).orderBy(F.desc('stars'), F.desc('review_count'))
print("Brunch")
brunch_df.show(10, truncate=False)
brunch_df.count()

Brunch
+--------------------+----------------------+-----+
|name                |business_id           |stars|
+--------------------+----------------------+-----+
|Suraya              |vUrTGX_7HxqeoQ_6QCVz6g|4.5  |
|V Street            |E_h2yNoagLK-3ODYwMPErw|4.5  |
|La Colombe Coffee   |MlXH9dWYDFEBbwuGStlfZg|4.5  |
|Bar Hygge           |Mwc3n5Psw9wRaQ22vZWDYQ|4.5  |
|Talula's Daily      |EreYgrQPuR7Sk_FKeZZg9g|4.5  |
|Knead Bagels        |wocwfwSFrNfYJRyfYSi1Cw|4.5  |
|The Bakeshop on 20th|zujdPV3HT-Y-CKE1GgkMHQ|4.5  |
|Stockyard Sandwich  |B6Cn6maWQ6sLhLwYGLhSlg|4.5  |
|The Rooster         |-sTrihdzACrsOSu1FYdfxQ|4.5  |
|Girard Bruncherie   |VbGGpt-Q5ZeMxSFPDbPeBg|4.5  |
+--------------------+----------------------+-----+
only showing top 10 rows



52

In [148]:
# Green
# want to add zero-shot-score (eg "healthy green food")

query = """
        SELECT name, business_id, stars
        FROM philly_df_extracted_keywords
        WHERE categories LIKE '%Vegetarian%' AND categories LIKE '%Vegan%'
         AND get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.lunch') = 'true'
        """

green_df = ss.sql(query).orderBy(F.desc('stars'), F.desc('review_count'))
print("Green")
green_df.show(10, truncate=False)
green_df.count()

Green
+---------------------------+----------------------+-----+
|name                       |business_id           |stars|
+---------------------------+----------------------+-----+
|Grindcore House            |9A5Gw0At6so0x-vWM0_JZw|4.5  |
|Soy Cafe                   |MM_v-KIUJiXLCxnkIqPKWA|4.5  |
|Hummusology                |-wwhD6SwDOJZlbZkHlIchQ|4.5  |
|HipCityVeg                 |gczLFZVmpfsBB3uUHEbWiA|4.5  |
|PlantPure Cafe             |rQKL-bO8Xup4gCPcj_oz6w|4.5  |
|Jerry's Kitchen            |CyZrPCmQqMbXdD6SX02f6w|4.5  |
|New Delhi Indian Restaurant|YN4Kk751tmdvoarGo8z7_A|4.0  |
|P S & Co                   |phsLOuBeiYI43hglvWK96Q|4.0  |
|Hiro Ramen House           |_34KJPR-T0HP9USJw1nCfw|4.0  |
|Su Xing House              |PVVFos1LDfD7iETY0w4vaA|4.0  |
+---------------------------+----------------------+-----+
only showing top 10 rows



21

In [149]:
# Local Delicacies

query = """
        SELECT name, business_id, stars
        FROM philly_df_extracted_keywords
        WHERE (Touristy = 'true' OR categories LIKE '%Cheesesteak%' OR name LIKE '%Philadelphia%')
         AND attributes.GoodForMeal != 'null'
        ORDER BY stars DESC, review_count DESC
        """

local_delicacies_df = ss.sql(query).orderBy(F.desc('stars'), F.desc('review_count'))
print("Local Delicacies")
local_delicacies_df.show(10, truncate=False)
local_delicacies_df.count()

Local Delicacies
+------------------------+----------------------+-----+
|name                    |business_id           |stars|
+------------------------+----------------------+-----+
|Reading Terminal Market |ytynqOUb3hjKeJfRj5Tshw|4.5  |
|John's Roast Pork       |LM54ufrINJWoTN5imV8Etw|4.5  |
|Saad's Halal Restaurant |6_LnAQQ0-mml8YgpfRjGuA|4.5  |
|Oh Brother Philly       |7pAgxBMUjrVPH7xh3fn-gw|4.5  |
|Joe's Steaks + Soda Shop|kkcQYuF3w5iHnHMf0EnRhQ|4.5  |
|Woodrow's Sandwich Shop |Jn4tRtjIuz6MBCykQySpeg|4.5  |
|SPOT Gourmet Burgers    |Gw7UW0E2BguzL9suQnwDeg|4.5  |
|Gooey Looie's           |9ggTidLF9LPNdyWdCGtrYg|4.5  |
|McNally's Tavern        |ZPFdJgzPjEUM6BoFfoV-ZA|4.5  |
|The Original Turkey     |lQTdARrqdKMAcDp1PWpQ9A|4.5  |
+------------------------+----------------------+-----+
only showing top 10 rows



99

In [150]:
# Romantic/Date Night

query = """
        SELECT name, business_id, stars
        FROM philly_df_extracted_keywords
        WHERE (attributes.RestaurantsPriceRange2 = 2 OR attributes.RestaurantsPriceRange2 = 3)
         AND (Romantic = 'true' OR Trendy = 'true')
        """

romantic_date_night_df = ss.sql(query).orderBy(F.desc('stars'), F.desc('review_count'), F.desc('romantic_reviews'))
print("Date Night")
romantic_date_night_df.show(10, truncate=False)
romantic_date_night_df.count()

Date Night
+---------------------+----------------------+-----+
|name                 |business_id           |stars|
+---------------------+----------------------+-----+
|Talula's Garden      |i_FWONQD1ZBqrNE2b-M5Ug|4.5  |
|Double Knot          |Ipkx4Sa7ybn8C6LtTqTztw|4.5  |
|Suraya               |vUrTGX_7HxqeoQ_6QCVz6g|4.5  |
|Fat Salmon           |h7TO_IsmLCYmKKDVOOIeFw|4.5  |
|Tria Cafe Rittenhouse|eJaeTZlIdM3HWCq__Ve4Wg|4.5  |
|Tria Cafe Wash West  |Q-prSTdggNlxAEFV88BZOw|4.5  |
|V Street             |E_h2yNoagLK-3ODYwMPErw|4.5  |
|Dizengoff            |JVDHxMnKjif8XdXVFWiClg|4.5  |
|Abe Fisher           |vhDWGF-8BfsxvS7Zo5Wv2w|4.5  |
|Bistrot La Minette   |4_-IcMpkF_sBRHomWZHNzA|4.5  |
+---------------------+----------------------+-----+
only showing top 10 rows



197

In [152]:
# Upscale/Special Occasion
# want to add zero-shot-score (eg "formal dinner")

query = """
        SELECT name, business_id, stars
        FROM philly_df_extracted_keywords
        WHERE attributes.RestaurantsPriceRange2 = 4
         AND (Classy = 'true' OR Upscale = 'true' OR Casual = 'false')
         AND attributes.RestaurantsReservations = 'True'
        """

special_occassion_df = ss.sql(query).orderBy(F.desc('stars'), F.desc('review_count'))
print("Special Occassion")
special_occassion_df.show(10, truncate=False)
special_occassion_df.count()

Special Occassion
+-------------------------------------+----------------------+-----+
|name                                 |business_id           |stars|
+-------------------------------------+----------------------+-----+
|Zahav                                |ctHjyadbDQAtUFfkcAFEHw|4.5  |
|Morimoto                             |6_T2xzR74JqGCTPefAD8Tw|4.5  |
|Butcher and Singer                   |0oSSjekU-3GR8gselReWnA|4.5  |
|Barclay Prime                        |wbDRmtxaKRpBOjutvV6TEA|4.5  |
|Vetri Cucina                         |wUnLSg_GKfEIQ5CQQ770_g|4.5  |
|Lacroix Restaurant at The Rittenhouse|e3Y3hDpwHc9RmQlJtIgHuw|4.5  |
|Laurel                               |fEqiXG_B-fn__w0aeF3nBQ|4.5  |
|Friday Saturday Sunday               |C9UylQTOh7uwZo4a7QhLyg|4.5  |
|Bar 210                              |N2j1caPRBMk34IEKqtGLSA|4.5  |
|Fogo de Chao                         |cGX-1IUwXOjkUqZbkKYcjw|4.0  |
+-------------------------------------+----------------------+-----+
only showing top

27

In [153]:
# Family-Friendly

query = """
        SELECT name, business_id, stars
        FROM philly_df_extracted_keywords
        WHERE (attributes.RestaurantsPriceRange2 = 1 OR attributes.RestaurantsPriceRange2 = 2)
         AND Casual = 'true'
         AND attributes.GoodForKids = 'True'
         AND categories NOT LIKE '%Nightlife%' 
         AND attributes.NoiseLevel LIKE '%average%'
        """

family_friendly_df = ss.sql(query).orderBy(F.desc('family_friendly_reviews'), F.desc('stars'), F.desc('review_count'))
print("Family Friendly")
family_friendly_df.show(10, truncate=False)
family_friendly_df.count()

Family Friendly
+------------------------------+----------------------+-----+
|name                          |business_id           |stars|
+------------------------------+----------------------+-----+
|Jones                         |d_tRshM-w6S4QxE4VVi8tQ|3.5  |
|Green Eggs Café               |0RuvlgTnKFbX3IK0ZOOocA|4.0  |
|Pizzeria Stella               |vF_QwGltBpVesMMGclzB9Q|4.0  |
|Miles Table                   |lWedWkinrM5j13pyimbpbA|4.5  |
|Trolley Car Diner             |kK1re49IK3jmCYTL6B6_lw|3.5  |
|Sabrina's Café                |iUZEGx29miZObLd6_lt7Vg|4.0  |
|Percy Street Barbecue         |qsAZNQ-6P4I9r7onkBZJ_g|4.0  |
|Iron Hill Brewery & Restaurant|ohWH383r60AREWh-aphpSw|3.5  |
|Nomad Pizza Company           |M_EpyAH1CZZVlhxfYBLOqg|4.5  |
|Bar Hygge                     |Mwc3n5Psw9wRaQ22vZWDYQ|4.5  |
+------------------------------+----------------------+-----+
only showing top 10 rows



398

In [154]:
# Rooftop
# want to add zero-shot-score (eg "rooftop views")

query = """
        SELECT name, business_id, stars
        FROM philly_df_extracted_keywords
        WHERE rooftop_skyline_reviews > 20
        """

rooftop_df =  ss.sql(query).orderBy(F.desc('stars') , F.desc('review_count'),  F.desc('rooftop_skyline_reviews'))
print("Rooftop")
rooftop_df.show(10, truncate=False)
rooftop_df.count()

Rooftop
+------------------------+----------------------+-----+
|name                    |business_id           |stars|
+------------------------+----------------------+-----+
|Bok Bar                 |TdrNka-oPWf4B-jN9rKltw|4.0  |
|The Continental Mid-town|WxB8498ejPtHE7wFa89_fA|3.5  |
|R2L                     |9gObo5ltOMo6UgsaXaHPWA|3.5  |
|City Tap House          |LwX2vbzttWYAdoBoVtfvRg|3.5  |
|XIX Nineteen            |17MK8qagV374AuUA4sXuIA|3.5  |
|Revolution House        |ldr7iDtxFXX-q7tJuXqlGQ|3.5  |
|Assembly                |MmDwOS74IeHja_v4jzfAgQ|3.5  |
|The Corner              |nWpIXaVS8VuAsHZYID4Lww|3.5  |
|Stratus                 |79PsTPa-1DBLKhNytfAkYQ|3.0  |
|Attico Rooftop Lounge   |6c8eIawbCBu3FLp2kDraBA|3.0  |
+------------------------+----------------------+-----+



10

In [134]:
# Budget

query = """
        SELECT name, business_id, stars
        FROM ambience_expanded
        WHERE categories NOT LIKE '%Fast Food%'
         AND attributes.RestaurantsPriceRange2 = 1
         AND categories NOT LIKE '%Bakeries'
         AND attributes.RestaurantsTableService = 'True'
         AND (get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.lunch') = 'true' 
         OR get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.dinner') = 'true')
        """

budget_df = ss.sql(query).orderBy(F.desc('stars'), F.desc('review_count'))
budget_df.show(10, truncate=False)
budget_df.count()

                                                                                

+--------------------------------------+----------------------+-----+
|name                                  |business_id           |stars|
+--------------------------------------+----------------------+-----+
|Mom Mom's Kitchen and Polish Food Cart|RVLF2RaStLkJiQCqBHknDw|5.0  |
|Smiley's Cafe                         |gCcJTKC40CL0XtYYGfcXDw|4.5  |
|BAP                                   |ROeacJQwBeh05Rqg7F6TCg|4.5  |
|Thang Long Pho Restaurant             |M6ap9LXMEZmHf6NhlNpDQA|4.5  |
|Tamalex Restaurant                    |6CpCnFzpyOV4ymhNP1AOGw|4.5  |
|Malelani  Cafe                        |pmnaMretlgmAA96i8Z7NFw|4.5  |
|Street Side                           |6OC8ErU_m06KABnfSXTJEA|4.5  |
|Foo Kitchen                           |efNrVvbNw2stIq2PvCDV6g|4.5  |
|Hidden Gem Cafe                       |Sp-LM7L_6M-MPo5NCHw8Og|4.5  |
|Cafe Nhan                             |6PO3OlixvoZMJBlcBW_iRw|4.5  |
+--------------------------------------+----------------------+-----+
only showing top 10 

                                                                                

33

## (3) Collecting business and images for Pan website

Join each categories DF of top 10 businesses with photo metadata.


In [101]:
# coworking_df = coworking_df.limit(10)
# coworking_df.createOrReplaceTempView('coworking_df')


query = """
        SELECT coworking_df.business_id , array_agg(photos_metadata.photo_id) AS photo_ids
        FROM coworking_df JOIN photos_metadata
         ON coworking_df.business_id = photos_metadata.business_id
        GROUP BY coworking_df.business_id
        """

coworking_ids = ss.sql(query)
coworking_ids.show(10)

[Stage 356:>                                                        (0 + 7) / 7]

+--------------------+--------------------+
|         business_id|           photo_ids|
+--------------------+--------------------+
|EreYgrQPuR7Sk_FKe...|[PyulJEnHzdsH44pZ...|
|qaDImxPguQz0jToNY...|[ao2DyscqroV7X6Lu...|
|OAWa1WML2V1ZLJGD6...|[K5r-Ej8kYsQcRQuP...|
|K7KHmHzxNwzqiijSJ...|[fNciZ4vMEzxFFzVs...|
|e4MoozYGqe_rb4_ZC...|[YEk_gCd037WJyEhG...|
|9A5Gw0At6so0x-vWM...|[FT5aBSWt8QdYTFyL...|
|MM_v-KIUJiXLCxnkI...|[VpWKpItoDhkiDt6z...|
|lWedWkinrM5j13pyi...|[tHCApa_uvRkKZ61C...|
|Z5b1Me1YBZGQAxIx6...|[UfJTLq-1VPDD5RgP...|
|zjTBfbvbN2Ps6_Ar0...|[i6WU1B8v97HLzOHT...|
+--------------------+--------------------+



                                                                                

In [102]:
# brunch_df = brunch_df.limit(10)
# brunch_df.createOrReplaceTempView('brunch_df')


query = """
        SELECT brunch_df.business_id , array_agg(photos_metadata.photo_id) AS photo_ids
        FROM brunch_df JOIN photos_metadata
         ON brunch_df.business_id = photos_metadata.business_id
        GROUP BY brunch_df.business_id
        """

brunch_ids = ss.sql(query)
brunch_ids.show(10)

+--------------------+--------------------+
|         business_id|           photo_ids|
+--------------------+--------------------+
|EreYgrQPuR7Sk_FKe...|[PyulJEnHzdsH44pZ...|
|Mwc3n5Psw9wRaQ22v...|[QOWO8hYS1Nx_slO2...|
|B6Cn6maWQ6sLhLwYG...|[rbxhySfO2WzlADtz...|
|vUrTGX_7HxqeoQ_6Q...|[vQ4f78kbIvE52xZB...|
|MlXH9dWYDFEBbwuGS...|[Bc5bUdR4e3qnu69F...|
|VbGGpt-Q5ZeMxSFPD...|[XLkQt6U1loL820L2...|
|E_h2yNoagLK-3ODYw...|[RQXrLFvGmkhVPVF-...|
|wocwfwSFrNfYJRyfY...|[-YsQj6HOEUNDT0R6...|
|-sTrihdzACrsOSu1F...|[HIDivevBlu2KTM2C...|
|zujdPV3HT-Y-CKE1G...|[ONkMFsSKARJk9VXw...|
+--------------------+--------------------+



In [103]:
# green_df = green_df.limit(10)
# green_df.createOrReplaceTempView('green_df')


query = """
        SELECT green_df.business_id , array_agg(photos_metadata.photo_id) AS photo_ids
        FROM green_df JOIN photos_metadata
         ON green_df.business_id = photos_metadata.business_id
        GROUP BY green_df.business_id
        """

green_ids = ss.sql(query)
green_ids.show(10)

+--------------------+--------------------+
|         business_id|           photo_ids|
+--------------------+--------------------+
|_34KJPR-T0HP9USJw...|[RmWkNrCHcbZRoKYn...|
|rQKL-bO8Xup4gCPcj...|[BIEGa9dKZa2ykMJl...|
|CyZrPCmQqMbXdD6SX...|[qpns5FRzRx-XJ0V1...|
|YN4Kk751tmdvoarGo...|[DED4hIeT8RjvLf_O...|
|PVVFos1LDfD7iETY0...|[2lzMcPNlwEoyY4m_...|
|9A5Gw0At6so0x-vWM...|[FT5aBSWt8QdYTFyL...|
|MM_v-KIUJiXLCxnkI...|[VpWKpItoDhkiDt6z...|
|phsLOuBeiYI43hglv...|[CnLqQ5v22NTPOnA4...|
|-wwhD6SwDOJZlbZkH...|[oVOnX6A09-X8XFhD...|
|gczLFZVmpfsBB3uUH...|[oWtRXxOQkF92ZJqO...|
+--------------------+--------------------+



In [104]:
# local_delicacies_df = local_delicacies_df.limit(10)
# local_delicacies_df.createOrReplaceTempView('local_delicacies_df')


query = """
        SELECT local_delicacies_df.business_id , array_agg(photos_metadata.photo_id) AS photo_ids
        FROM local_delicacies_df JOIN photos_metadata
         ON local_delicacies_df.business_id = photos_metadata.business_id
        GROUP BY local_delicacies_df.business_id
        """

local_delicacies_ids = ss.sql(query)
local_delicacies_ids.show(10)

+--------------------+--------------------+
|         business_id|           photo_ids|
+--------------------+--------------------+
|LM54ufrINJWoTN5im...|[ItCnim2wIw_DKzIW...|
|ytynqOUb3hjKeJfRj...|[HUCU2vGCZ3nnOWdE...|
|lQTdARrqdKMAcDp1P...|[_K2ppPMOmvGsq3kH...|
|6_LnAQQ0-mml8Ygpf...|[wCRelDbredzJYAMr...|
|9ggTidLF9LPNdyWdC...|[HM7s4kpQ39fOFDpw...|
|Jn4tRtjIuz6MBCykQ...|[wbGuDT7E7QYYK4Cd...|
|Gw7UW0E2BguzL9suQ...|[YlY0qxoCQAgdVwEf...|
|7pAgxBMUjrVPH7xh3...|[JsFdtQ87i86Z4NDi...|
|kkcQYuF3w5iHnHMf0...|[y9JjcJVI-Q-OxQOZ...|
|ZPFdJgzPjEUM6BoFf...|[VPRVuRQCw68Efqoo...|
+--------------------+--------------------+



In [105]:
# romantic_date_night_df = romantic_date_night_df.limit(10)
# romantic_date_night_df.createOrReplaceTempView('romantic_date_night_df')


query = """
        SELECT romantic_date_night_df.business_id , array_agg(photos_metadata.photo_id) AS photo_ids
        FROM romantic_date_night_df JOIN photos_metadata
         ON romantic_date_night_df.business_id = photos_metadata.business_id
        GROUP BY romantic_date_night_df.business_id
        """

romantic_date_night_ids = ss.sql(query)
romantic_date_night_ids.show(10)

+--------------------+--------------------+
|         business_id|           photo_ids|
+--------------------+--------------------+
|JVDHxMnKjif8XdXVF...|[9awbs92QFoFBIipQ...|
|Ipkx4Sa7ybn8C6LtT...|[4RPBqXaA9xCC7cVT...|
|i_FWONQD1ZBqrNE2b...|[diaexIdeMq9YjEQK...|
|vUrTGX_7HxqeoQ_6Q...|[vQ4f78kbIvE52xZB...|
|vhDWGF-8BfsxvS7Zo...|[VX3-SSlTz4ov9elL...|
|h7TO_IsmLCYmKKDVO...|[PV2UEdZYHyY86NBe...|
|E_h2yNoagLK-3ODYw...|[RQXrLFvGmkhVPVF-...|
|4_-IcMpkF_sBRHomW...|[W-GkD1LEWUEUzcKu...|
|eJaeTZlIdM3HWCq__...|[vsyNw_KISRh4A7hH...|
|Q-prSTdggNlxAEFV8...|[s4qiZrCuvSVc-fwM...|
+--------------------+--------------------+



In [106]:
# special_occassion_df = special_occassion_df.limit(10)
# special_occassion_df.createOrReplaceTempView('special_occassion_df')


query = """
        SELECT special_occassion_df.business_id , array_agg(photos_metadata.photo_id) AS photo_ids
        FROM special_occassion_df JOIN photos_metadata
         ON special_occassion_df.business_id = photos_metadata.business_id
        GROUP BY special_occassion_df.business_id
        """

special_occassion_ids = ss.sql(query)
special_occassion_ids.show(10)

+--------------------+--------------------+
|         business_id|           photo_ids|
+--------------------+--------------------+
|0oSSjekU-3GR8gsel...|[WMvEQIfbdPVEQMBK...|
|wUnLSg_GKfEIQ5CQQ...|[hfUUbS6LhBD81nYB...|
|C9UylQTOh7uwZo4a7...|[4bBwLVRcVWHswAIm...|
|wbDRmtxaKRpBOjutv...|[0i0bHPSyskK1kgIp...|
|ctHjyadbDQAtUFfkc...|[ARm3Lm-thBgN2mNQ...|
|e3Y3hDpwHc9RmQlJt...|[zlfkIIv-Al-wYoqH...|
|N2j1caPRBMk34IEKq...|[7f6-PViffqk8gVMV...|
|cGX-1IUwXOjkUqZbk...|[r5kO6Q9zxuIg60Oe...|
|6_T2xzR74JqGCTPef...|[FGeQf8E6YGHDml-I...|
|fEqiXG_B-fn__w0ae...|[GuZ23QCoj49nfOIZ...|
+--------------------+--------------------+



In [107]:
# family_friendly_df = family_friendly_df.limit(10)
# family_friendly_df.createOrReplaceTempView('family_friendly_df')


query = """
        SELECT family_friendly_df.business_id , array_agg(photos_metadata.photo_id) AS photo_ids
        FROM family_friendly_df JOIN photos_metadata
         ON family_friendly_df.business_id = photos_metadata.business_id
        GROUP BY family_friendly_df.business_id
        """

family_friendly_ids = ss.sql(query)
family_friendly_ids.show(10)

+--------------------+--------------------+
|         business_id|           photo_ids|
+--------------------+--------------------+
|Mwc3n5Psw9wRaQ22v...|[QOWO8hYS1Nx_slO2...|
|ohWH383r60AREWh-a...|[0RgwLP0d-nIdVco7...|
|iUZEGx29miZObLd6_...|[b6dtXSB0E0-XlKqI...|
|0RuvlgTnKFbX3IK0Z...|[cpcbdCLIYJx5a1_T...|
|vF_QwGltBpVesMMGc...|[Ac-U9RamzA6oJEie...|
|kK1re49IK3jmCYTL6...|[TPoeVM5voXOGSi37...|
|d_tRshM-w6S4QxE4V...|[SvxXFyjOFZTeYKP7...|
|lWedWkinrM5j13pyi...|[tHCApa_uvRkKZ61C...|
|M_EpyAH1CZZVlhxfY...|[7CLP-SQo5xNVr_Xt...|
|qsAZNQ-6P4I9r7onk...|[_bWxRRrJBVtszO0d...|
+--------------------+--------------------+



In [142]:
# rooftop_df = rooftop_df.limit(10)
# rooftop_df.createOrReplaceTempView('rooftop_df')


query = """
        SELECT rooftop_df.business_id , array_agg(photos_metadata.photo_id) AS photo_ids
        FROM rooftop_df JOIN photos_metadata
         ON rooftop_df.business_id = photos_metadata.business_id
        GROUP BY rooftop_df.business_id
        """

rooftop_ids = ss.sql(query)
rooftop_ids.show(10)

+--------------------+--------------------+
|         business_id|           photo_ids|
+--------------------+--------------------+
|17MK8qagV374AuUA4...|[1zzFlwwy3jihXJ6N...|
|TdrNka-oPWf4B-jN9...|[jLWj48yy79IWZ-4G...|
|LwX2vbzttWYAdoBoV...|[PJ38A9seHdxDit5b...|
|WxB8498ejPtHE7wFa...|[HiMe9Q4qFTkrgvf5...|
|79PsTPa-1DBLKhNyt...|[kGlFe1bD2cM-iCIe...|
|ldr7iDtxFXX-q7tJu...|[iCJOmiFtkosVdm2G...|
|6c8eIawbCBu3FLp2k...|[oHV7MV5lWZUi9XjF...|
|MmDwOS74IeHja_v4j...|[eGfa6OdkP61gdrEg...|
|9gObo5ltOMo6UgsaX...|[q-biT65t7smZfbTo...|
|nWpIXaVS8VuAsHZYI...|[_3CaB7zaky7Xdi6f...|
+--------------------+--------------------+



In [109]:
# budget_df = budget_df.limit(10)
# budget_df.createOrReplaceTempView('budget_df')


query = """
        SELECT budget_df.business_id, array_agg(photos_metadata.photo_id) AS photo_ids
        FROM budget_df JOIN photos_metadata
         ON budget_df.business_id = photos_metadata.business_id
        GROUP BY budget_df.business_id
        """

budget_ids = ss.sql(query)
budget_ids.show(10)

                                                                                

+--------------------+--------------------+
|         business_id|           photo_ids|
+--------------------+--------------------+
|Sp-LM7L_6M-MPo5NC...|[1aZ1Ae0vmsjgvSIM...|
|pmnaMretlgmAA96i8...|[tSteZVAbIh0XuUx2...|
|gCcJTKC40CL0XtYYG...|[uOzls7U5PMfNgjPa...|
|6OC8ErU_m06KABnfS...|[iLYb6yqZAaI26EgB...|
|RVLF2RaStLkJiQCqB...|[EyTE26BdtW2PcWuK...|
|6PO3OlixvoZMJBlcB...|[popKUQQpZoqWkirh...|
|efNrVvbNw2stIq2Pv...|[vV8kgBtENih7awZH...|
|M6ap9LXMEZmHf6Nhl...|[pAK1F-3ROp57M0xf...|
|6CpCnFzpyOV4ymhNP...|[6TJ35pqkjmeNsI4p...|
|ROeacJQwBeh05Rqg7...|[oewWFZoEf7AVYyWJ...|
+--------------------+--------------------+



Create directories to store photos for top 10 businesses in each categories.

In [93]:
vibes = ["coworking_cafe", "brunch", "green", "local_delicacies", 
         "date_night", "special_occassion", "family_friendly", "rooftop"]

category_dir = '/Users/scampione/MSDS/Spring_24_2/Entrepreneurship/categories/'

vibe_dirs = [f'{category_dir}{vibe}'  for vibe in vibes]

In [120]:
def create_top_10s_dir(vibe_df, vibe_dir, photo_base_dir):
    for row in vibe_df.collect():
        business_id = row['business_id']
        photo_ids = row['photo_ids']
        photo_files = [f'{id}.jpg' for id in photo_ids]
        
        # Create a new subdirectory for the business
        business_dir = os.path.join(vibe_dir, business_id)
        os.makedirs(business_dir, exist_ok=True)
        
        # Copy each photo to the new subdirectory
        for photo_file in photo_files:
            # Construct file paths
            source_path = os.path.join(photo_base_dir, photo_file)
            destination_path = os.path.join(business_dir, photo_file)
            
            # Check if the source file exists and then copy
            if os.path.exists(source_path):
                shutil.copy(source_path, destination_path)

In [123]:
vibe_ids = [coworking_ids, brunch_ids, green_ids, local_delicacies_ids, 
            romantic_date_night_ids, special_occassion_ids,
            family_friendly_ids, rooftop_ids]

photo_base_dir = '/Users/scampione/MSDS/Spring_24_2/Entrepreneurship/yelp_photos/photos'


In [None]:
# Try for one
coworking_dir = '/Users/scampione/MSDS/Spring_24_2/Entrepreneurship/categories/coworking_cafe'
create_top_10s_dir(coworking_ids, coworking_dir, photo_base_dir)

In [124]:
for vibe_ids, vibe_dir in zip(vibe_ids, vibe_dirs): 
    create_top_10s_dir(vibe_ids, vibe_dir, photo_base_dir)

                                                                                

Business data for top ten business in each cateogires.

In [173]:
vibes = ["coworking_cafe", "brunch", "green", "local_delicacies", 
         "date_night", "special_occassion", "family_friendly", "rooftop"]


vibe_dfs = [coworking_df, brunch_df, green_df, local_delicacies_df, 
            romantic_date_night_df, special_occassion_df,
            family_friendly_df, rooftop_df]

for i, vibe_df in enumerate(vibe_dfs):
    print(vibes[i])
    for row in vibe_df.limit(10).collect():
        print(f"{row['business_id']}:\t{row['name']},   {row['stars']}")
    print("\n")

coworking_cafe
K7KHmHzxNwzqiijSJeKe_A:	Cafe La Maude,   4.5
lWedWkinrM5j13pyimbpbA:	Miles Table,   4.5
OAWa1WML2V1ZLJGD6V3nBQ:	Middle Child,   4.5
qaDImxPguQz0jToNYvB1Eg:	Café y Chocolate,   4.5
EreYgrQPuR7Sk_FKeZZg9g:	Talula's Daily,   4.5
9A5Gw0At6so0x-vWM0_JZw:	Grindcore House,   4.5
e4MoozYGqe_rb4_ZC1rYMQ:	Nook Bakery & Coffee Bar,   4.5
MM_v-KIUJiXLCxnkIqPKWA:	Soy Cafe,   4.5
zjTBfbvbN2Ps6_Ar0w-fuQ:	Hinge Cafe,   4.5
Z5b1Me1YBZGQAxIx62vmqg:	So Crepe,   4.5


brunch
vUrTGX_7HxqeoQ_6QCVz6g:	Suraya,   4.5
E_h2yNoagLK-3ODYwMPErw:	V Street,   4.5
MlXH9dWYDFEBbwuGStlfZg:	La Colombe Coffee,   4.5
Mwc3n5Psw9wRaQ22vZWDYQ:	Bar Hygge,   4.5
EreYgrQPuR7Sk_FKeZZg9g:	Talula's Daily,   4.5
wocwfwSFrNfYJRyfYSi1Cw:	Knead Bagels,   4.5
zujdPV3HT-Y-CKE1GgkMHQ:	The Bakeshop on 20th,   4.5
B6Cn6maWQ6sLhLwYGLhSlg:	Stockyard Sandwich,   4.5
-sTrihdzACrsOSu1FYdfxQ:	The Rooster,   4.5
VbGGpt-Q5ZeMxSFPDbPeBg:	Girard Bruncherie,   4.5


green
9A5Gw0At6so0x-vWM0_JZw:	Grindcore House,   4.5
MM_v-KIUJiXLCxnkIq