# Analyzing Restaurant Vibes: *Zero-Shot Learning and PySpark*

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [None]:
# Create (or reuse) the Spark session shared by every cell below.
# The driver is bound to localhost via spark.driver.bindAddress.
ss = (
    SparkSession.builder
    .appName("YelpDataAnalysis")
    .config("spark.driver.bindAddress", "localhost")
    .getOrCreate()
)

In [None]:
# Define helper functions
def read_data(file_path, format='json', infer_schema='true', header='true'):
    """
    Load a dataset into a DataFrame through the module-level Spark session ``ss``.

    Args:
        file_path (str): Location of the dataset to load.
        format (str): Data source format handed to the reader (default 'json').
        infer_schema (str): Value for the reader's "inferSchema" option (default 'true').
        header (str): Value for the reader's "header" option (default 'true').

    Returns:
        DataFrame: The loaded data.
    """
    # Configure the reader one step at a time, then load the file.
    reader = ss.read.format(format)
    reader = reader.option("inferSchema", infer_schema)
    reader = reader.option("header", header)
    return reader.load(file_path)


def create_temp_view(df, view_name):
    """
    Register a DataFrame as a temporary SQL view, replacing any view of the same name.

    Args:
        df (DataFrame): The DataFrame to expose to SQL queries.
        view_name (str): Name under which the view is registered.
    """
    register_view = df.createOrReplaceTempView
    register_view(view_name)


## Database Creation

In [None]:
# Load the source datasets, then register each one as a SQL view.
businesses_df = read_data("../yelp_dataset/yelp_academic_dataset_business.json")
photos_meta = read_data('../yelp_photos/photos.json')
reviews_df = read_data("../yelp_dataset/yelp_academic_dataset_review.json")
# The "_c0" column is dropped from the scores CSV
# (presumably an unnamed index column from the export; confirm).
zero_shot_scores_df = read_data("../zero_shot_scores.csv", format='csv')
zero_shot_scores_df = zero_shot_scores_df.drop("_c0")

# View names are the table names used by the SQL queries in later cells.
create_temp_view(businesses_df, "businesses")
create_temp_view(photos_meta, "photos_metadata")
create_temp_view(reviews_df, "reviews")
create_temp_view(zero_shot_scores_df, "zero_shot_scores")


In [None]:
# Step 1: keep Philadelphia businesses whose categories mention a restaurant
# or a bar; '%Bar%' would also match "Barbers", so those are excluded.
query = """
    SELECT *
    FROM businesses
    WHERE city = 'Philadelphia' 
      AND (categories LIKE '%Restaurant%' OR categories LIKE '%Bar%')
      AND categories NOT LIKE '%Barbers%'
"""
philly_restaurants_bars_df = ss.sql(query)
create_temp_view(philly_restaurants_bars_df, "philly_restaurants_bars")


# Step 2: count photos per business, keeping businesses with more than one photo.
query = """
    SELECT business_id, count(photo_id) AS photos_count
    FROM photos_metadata
    GROUP BY business_id
    HAVING photos_count > 1
"""
photo_counts_df = ss.sql(query)
create_temp_view(photo_counts_df, "temp_df_photos")


# Step 3: inner join, so only businesses that passed both filters remain,
# each carrying its photo count.
query = """
    SELECT b.*, p.photos_count
    FROM philly_restaurants_bars AS b
    JOIN temp_df_photos AS p ON b.business_id = p.business_id
"""
philly_businesses_photos_df = ss.sql(query)
create_temp_view(philly_businesses_photos_df, "philly_restaurants_bars_1_photo")


In [None]:
# Expanding the Ambience field
# attributes.Ambience appears to hold a Python-style dict string (single
# quotes, True/False). The replace() chain rewrites it into JSON so that
# get_json_object can extract one flag per output column.
# NOTE(review): a Python `None` inside the string is not rewritten and is not
# valid JSON; confirm whether the data contains such values.
ambience_json_sql = """replace(replace(replace(attributes.Ambience, "'", '"'), 'False', 'false'), 'True', 'true')"""

# One output column per ambience flag; the column name is the capitalised key
# (e.g. '$.touristy' -> Touristy).
ambience_keys = [
    "touristy",
    "hipster",
    "romantic",
    "divey",
    "intimate",
    "trendy",
    "upscale",
    "classy",
    "casual",
]
ambience_columns_sql = ",\n           ".join(
    f"get_json_object({ambience_json_sql}, '$.{key}') AS {key.capitalize()}"
    for key in ambience_keys
)

ambience_expanded_df = ss.sql(f"""
    SELECT *,
           {ambience_columns_sql}
    FROM philly_restaurants_bars_1_photo
""")
create_temp_view(ambience_expanded_df, "ambience_expanded")

In [None]:
# Extract data from keywords
# Count, per business, the reviews whose text mentions "romantic" and attach
# that count to every row of ambience_expanded. The RIGHT JOIN keeps
# businesses with no matching review; their romantic_reviews is NULL.
# lower(text) makes the match case-insensitive, so "Romantic" also counts.
query = """
        SELECT ambience_expanded.*, sub.romantic_reviews
        FROM (
            SELECT business_id, count(*) AS romantic_reviews
            FROM reviews
            WHERE business_id IN (SELECT business_id
                                  FROM philly_restaurants_bars_1_photo)
            AND lower(text) LIKE '%romantic%'
            GROUP BY business_id
            ) AS sub
        RIGHT JOIN ambience_expanded
         ON sub.business_id = ambience_expanded.business_id
        """
romantic = ss.sql(query)
create_temp_view(romantic, 'added_romantic')


# Count, per business, the reviews that mention a family-friendly phrase and
# attach that count to every row of added_romantic. The RIGHT JOIN keeps
# businesses with no matching review; their family_friendly_reviews is NULL.
# lower(text) makes the match case-insensitive, so "Family friendly" also counts.
query = """
        SELECT added_romantic.*, sub.family_friendly_reviews
        FROM (
            SELECT business_id, count(*) AS family_friendly_reviews
            FROM reviews
            WHERE business_id IN (SELECT business_id
                                  FROM philly_restaurants_bars_1_photo)
             AND (lower(text) LIKE '%good for kids%' OR
                  lower(text) LIKE '%family friendly%' OR
                  lower(text) LIKE '%kids menu%')
            GROUP BY business_id
            ) AS sub
        RIGHT JOIN added_romantic
         ON sub.business_id = added_romantic.business_id
        ORDER BY sub.family_friendly_reviews DESC
        """
family_friendly = ss.sql(query)
create_temp_view(family_friendly, 'added_family_friendly')


# Count, per business, the reviews that mention "rooftop" or "skyline" and
# attach that count to every row of added_family_friendly. The RIGHT JOIN
# keeps businesses with no matching review; their rooftop_skyline_reviews is NULL.
# lower(text) covers every capitalisation ("Rooftop", "ROOFTOP", ...) instead
# of listing individual case variants.
query = """
        SELECT added_family_friendly.*, sub.rooftop_skyline_reviews
        FROM (
            SELECT business_id, count(*) AS rooftop_skyline_reviews
            FROM reviews
            WHERE business_id IN (SELECT business_id
                                  FROM philly_restaurants_bars_1_photo)
             AND (lower(text) LIKE '%rooftop%' OR lower(text) LIKE '%skyline%')
            GROUP BY business_id
            ) AS sub
        RIGHT JOIN added_family_friendly
         ON sub.business_id = added_family_friendly.business_id
        ORDER BY sub.rooftop_skyline_reviews DESC
        """
rooftop_skyline = ss.sql(query)
create_temp_view(rooftop_skyline, 'added_rooftop_skyline')

In [None]:
# Combine the keyword-enriched business rows with the zero-shot classification
# scores to offer a comprehensive business profile. The LEFT JOIN keeps
# businesses that have no row in zero_shot_scores.
final_table_df = ss.sql("""
    SELECT *
    FROM added_rooftop_skyline r
    LEFT JOIN zero_shot_scores z ON r.business_id = z.bid
    """)
# Overwrite mode lets this cell be re-run: with the default save mode,
# saveAsTable raises an error once the table already exists.
final_table_df.write.mode('overwrite').saveAsTable('philly_df_extracted_keywords')

## Filtering Database for Vibe Matches

**Vibes**:
- Coworking Cafe
- Brunch
- Green
- Local Delicacies
- Romantic/Date Night
- Upscale/Special Occasion
- Family-Friendly
- Rooftop
- Budget

In [None]:
# Coworking Cafe
# Casual cafes / coffee places with free WiFi whose GoodForMeal entry for
# dinner is false. GoodForMeal is rewritten into JSON (quotes and True/False)
# before the flag is read.
query = """
        SELECT name, business_id, stars, round(coworking_cafe_score, 1) AS zero_shot_score
        FROM philly_df_extracted_keywords
        WHERE Casual = 'true' AND attributes.WiFi LIKE '%free%'
         AND (categories LIKE '%Cafes%' OR categories LIKE '%Coffee%')
         AND get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.dinner') = 'false'
        """

# Rank by stars, then by the rounded zero-shot score, then by review count.
coworking_df = ss.sql(query).orderBy(
    F.col('stars').desc(),
    F.round('coworking_cafe_score', 1).desc(),
    F.col('review_count').desc(),
)

print("Coworking")
coworking_df.show(10, truncate=False)

Coworking


                                                                                

+------------------------+----------------------+-----+---------------+
|name                    |business_id           |stars|zero_shot_score|
+------------------------+----------------------+-----+---------------+
|Milkcrate Cafe          |e-ZyZc24wgkKafM3pguR2w|4.5  |0.8            |
|Nook Bakery & Coffee Bar|e4MoozYGqe_rb4_ZC1rYMQ|4.5  |0.7            |
|Volo Coffeehouse        |k2YJkdLg25xlYjshpeEtkQ|4.5  |0.7            |
|The Living Room Cafe    |uJvTBGksfErVDSOeBsmOyA|4.5  |0.7            |
|Cafe Walnut             |9pr6rI87hZTCIAnFGNnvzQ|4.5  |0.6            |
|Café y Chocolate        |qaDImxPguQz0jToNYvB1Eg|4.5  |0.5            |
|Lulu Cafe               |q-m9__XyqMhuX5cValqS2w|4.5  |0.5            |
|Function Coffee Labs    |BaSwNEingTmrBw4shffK5w|4.5  |0.5            |
|United By Blue          |ZpgVL2z1kgRi954c9m9INw|4.5  |0.4            |
|Talula's Daily          |EreYgrQPuR7Sk_FKeZZg9g|4.5  |0.3            |
+------------------------+----------------------+-----+---------------+

73

In [2]:
# Brunch
# Trendy places that either list a Brunch category or whose GoodForMeal
# entry for brunch is true.
query = """
        SELECT name, business_id, stars, round(brunch_score, 1) AS zero_shot_score
        FROM philly_df_extracted_keywords
        WHERE Trendy = 'true'
         AND (categories LIKE '%Brunch%' OR get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.brunch') = 'true')
        """

# Rank by the rounded zero-shot score first, then stars, then review count.
brunch_df = ss.sql(query).orderBy(
    F.round('brunch_score', 1).desc(),
    F.col('stars').desc(),
    F.col('review_count').desc(),
)

In [3]:
# Green
# Places categorised as both Vegetarian and Vegan whose GoodForMeal entry for
# lunch is true.
# NOTE(review): the two category tests are combined with AND; confirm that
# requiring both (rather than either) is intended.
query = """
        SELECT name, business_id, stars
        FROM philly_df_extracted_keywords
        WHERE categories LIKE '%Vegetarian%' AND categories LIKE '%Vegan%'
         AND get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.lunch') = 'true'
        """

# Rank by stars, then by review count.
green_df = ss.sql(query).orderBy(
    F.col('stars').desc(),
    F.col('review_count').desc(),
)

In [4]:
# Local Delicacies
# Touristy places, cheesesteak places, or places with "Philadelphia" in the
# name, restricted to rows that carry a GoodForMeal attribute.
# NOTE(review): `!= 'null'` compares against the literal string 'null'; rows
# whose GoodForMeal is SQL NULL are also dropped because the comparison then
# yields NULL. Confirm whether IS NOT NULL was the intent.
# Sorting is done once, by the DataFrame orderBy below (as in the other vibe
# cells), instead of repeating the same keys in a SQL ORDER BY.
query = """
        SELECT name, business_id, stars
        FROM philly_df_extracted_keywords
        WHERE (Touristy = 'true' OR categories LIKE '%Cheesesteak%' OR name LIKE '%Philadelphia%')
         AND attributes.GoodForMeal != 'null'
        """

# Rank by stars, then by review count.
local_delicacies_df = ss.sql(query).orderBy(F.desc('stars'), F.desc('review_count'))

In [5]:
# Romantic/Date Night
# Price range 2 or 3 places flagged as Romantic or Trendy.
query = """
        SELECT name, business_id, stars, round(romantic_date_night_score, 2) AS zero_shot_score
        FROM philly_df_extracted_keywords
        WHERE (attributes.RestaurantsPriceRange2 = 2 OR attributes.RestaurantsPriceRange2 = 3)
         AND (Romantic = 'true' OR Trendy = 'true')
        """

# Rank by the rounded zero-shot score, then stars, then review count, then the
# number of reviews that mention "romantic".
romantic_date_night_df = ss.sql(query).orderBy(
    F.round('romantic_date_night_score', 2).desc(),
    F.col('stars').desc(),
    F.col('review_count').desc(),
    F.col('romantic_reviews').desc(),
)

In [6]:
# Upscale/Special Occasion
# Price range 4 places that take reservations and are flagged Classy or
# Upscale, or explicitly flagged as not Casual.
query = """
        SELECT name, business_id, stars, round(upscale_special_occasion_score, 1) AS zero_shot_score
        FROM philly_df_extracted_keywords
        WHERE attributes.RestaurantsPriceRange2 = 4
         AND (Classy = 'true' OR Upscale = 'true' OR Casual = 'false')
         AND attributes.RestaurantsReservations = 'True'
        """

# Rank by stars, then by the rounded zero-shot score, then by review count.
special_occassion_df = ss.sql(query).orderBy(
    F.col('stars').desc(),
    F.round('upscale_special_occasion_score', 1).desc(),
    F.col('review_count').desc(),
)

In [7]:
# Family-Friendly
# Casual, kid-friendly places in price range 1 or 2 with an average noise
# level, excluding anything categorised as Nightlife.
query = """
        SELECT name, business_id, stars
        FROM philly_df_extracted_keywords
        WHERE (attributes.RestaurantsPriceRange2 = 1 OR attributes.RestaurantsPriceRange2 = 2)
         AND Casual = 'true'
         AND attributes.GoodForKids = 'True'
         AND categories NOT LIKE '%Nightlife%' 
         AND attributes.NoiseLevel LIKE '%average%'
        """

# Rank by the number of family-friendly review mentions, then stars, then
# review count.
family_friendly_df = ss.sql(query).orderBy(
    F.col('family_friendly_reviews').desc(),
    F.col('stars').desc(),
    F.col('review_count').desc(),
)

In [8]:
# Rooftop
# Places with more than 20 reviews that mention a rooftop or skyline.
# The rounded score is aliased to zero_shot_score so the result has the same
# column name as the other vibe queries.
query = """
        SELECT name, business_id, stars, round(rooftop_score, 1) AS zero_shot_score
        FROM philly_df_extracted_keywords
        WHERE rooftop_skyline_reviews > 20
        """

# Rank by the rounded zero-shot score, then stars, then the number of
# rooftop/skyline review mentions.
rooftop_df = ss.sql(query).orderBy(F.desc(F.round('rooftop_score', 1)),
                                   F.desc('stars'),
                                   F.desc('rooftop_skyline_reviews'))

In [9]:
# Budget
# Price range 1 places with table service that are good for lunch or dinner,
# excluding fast food and bakeries.
# The Bakeries pattern needs a wildcard on both sides: categories is matched
# as one string, and '%Bakeries' would only exclude rows where "Bakeries" is
# the last category.
# NOTE(review): this query reads the ambience_expanded temp view, whereas the
# other vibe queries read the saved philly_df_extracted_keywords table.
query = """
        SELECT name, business_id, stars
        FROM ambience_expanded
        WHERE categories NOT LIKE '%Fast Food%'
         AND attributes.RestaurantsPriceRange2 = 1
         AND categories NOT LIKE '%Bakeries%'
         AND attributes.RestaurantsTableService = 'True'
         AND (get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.lunch') = 'true'
         OR get_json_object(replace(replace(replace(attributes.GoodForMeal, "'", '"'), 'False', 'false'), 'True', 'true'), '$.dinner') = 'true')
        """

# Rank by stars, then by review count.
budget_df = ss.sql(query).orderBy(F.desc('stars'), F.desc('review_count'))