In [1]:
from pyathena import connect
import pandas as pd
import configparser

In [2]:
# Load the AWS credentials from a local config file that lives outside the notebook,
# so that no secret is hard-coded in a cell.
CONFIG_PATH = '../access_keys_shreya.cfg'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of files
# it actually parsed. Check that list so a wrong path fails here with a clear message
# instead of surfacing later as a confusing NoSectionError.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        "AWS credentials file not found or unreadable: {}".format(CONFIG_PATH)
    )
# Both keys are expected under the [aws] section of the config file.
AWS_ACCESS_KEY = config.get('aws', 'aws_access_key')
AWS_SECRET_KEY = config.get('aws', 'aws_secret_key')

In [3]:
from six.moves.urllib.parse import quote_plus
from sqlalchemy.engine import create_engine

# Athena settings: schema to query, S3 location handed to Athena as the staging
# directory, and the AWS region of the Athena endpoint.
SCHEMA_NAME = "yelp"
S3_STAGING_DIR = "s3://sairin.yelp.dataset/dataset"
AWS_REGION = "us-west-1"

# SQLAlchemy URL template for the `awsathena+rest` dialect (requires pyathena to be
# installed). The placeholders are filled in from `url_fields` below.
conn_str = (
    "awsathena+rest://{aws_access_key_id}:{aws_secret_access_key}@"
    "athena.{region_name}.amazonaws.com:443/"
    "{schema_name}?s3_staging_dir={s3_staging_dir}&work_group=primary"
)

# The credentials and the staging directory are URL-encoded because they may contain
# characters ('/', '+', ':') that are not valid verbatim inside a URL.
url_fields = {
    "aws_access_key_id": quote_plus(AWS_ACCESS_KEY),
    "aws_secret_access_key": quote_plus(AWS_SECRET_KEY),
    "region_name": AWS_REGION,
    "schema_name": SCHEMA_NAME,
    "s3_staging_dir": quote_plus(S3_STAGING_DIR),
}

# Build the engine and open the connection that the query cells reuse.
engine = create_engine(conn_str.format(**url_fields))
athena_connection = engine.connect()

In [5]:
# SQL text that selects every column of every row from the
# `yelp.philadelphia_details_table` Athena table; it is executed in a later cell.
query_phialdelphia_restaurants = """
        SELECT * 
        FROM yelp.philadelphia_details_table
        """

In [29]:
# Run the query through the open Athena connection and load the result into pandas.
df_restaurant_philadelphia_cusine = pd.read_sql(
    sql=query_phialdelphia_restaurants,
    con=athena_connection,
)
# Keep a local pickle snapshot of the query result.
df_restaurant_philadelphia_cusine.to_pickle(
    path='../src/data/restaurant_philadelphia_details.pkl'
)

In [7]:
# Preview the first rows of the restaurant table loaded from Athena.
df_restaurant_philadelphia_cusine.head()

Unnamed: 0,business_id,review_count,name,address,city,state,latitude,longitude,categories,stars
0,i_FWONQD1ZBqrNE2b-M5Ug,1964,Talula's Garden,210 W Washington Sq,Philadelphia,PA,39.947327,-75.153542,"American (New), Restaurants",4.446537
1,2CDI713ATuxHfnB5b-sBdw,1492,Vedge,1221 Locust St,Philadelphia,PA,39.947931,-75.161428,"Restaurants, Vegetarian, Beer, Wine & Spirits,...",4.481233
2,mtvT7uRey3F395STFRM1Tg,1049,Vernick Food & Drink,2031 Walnut St,Philadelphia,PA,39.950759,-75.174846,"Food, Restaurants, Beer, Wine & Spirits, Diner...",4.499523
3,J8S7cPPlTgsQnXKVfTyN8g,851,PYT,"The Piazza At Schmidts, 1050 N Hancock St",Philadelphia,PA,39.966505,-75.139149,"American (Traditional), Food, Salad, Burgers, ...",2.998825
4,rYqmaOIULRouz_1db07OdQ,844,Green Eggs Cafe,1306 Dickinson St,Philadelphia,PA,39.931348,-75.166497,"Diners, Restaurants, Breakfast & Brunch, Ameri...",3.957346


In [8]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, STOPWORDS, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, stem_text
from gensim import utils
import re

# Custom filters
def remove_stopwords(text):
    """Return `text` without the words that appear in gensim's STOPWORDS.

    The text is split on whitespace, each word is checked for exact membership in
    STOPWORDS, and the surviving words are re-joined with single spaces.
    """
    kept_words = [token for token in text.split() if token not in STOPWORDS]
    return ' '.join(kept_words)

def separate_combined_words(text):
    """Replace each non-empty parenthesised group with its content padded by spaces.

    For example ``"American (New)"`` becomes ``"American  New "``: the parentheses are
    dropped and the enclosed text is surrounded by one space on each side.
    """
    # One or more non-')' characters between '(' and ')'; empty "()" is left untouched.
    parenthesised = re.compile(r'\(([^)]+)\)')
    return parenthesised.sub(r' \1 ', text)

# Combined filters
# Ordered list of text filters handed to gensim's preprocess_string: lowercase, then
# the gensim strip_* / stem_text helpers, then the two custom filters defined above.
# NOTE(review): strip_punctuation runs before separate_combined_words. If it already
#   removes "(" and ")" (gensim documents it as stripping ASCII punctuation), the
#   parenthesis-splitting regex can never match at that point — confirm the order.
# NOTE(review): remove_stopwords runs after stem_text, so it compares *stemmed* words
#   against STOPWORDS; verify that this is intended rather than filtering before stemming.
FILTERS = [
    lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces,
    strip_numeric, stem_text, separate_combined_words, remove_stopwords
]

def clean_tokenize_text(df_restaurant_philadelphia_cusine):
    """Return a copy of the restaurant frame with an added 'tokens' column.

    Each value of the 'categories' column is run through gensim's ``preprocess_string``
    with the notebook's ``FILTERS`` pipeline, producing a list of cleaned tokens.

    Parameters
    ----------
    df_restaurant_philadelphia_cusine : pandas.DataFrame
        Frame with a 'categories' column of category strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with all original columns plus 'tokens'. The input frame is left
        unmodified, so re-running the cell is idempotent and the raw frame stays raw.

    Raises
    ------
    KeyError
        If the frame has no 'categories' column.
    """
    def _tokenize(categories):
        # Missing categories (None / NaN) cannot be decoded by preprocess_string;
        # treat them as "no tokens" instead of crashing the whole map.
        if pd.isna(categories):
            return []
        return preprocess_string(categories, FILTERS)

    # assign() builds a new DataFrame rather than writing into the caller's object.
    return df_restaurant_philadelphia_cusine.assign(
        tokens=df_restaurant_philadelphia_cusine['categories'].map(_tokenize)
    )

# Tokenise the 'categories' column of the restaurant table and preview the result,
# which now carries a 'tokens' column.
philadelphia_cusine_tokens = clean_tokenize_text(df_restaurant_philadelphia_cusine)
philadelphia_cusine_tokens.head()


Unnamed: 0,business_id,review_count,name,address,city,state,latitude,longitude,categories,stars,tokens
0,i_FWONQD1ZBqrNE2b-M5Ug,1964,Talula's Garden,210 W Washington Sq,Philadelphia,PA,39.947327,-75.153542,"American (New), Restaurants",4.446537,"[american, new, restaur]"
1,2CDI713ATuxHfnB5b-sBdw,1492,Vedge,1221 Locust St,Philadelphia,PA,39.947931,-75.161428,"Restaurants, Vegetarian, Beer, Wine & Spirits,...",4.481233,"[restaur, vegetarian, beer, wine, spirit, vega..."
2,mtvT7uRey3F395STFRM1Tg,1049,Vernick Food & Drink,2031 Walnut St,Philadelphia,PA,39.950759,-75.174846,"Food, Restaurants, Beer, Wine & Spirits, Diner...",4.499523,"[food, restaur, beer, wine, spirit, diner, ame..."
3,J8S7cPPlTgsQnXKVfTyN8g,851,PYT,"The Piazza At Schmidts, 1050 N Hancock St",Philadelphia,PA,39.966505,-75.139149,"American (Traditional), Food, Salad, Burgers, ...",2.998825,"[american, tradit, food, salad, burger, bar, n..."
4,rYqmaOIULRouz_1db07OdQ,844,Green Eggs Cafe,1306 Dickinson St,Philadelphia,PA,39.931348,-75.166497,"Diners, Restaurants, Breakfast & Brunch, Ameri...",3.957346,"[diner, restaur, breakfast, brunch, american, ..."


In [27]:
def restaurant_within_boundary(boundary, restaurants=None, top_n=10):
    """Return the most-reviewed restaurants located inside a latitude/longitude box.

    Parameters
    ----------
    boundary : mapping
        Must provide the keys "south", "north", "west" and "east". "south"/"north" are
        the minimum/maximum latitude and "west"/"east" the minimum/maximum longitude.
        All four limits are inclusive.
    restaurants : pandas.DataFrame, optional
        Frame with "latitude", "longitude", "review_count" and "name" columns.
        Defaults to the notebook-level ``df_restaurant_philadelphia_cusine`` so that
        existing one-argument calls keep working.
    top_n : int, optional
        Maximum number of rows to return (default 10).

    Returns
    -------
    pandas.DataFrame
        The "name" and "review_count" columns of at most ``top_n`` matching rows,
        ordered by "review_count" descending. Empty if nothing falls inside the box.

    Raises
    ------
    KeyError
        If ``boundary`` lacks one of the four keys or a required column is missing.
    """
    if restaurants is None:
        # Fall back to the frame loaded earlier in the notebook.
        restaurants = df_restaurant_philadelphia_cusine

    lat_min = boundary["south"]
    lat_max = boundary["north"]
    long_min = boundary["west"]
    long_max = boundary["east"]

    # Series.between is inclusive on both ends, matching the >= / <= comparisons.
    inside_box = (
        restaurants["latitude"].between(lat_min, lat_max)
        & restaurants["longitude"].between(long_min, long_max)
    )

    # Most-reviewed first, then keep only the columns the caller displays.
    ranked = restaurants.loc[inside_box].sort_values(by="review_count", ascending=False)
    return ranked[["name", "review_count"]].head(top_n)



In [28]:
# Latitude/longitude limits of the area of interest, keyed by compass side.
boundary = {
    "east": -75.15088773269655,
    "north": 39.962888035213524,
    "south": 39.94231041707019,
    "west": -75.155,
}

# Look up the restaurants inside the box and display them as the cell output.
restaurants_within_boundaries = restaurant_within_boundary(boundary)
restaurants_within_boundaries

Unnamed: 0,name,review_count
0,Talula's Garden,1964
63,Morimoto,1943
15,Terakawa Ramen,1753
93,Jones,1168
177,Fat Salmon,1130
96,Sang Kee Peking Duck House,962
230,Blackbird Pizzeria,761
41,Pho Xe Lua Viet Thai Restaurant,732
254,Rangoon Burmese Restaurant,624
60,Independence Beer Garden,519


In [38]:
import pandas as pd
from utils import  generate_bert_embeddings

# Assumes the tokenised DataFrame `philadelphia_cusine_tokens` (with its 'tokens' column) was built by an earlier cell.

def get_bert_embedding(row):
    """Embed one restaurant row from its token list.

    The row's 'tokens' (a list of strings) are joined with single spaces into one
    string, which is passed to ``generate_bert_embeddings``; that function's result
    is returned unchanged.
    """
    sentence = ' '.join(row['tokens'])
    embedding = generate_bert_embeddings(sentence)
    return embedding

# Compute one embedding per restaurant by calling get_bert_embedding on every row
# (axis=1) and store the results in a new 'bert_embedding' column. This writes into
# philadelphia_cusine_tokens in place.
philadelphia_cusine_tokens['bert_embedding'] = philadelphia_cusine_tokens.apply(get_bert_embedding, axis=1)


philadelphia_cusine_tokens.head()



Unnamed: 0,business_id,review_count,name,address,city,state,latitude,longitude,categories,stars,tokens,bert_embedding
0,i_FWONQD1ZBqrNE2b-M5Ug,1964,Talula's Garden,210 W Washington Sq,Philadelphia,PA,39.947327,-75.153542,"American (New), Restaurants",4.446537,"[american, new, restaur]","[0.06482988, -0.072307155, -0.0048128353, -0.0..."
1,2CDI713ATuxHfnB5b-sBdw,1492,Vedge,1221 Locust St,Philadelphia,PA,39.947931,-75.161428,"Restaurants, Vegetarian, Beer, Wine & Spirits,...",4.481233,"[restaur, vegetarian, beer, wine, spirit, vega...","[0.063544795, 0.043516044, -0.066077605, -0.00..."
2,mtvT7uRey3F395STFRM1Tg,1049,Vernick Food & Drink,2031 Walnut St,Philadelphia,PA,39.950759,-75.174846,"Food, Restaurants, Beer, Wine & Spirits, Diner...",4.499523,"[food, restaur, beer, wine, spirit, diner, ame...","[0.09352894, -0.011953473, -0.030399857, -0.03..."
3,J8S7cPPlTgsQnXKVfTyN8g,851,PYT,"The Piazza At Schmidts, 1050 N Hancock St",Philadelphia,PA,39.966505,-75.139149,"American (Traditional), Food, Salad, Burgers, ...",2.998825,"[american, tradit, food, salad, burger, bar, n...","[0.013814832, -0.061587844, -0.04581057, -0.03..."
4,rYqmaOIULRouz_1db07OdQ,844,Green Eggs Cafe,1306 Dickinson St,Philadelphia,PA,39.931348,-75.166497,"Diners, Restaurants, Breakfast & Brunch, Ameri...",3.957346,"[diner, restaur, breakfast, brunch, american, ...","[0.019389452, -0.07336227, 0.021259494, -0.042..."


In [41]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def similar_cusinine_restaurants(cuisine, top_n=10):
    """Return the names of the restaurants whose embedding is closest to a cuisine.

    The cuisine text is embedded with ``generate_bert_embeddings`` and compared, by
    cosine similarity, with the 'bert_embedding' of every row of the notebook-level
    ``philadelphia_cusine_tokens`` frame.

    Parameters
    ----------
    cuisine : str
        Free-text cuisine description, e.g. ``'thai'``.
    top_n : int, optional
        Maximum number of restaurant names to return (default 10).

    Returns
    -------
    list of str
        Restaurant names ordered from most to least similar. Rows with equal
        similarity keep their original frame order. Empty if the frame has no rows.
    """
    # cosine_similarity needs 2-D inputs, so the query becomes a single-row matrix.
    # Assumes generate_bert_embeddings returns one flat vector — TODO confirm.
    cusine_embedding = np.asarray(generate_bert_embeddings(cuisine)).reshape(1, -1)

    stored_embeddings = philadelphia_cusine_tokens['bert_embedding']
    if len(stored_embeddings) == 0:
        return []

    # Stack all restaurant vectors into one (n_restaurants, dim) matrix so similarity
    # is computed in a single call, giving a flat array of scalar scores rather than
    # one 1x1 array per row.
    embedding_matrix = np.vstack(
        [np.asarray(vector).reshape(1, -1) for vector in stored_embeddings]
    )
    scores = cosine_similarity(cusine_embedding, embedding_matrix)[0]

    # Stable descending order: ties are resolved by original row position.
    best_positions = np.argsort(-scores, kind='stable')[:top_n]

    # Positional lookup avoids copying the whole frame just to sort it.
    return philadelphia_cusine_tokens['name'].iloc[best_positions].tolist()


result=similar_cusinine_restaurants('thai')
result

['Pho Xe Lua Viet Thai Restaurant',
 'Penang',
 'Xiandu Thai',
 'Sampan',
 'Spice 28',
 'Banana Leaf',
 'Dan Dan',
 'Seorabol Korean Restaurant',
 'Nine Ting',
 'Cheu Noodle Bar']

In [58]:
from sklearn.metrics.pairwise import cosine_similarity

def get_closest_cuisine(row):
    """Return the cuisine label whose embedding is most similar to this restaurant.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must contain a 'bert_embedding' entry holding the restaurant's vector.

    Returns
    -------
    str
        The key of the notebook-level ``cuisine_embeddings`` dict with the highest
        cosine similarity to the row's embedding. On a tie the first key in the
        dict's iteration order wins.

    Raises
    ------
    ValueError
        If ``cuisine_embeddings`` is empty (``max`` of an empty mapping).
    """
    # cosine_similarity needs 2-D inputs, hence the single-row reshape.
    restaurant_embedding = np.asarray(row['bert_embedding']).reshape(1, -1)

    # cosine_similarity returns a 1x1 matrix for two single-row inputs; take the one
    # element as a plain float so max() compares numbers rather than ndarrays.
    similarities = {
        cuisine: float(
            cosine_similarity(
                restaurant_embedding, np.asarray(embedding).reshape(1, -1)
            )[0, 0]
        )
        for cuisine, embedding in cuisine_embeddings.items()
    }

    # Get the cuisine with the highest similarity
    return max(similarities, key=similarities.get)
# Candidate cuisine labels that each restaurant is matched against.
cuisines = [
    'american', 'italian', 'mexican', 'european', 'indian',
    'thai', 'chinese', 'korean', 'wine-spirit',
]
# Embed every label once up front; get_closest_cuisine reads this dict for each row.
cuisine_embeddings = dict(zip(cuisines, map(generate_bert_embeddings, cuisines)))
# Label each restaurant with its most similar cuisine and preview the result.
philadelphia_cusine_tokens['closest_cuisine'] = philadelphia_cusine_tokens.apply(
    get_closest_cuisine, axis=1
)
philadelphia_cusine_tokens.head()


Unnamed: 0,business_id,review_count,name,address,city,state,latitude,longitude,categories,stars,tokens,bert_embedding,closest_cuisine
0,i_FWONQD1ZBqrNE2b-M5Ug,1964,Talula's Garden,210 W Washington Sq,Philadelphia,PA,39.947327,-75.153542,"American (New), Restaurants",4.446537,"[american, new, restaur]","[0.06482988, -0.072307155, -0.0048128353, -0.0...",american
1,2CDI713ATuxHfnB5b-sBdw,1492,Vedge,1221 Locust St,Philadelphia,PA,39.947931,-75.161428,"Restaurants, Vegetarian, Beer, Wine & Spirits,...",4.481233,"[restaur, vegetarian, beer, wine, spirit, vega...","[0.063544795, 0.043516044, -0.066077605, -0.00...",wine-spirit
2,mtvT7uRey3F395STFRM1Tg,1049,Vernick Food & Drink,2031 Walnut St,Philadelphia,PA,39.950759,-75.174846,"Food, Restaurants, Beer, Wine & Spirits, Diner...",4.499523,"[food, restaur, beer, wine, spirit, diner, ame...","[0.09352894, -0.011953473, -0.030399857, -0.03...",wine-spirit
3,J8S7cPPlTgsQnXKVfTyN8g,851,PYT,"The Piazza At Schmidts, 1050 N Hancock St",Philadelphia,PA,39.966505,-75.139149,"American (Traditional), Food, Salad, Burgers, ...",2.998825,"[american, tradit, food, salad, burger, bar, n...","[0.013814832, -0.061587844, -0.04581057, -0.03...",thai
4,rYqmaOIULRouz_1db07OdQ,844,Green Eggs Cafe,1306 Dickinson St,Philadelphia,PA,39.931348,-75.166497,"Diners, Restaurants, Breakfast & Brunch, Ameri...",3.957346,"[diner, restaur, breakfast, brunch, american, ...","[0.019389452, -0.07336227, 0.021259494, -0.042...",american


In [59]:
# Save the enriched frame (tokens, embeddings and closest cuisine) as a pickle file.
philadelphia_cusine_tokens.to_pickle('../src/data/restaurant_philadelphia_details_cuisine.pkl')


In [60]:
# Count how many restaurants were assigned to each cuisine label.
philadelphia_cusine_tokens['closest_cuisine'].value_counts()

closest_cuisine
wine-spirit    109
italian         40
thai            37
mexican         21
american        16
korean          14
european        13
chinese          6
indian           4
Name: count, dtype: int64