In [33]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from transformers import pipeline

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score



## Parsing the TXT File into a List

In [70]:
with open('text-E1B5DBE1DEA1-1.txt', "r") as file:
    lines = file.read()

FileNotFoundError: [Errno 2] No such file or directory: 'text-E1B5DBE1DEA1-1.txt'

In [3]:
restaurants = "".join(lines).split("\n\n")[1:-1]
restaurants

['So Saap (Laotian, Thai in City Heights)\nhttps://sosaap.com/?utm_source=google\nNotes: i forgot wat spice level i ordered but it was so flavorful and spiceful i loved my drunken noodles. the skewers were so tasty!! sugar cane was so sweet and refreshing. sugar cane should be my middle name',
 'Kura Revolving Sushi Bar (Japanese, Sushi in Kearny Mesa)\nhttps://kurasushi.com/locations/sandiegocaconvoy/\nNotes: i am in love. sushi was great. 10/10',
 'Kura Revolving Sushi Bar (Japanese, Sushi)\nhttps://kurasushi.com/locations/pleasantonca/\nNotes: we ate good. i got snow crab this time, didnt like it. got spicy salmon crispy rice too for the first time and it was so good. we got mochi too (strawberry and black sesame) but only liked strawberry',
 'Sugar and Scribe (Bakery, Brunch in Village of La Jolla)\nhttps://www.sugarandscribe.com/\nNotes: the shashuka was so good! hot chocolate was good too on the slightly chilly day 💨. and the nutcracker cup was cute!',
 'Hako Sushi Box (Sushi, Ja

## TXT File Contents to a Dictionary

In [4]:
restaurants[1].split('\n')

['Kura Revolving Sushi Bar (Japanese, Sushi in Kearny Mesa)',
 'https://kurasushi.com/locations/sandiegocaconvoy/',
 'Notes: i am in love. sushi was great. 10/10']

In [5]:
restaurant_details = {}
restaurant_details['Name'] = []
restaurant_details['URL'] = []
restaurant_details['Notes'] = []

In [6]:
for restaurant in restaurants:
    info = restaurant.split('\n')
    restaurant_details['Name'].append(info[0])
    restaurant_details['URL'].append(info[1])
    if len(info) == 2:
        restaurant_details['Notes'].append('No Notes')
    else:
        restaurant_details['Notes'].append(info[2][7:])

## DataFrame of Restaurants and Making Columns for Location + Cuisines

In [7]:
df = pd.DataFrame(restaurant_details)
df

Unnamed: 0,Name,URL,Notes
0,"So Saap (Laotian, Thai in City Heights)",https://sosaap.com/?utm_source=google,i forgot wat spice level i ordered but it was ...
1,"Kura Revolving Sushi Bar (Japanese, Sushi in K...",https://kurasushi.com/locations/sandiegocaconvoy/,i am in love. sushi was great. 10/10
2,"Kura Revolving Sushi Bar (Japanese, Sushi)",https://kurasushi.com/locations/pleasantonca/,"we ate good. i got snow crab this time, didnt ..."
3,"Sugar and Scribe (Bakery, Brunch in Village of...",https://www.sugarandscribe.com/,the shashuka was so good! hot chocolate was go...
4,"Hako Sushi Box (Sushi, Japanese)",https://www.hakosushibox.com/,the box had three rolls and was decently price...
...,...,...,...
87,"Menya Ultra UTC La Jolla (Ramen, Japanese in U...",https://menyaultra.com/,No Notes
88,Fortunate Son Chinese (Chinese in North Park),http://www.fortunatesonchinese.com/,No Notes
89,"The Taco Stand (Mexican, Tacos, Taqueria in Vi...",http://www.letstaco.com/,No Notes
90,The Melt (American in La Jolla),http://www.themelt.com/,No Notes


In [8]:
def cuisines_details(val):
    if '(' not in val:
        cuisine = 'No Cuisine'
    else:
        detail = val[val.find("(") + 1 : val.find(')')]
        if detail.find(' in ') == -1:
            cuisine = detail.split(', ')
        else:
            cuisine = detail[:detail.find(' in ')].strip().split(', ')
        return cuisine

In [9]:
def location_details(val):
    detail = val[val.find("(") + 1 : val.find(')')]
    if detail.find(' in ') == -1:
        location = 'No Detail'
    else:
        location = detail[detail.find(' in ') + 4: ].strip()
    return location

In [10]:
df['Cuisines'] = df['Name'].apply(cuisines_details)
df

Unnamed: 0,Name,URL,Notes,Cuisines
0,"So Saap (Laotian, Thai in City Heights)",https://sosaap.com/?utm_source=google,i forgot wat spice level i ordered but it was ...,"[Laotian, Thai]"
1,"Kura Revolving Sushi Bar (Japanese, Sushi in K...",https://kurasushi.com/locations/sandiegocaconvoy/,i am in love. sushi was great. 10/10,"[Japanese, Sushi]"
2,"Kura Revolving Sushi Bar (Japanese, Sushi)",https://kurasushi.com/locations/pleasantonca/,"we ate good. i got snow crab this time, didnt ...","[Japanese, Sushi]"
3,"Sugar and Scribe (Bakery, Brunch in Village of...",https://www.sugarandscribe.com/,the shashuka was so good! hot chocolate was go...,"[Bakery, Brunch]"
4,"Hako Sushi Box (Sushi, Japanese)",https://www.hakosushibox.com/,the box had three rolls and was decently price...,"[Sushi, Japanese]"
...,...,...,...,...
87,"Menya Ultra UTC La Jolla (Ramen, Japanese in U...",https://menyaultra.com/,No Notes,"[Ramen, Japanese]"
88,Fortunate Son Chinese (Chinese in North Park),http://www.fortunatesonchinese.com/,No Notes,[Chinese]
89,"The Taco Stand (Mexican, Tacos, Taqueria in Vi...",http://www.letstaco.com/,No Notes,"[Mexican, Tacos, Taqueria]"
90,The Melt (American in La Jolla),http://www.themelt.com/,No Notes,[American]


In [11]:
df['Location'] = df['Name'].apply(location_details)
df

Unnamed: 0,Name,URL,Notes,Cuisines,Location
0,"So Saap (Laotian, Thai in City Heights)",https://sosaap.com/?utm_source=google,i forgot wat spice level i ordered but it was ...,"[Laotian, Thai]",City Heights
1,"Kura Revolving Sushi Bar (Japanese, Sushi in K...",https://kurasushi.com/locations/sandiegocaconvoy/,i am in love. sushi was great. 10/10,"[Japanese, Sushi]",Kearny Mesa
2,"Kura Revolving Sushi Bar (Japanese, Sushi)",https://kurasushi.com/locations/pleasantonca/,"we ate good. i got snow crab this time, didnt ...","[Japanese, Sushi]",No Detail
3,"Sugar and Scribe (Bakery, Brunch in Village of...",https://www.sugarandscribe.com/,the shashuka was so good! hot chocolate was go...,"[Bakery, Brunch]",Village of La Jolla
4,"Hako Sushi Box (Sushi, Japanese)",https://www.hakosushibox.com/,the box had three rolls and was decently price...,"[Sushi, Japanese]",No Detail
...,...,...,...,...,...
87,"Menya Ultra UTC La Jolla (Ramen, Japanese in U...",https://menyaultra.com/,No Notes,"[Ramen, Japanese]",University City
88,Fortunate Son Chinese (Chinese in North Park),http://www.fortunatesonchinese.com/,No Notes,[Chinese],North Park
89,"The Taco Stand (Mexican, Tacos, Taqueria in Vi...",http://www.letstaco.com/,No Notes,"[Mexican, Tacos, Taqueria]",Village of La Jolla
90,The Melt (American in La Jolla),http://www.themelt.com/,No Notes,[American],La Jolla


In [12]:
df['Name'] = df['Name'].str.split(r'\((.*)\)').str[0].str.strip()
df

Unnamed: 0,Name,URL,Notes,Cuisines,Location
0,So Saap,https://sosaap.com/?utm_source=google,i forgot wat spice level i ordered but it was ...,"[Laotian, Thai]",City Heights
1,Kura Revolving Sushi Bar,https://kurasushi.com/locations/sandiegocaconvoy/,i am in love. sushi was great. 10/10,"[Japanese, Sushi]",Kearny Mesa
2,Kura Revolving Sushi Bar,https://kurasushi.com/locations/pleasantonca/,"we ate good. i got snow crab this time, didnt ...","[Japanese, Sushi]",No Detail
3,Sugar and Scribe,https://www.sugarandscribe.com/,the shashuka was so good! hot chocolate was go...,"[Bakery, Brunch]",Village of La Jolla
4,Hako Sushi Box,https://www.hakosushibox.com/,the box had three rolls and was decently price...,"[Sushi, Japanese]",No Detail
...,...,...,...,...,...
87,Menya Ultra UTC La Jolla,https://menyaultra.com/,No Notes,"[Ramen, Japanese]",University City
88,Fortunate Son Chinese,http://www.fortunatesonchinese.com/,No Notes,[Chinese],North Park
89,The Taco Stand,http://www.letstaco.com/,No Notes,"[Mexican, Tacos, Taqueria]",Village of La Jolla
90,The Melt,http://www.themelt.com/,No Notes,[American],La Jolla


In [13]:
missing_locations = df[df['Location'] == 'No Detail'].index.tolist()

In [14]:
df[df['Location'] == 'No Detail']

Unnamed: 0,Name,URL,Notes,Cuisines,Location
2,Kura Revolving Sushi Bar,https://kurasushi.com/locations/pleasantonca/,"we ate good. i got snow crab this time, didnt ...","[Japanese, Sushi]",No Detail
4,Hako Sushi Box,https://www.hakosushibox.com/,the box had three rolls and was decently price...,"[Sushi, Japanese]",No Detail
24,Manny's Kitchen,Notes: (if i remember correctly) ordered fish ...,No Notes,[Mexican],No Detail
31,Micheline's Pita House,https://www.pitahousesd.com/,No Notes,[La Jolla],No Detail
33,Country Waffles,https://www.countrywaffleseastbay.com/,had a fire omelette. i ate that shit. yum,[American],No Detail
38,Stratford Court Cafe,https://www.stratfordcourtcafe.com/,No Notes,[Cafe],No Detail
39,Miguel's Cocina,http://www.miguelscocina.com/,No Notes,[Mexican],No Detail
41,Curry Pizza House Dublin,http://www.currypizzahouse.com/,i love having pizza from here. the flavors are...,[Pizza],No Detail
45,Monica's Livermore,https://www.monicaslivermore.com/,"was so good, the hollandaise sauce was somethi...",[Breakfast],No Detail
51,Hint of Desi,https://hintofdesi.com/,i had the chicken egg kathi roll. the meal was...,,No Detail


In [15]:
df['Location'].iloc[2] = 'Pleasanton'
df['Location'].iloc[4] = 'Chula Vista'
df['Location'].iloc[24] = 'Captain Cook'
df['Location'].iloc[31] = 'La Jolla'
df['Location'].iloc[33] = 'Dublin'
df['Location'].iloc[38] = 'Del Mar'
df['Location'].iloc[39] = 'San Diego'
df['Location'].iloc[41] = 'Dublin'
df['Location'].iloc[45] = 'Livermore'
df['Location'].iloc[51] = 'Food Truck'
df['Location'].iloc[52] = 'Pleasanton'
df['Location'].iloc[59] = 'Kailua-Kona'
df['Location'].iloc[66] = 'Kailua-Kona'
df['Location'].iloc[69] = 'Pleasanton'
df['Location'].iloc[77] = 'Avalon'
df['Location'].iloc[78] = 'San Ramon'
df['Location'].iloc[79] = 'Pleasanton'
df['Location'].iloc[81] = 'Pleasanton'
df['Location'].iloc[82] = 'Dublin'
df['Location'].iloc[83] = 'Pleasanton'
df['Location'].iloc[86] = 'Livermore'

In [16]:
df['Ratings'] = None
df

Unnamed: 0,Name,URL,Notes,Cuisines,Location,Ratings
0,So Saap,https://sosaap.com/?utm_source=google,i forgot wat spice level i ordered but it was ...,"[Laotian, Thai]",City Heights,
1,Kura Revolving Sushi Bar,https://kurasushi.com/locations/sandiegocaconvoy/,i am in love. sushi was great. 10/10,"[Japanese, Sushi]",Kearny Mesa,
2,Kura Revolving Sushi Bar,https://kurasushi.com/locations/pleasantonca/,"we ate good. i got snow crab this time, didnt ...","[Japanese, Sushi]",Pleasanton,
3,Sugar and Scribe,https://www.sugarandscribe.com/,the shashuka was so good! hot chocolate was go...,"[Bakery, Brunch]",Village of La Jolla,
4,Hako Sushi Box,https://www.hakosushibox.com/,the box had three rolls and was decently price...,"[Sushi, Japanese]",Chula Vista,
...,...,...,...,...,...,...
87,Menya Ultra UTC La Jolla,https://menyaultra.com/,No Notes,"[Ramen, Japanese]",University City,
88,Fortunate Son Chinese,http://www.fortunatesonchinese.com/,No Notes,[Chinese],North Park,
89,The Taco Stand,http://www.letstaco.com/,No Notes,"[Mexican, Tacos, Taqueria]",Village of La Jolla,
90,The Melt,http://www.themelt.com/,No Notes,[American],La Jolla,


In [17]:
df.loc[0:2, 'Ratings'] = 10.0
df.loc[3:6, 'Ratings'] = 9.9
df.loc[7:14, 'Ratings'] = 9.7
df.loc[14, 'Ratings'] = 9.5
df.loc[15:17, 'Ratings'] = 9.4
df.loc[17:20, 'Ratings'] = 9.3
df.loc[20:33, 'Ratings'] = 9.2
df.loc[33:35, 'Ratings'] = 8.7
df.loc[35:39, 'Ratings'] = 8.6
df.loc[39, 'Ratings'] = 8.5
df.loc[40:44, 'Ratings'] = 8.4
df.loc[44:46, 'Ratings'] = 8.3
df.loc[46:50, 'Ratings'] = 8.2
df.loc[50, 'Ratings'] = 8.1
df.loc[51:53, 'Ratings'] = 8.0
df.loc[53:56, 'Ratings'] = 7.9
df.loc[56:58, 'Ratings'] = 7.8
df.loc[58:65, 'Ratings'] = 7.7
df.loc[65, 'Ratings'] = 7.5
df.loc[66:69, 'Ratings'] = 7.4
df.loc[69:71, 'Ratings'] = 7.3
df.loc[71:74, 'Ratings'] = 7.2
df.loc[74:76, 'Ratings'] = 7.1
df.loc[76:79, 'Ratings'] = 7.0
df.loc[79:82, 'Ratings'] = 6.9
df.loc[82:84, 'Ratings'] = 6.8
df.loc[84:86, 'Ratings'] = 6.7
df.loc[86, 'Ratings'] = 6.2
df.loc[87, 'Ratings'] = 5.6
df.loc[88:90, 'Ratings'] = 5.0
df.loc[90, 'Ratings'] = 4.0
df.loc[91, 'Ratings'] = 3.4

In [18]:
df.loc[df['Cuisines'].apply(lambda x: x == ['La Jolla']), 'Cuisines'] = df.loc[
    df['Cuisines'].apply(lambda x: x == ['La Jolla'])
]['Cuisines'].apply(lambda _: ['Mediterranean'])

In [19]:
df['Cuisines'].value_counts()

Cuisines
[Pizza]                                7
[Mexican]                              5
[American]                             5
[Thai]                                 4
[Italian]                              4
[Indian]                               3
[Japanese, Sushi]                      3
[Chicken, Fast Food]                   2
[Mediterranean]                        2
[Breakfast]                            2
[Chinese]                              2
[Ramen, Japanese]                      2
[Japanese]                             2
[Ramen]                                2
[Barbecue, Bar]                        1
[Kebab, Mediterranean]                 1
[Caribbean]                            1
[Wine Bar, Italian]                    1
[American, Burgers, Fast Food]         1
[Hawaiian, Poke]                       1
[Mediterranean, Greek]                 1
[Laotian, Thai]                        1
[Sushi ,Grill and Chinese Cuisine]     1
[Breakfast, Brunch, American]          1
[Americ

In [20]:
df[df['Name'] == "Manny's Kitchen"]['URL'].iloc[0][7:]

'(if i remember correctly) ordered fish tacos and was a blast in the rainy weather! warmed me right up :D'

In [21]:
def moving_notes(row):
    url = row['URL'][7:]
    if 'http' not in row['URL']:
        row['Notes'] = url
        row['URL'] = 'No URL'
    return row

In [22]:
df = df.apply(moving_notes, axis = 1)
df

Unnamed: 0,Name,URL,Notes,Cuisines,Location,Ratings
0,So Saap,https://sosaap.com/?utm_source=google,i forgot wat spice level i ordered but it was ...,"[Laotian, Thai]",City Heights,10.0
1,Kura Revolving Sushi Bar,https://kurasushi.com/locations/sandiegocaconvoy/,i am in love. sushi was great. 10/10,"[Japanese, Sushi]",Kearny Mesa,10.0
2,Kura Revolving Sushi Bar,https://kurasushi.com/locations/pleasantonca/,"we ate good. i got snow crab this time, didnt ...","[Japanese, Sushi]",Pleasanton,10.0
3,Sugar and Scribe,https://www.sugarandscribe.com/,the shashuka was so good! hot chocolate was go...,"[Bakery, Brunch]",Village of La Jolla,9.9
4,Hako Sushi Box,https://www.hakosushibox.com/,the box had three rolls and was decently price...,"[Sushi, Japanese]",Chula Vista,9.9
...,...,...,...,...,...,...
87,Menya Ultra UTC La Jolla,https://menyaultra.com/,No Notes,"[Ramen, Japanese]",University City,5.6
88,Fortunate Son Chinese,http://www.fortunatesonchinese.com/,No Notes,[Chinese],North Park,5.0
89,The Taco Stand,http://www.letstaco.com/,No Notes,"[Mexican, Tacos, Taqueria]",Village of La Jolla,5.0
90,The Melt,http://www.themelt.com/,No Notes,[American],La Jolla,4.0


### Final Data Frame

In [23]:
df

Unnamed: 0,Name,URL,Notes,Cuisines,Location,Ratings
0,So Saap,https://sosaap.com/?utm_source=google,i forgot wat spice level i ordered but it was ...,"[Laotian, Thai]",City Heights,10.0
1,Kura Revolving Sushi Bar,https://kurasushi.com/locations/sandiegocaconvoy/,i am in love. sushi was great. 10/10,"[Japanese, Sushi]",Kearny Mesa,10.0
2,Kura Revolving Sushi Bar,https://kurasushi.com/locations/pleasantonca/,"we ate good. i got snow crab this time, didnt ...","[Japanese, Sushi]",Pleasanton,10.0
3,Sugar and Scribe,https://www.sugarandscribe.com/,the shashuka was so good! hot chocolate was go...,"[Bakery, Brunch]",Village of La Jolla,9.9
4,Hako Sushi Box,https://www.hakosushibox.com/,the box had three rolls and was decently price...,"[Sushi, Japanese]",Chula Vista,9.9
...,...,...,...,...,...,...
87,Menya Ultra UTC La Jolla,https://menyaultra.com/,No Notes,"[Ramen, Japanese]",University City,5.6
88,Fortunate Son Chinese,http://www.fortunatesonchinese.com/,No Notes,[Chinese],North Park,5.0
89,The Taco Stand,http://www.letstaco.com/,No Notes,"[Mexican, Tacos, Taqueria]",Village of La Jolla,5.0
90,The Melt,http://www.themelt.com/,No Notes,[American],La Jolla,4.0


In [32]:
df.to_csv('cleaned_beli.csv', header = True, index = False)

### Missing Values + General Data Frame Statistics

In [59]:
df[df['Cuisines'] == 'No Cuisine']['Cuisines'].count()

0

In [57]:
df[df['Notes'] == 'No Notes']['Notes'].count()

30

In [60]:
df[df['Location'] == 'No Location']['Location'].count()

0

In [61]:
df[df['URL'] == 'No URL']['URL'].count()

5

In [64]:
df.shape

(92, 6)

## Graphs and EDA

In [126]:
fig = px.bar(df['Location'].value_counts(), 
             title = 'Counts of Location of Restaurants',
             labels = {'Location' : 'Locations', 'value' : 'Frequency'})
fig.show()

In [24]:
df_cuisines = df.explode('Cuisines')
cuisine_counts = df_cuisines['Cuisines'].value_counts().reset_index()

cuisine_counts.columns = ['Cuisine', 'Count']

cuisine_counts = cuisine_counts.sort_values('Count', ascending=False).reset_index(drop=True)

fig1 = px.pie(cuisine_counts, names = 'Cuisine', values = 'Count', title = 'Frequency of Cuisines')

fig1.update_traces(textinfo='none')
fig1.show()

In [25]:
fig2 = px.bar(cuisine_counts, 
              x = 'Cuisine', 
              y = 'Count', 
              title = 'Frequency of Cuisines', 
              labels = {'Count' : 'Count of Cuisine'},
              text_auto= True)
fig2.update_traces(textposition='inside')
fig2.update_layout(uniformtext_minsize=8) 
fig2.show()

In [26]:
ratings_count = df['Ratings'].value_counts().reset_index()

ratings_count.columns = ['Rating', 'Count']

ratings_count = ratings_count.sort_values('Count', ascending=False).reset_index(drop=True)

fig3 = px.histogram(ratings_count, x = 'Rating', y = 'Count',
             title = 'Distribution of Ratings of Restaurants',
             labels = {'value' : 'Frequency'})
fig3.update_traces(textposition='inside')
fig3.update_layout(uniformtext_minsize=8) 
fig3.show()

In [27]:
location_ratings = df.groupby('Location')['Ratings'].agg(['mean', 'count']).reset_index()
location_ratings

Unnamed: 0,Location,mean,count
0,Avalon,7.0,1
1,Captain Cook,9.2,1
2,Chula Vista,9.9,1
3,City Heights,10.0,1
4,Clairemont,9.7,1
5,Del Mar,8.6,1
6,Dublin,7.966667,3
7,Food Truck,8.0,1
8,Hillcrest,7.1,1
9,Kailua-Kona,7.55,2


In [28]:
fig7 = px.box(df, x = 'Location', y = 'Ratings', 
            title = 'Distribution of Ratings in La Jolla')
fig7.show()

### La Jolla Data

In [29]:
la_jolla = df[df['Location'] == 'La Jolla']
la_jolla

Unnamed: 0,Name,URL,Notes,Cuisines,Location,Ratings
6,Caroline's Seaside Cafe by Giuseppe,http://carolinesseasidecafe.com/,"good sandwich, fries are top tier, and the dis...","[American, Burgers, Sandwiches]",La Jolla,9.9
8,Nozomi Sushi La Jolla,https://www.nozomilajolla.com/,No Notes,"[Japanese, Korean, Sushi]",La Jolla,9.7
13,Tous Les Jours,https://order.online/store/touslesjourslajolla...,No Notes,"[Bakery, Korean, French]",La Jolla,9.7
15,Tahini Authentic Middle Eastern Street Food,http://www.tahinistreetfood.com/,everyone knows i’m in love with this place and...,[Mediterranean],La Jolla,9.4
27,Shorehouse Kitchen,http://www.shorehousekitchen.com/,No Notes,"[Breakfast, Brunch, Californian]",La Jolla,9.2
29,Hennessey's Tavern,http://hennesseystavern.com/,No Notes,[American],La Jolla,9.2
30,AROI,http://www.aroithailajolla.com/,No Notes,[Thai],La Jolla,9.2
31,Micheline's Pita House,https://www.pitahousesd.com/,No Notes,[Mediterranean],La Jolla,9.2
36,Pho La Jolla,http://www.pholajolla.com/,No Notes,[Vietnamese],La Jolla,8.6
44,Blue Bowl Superfoods,http://mybluebowl.com/,I LOVE BLUE BOWL i stuff that shit in the bowl...,[Açaí Bowls],La Jolla,8.3


In [30]:
rating_la_jolla = la_jolla[['Ratings']]
fig4 = px.box(rating_la_jolla, y = 'Ratings', 
            title = 'Distribution of Ratings in La Jolla')
fig4.show()

In [31]:
lj_cuisines = la_jolla.explode('Cuisines')
lj_cuisines = lj_cuisines['Cuisines'].value_counts().reset_index()
lj_cuisines.columns = ['Cuisine', 'Count']



fig6 = px.bar(lj_cuisines, 
              x = 'Cuisine',
              y = 'Count',  
              title = 'Frequency of Cuisines in La Jolla', 
              labels = {'Count' : 'Count of Cuisine'},
              text_auto= True)
fig6.update_traces(textposition='inside')
fig6.update_layout(uniformtext_minsize=8) 
fig6.show()


## ML Model

### Feature Engineering

In [None]:
class RestaurantFeatureExtractor:
    def __init__(self):
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.cuisine_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.feature_names = []
        self.sentiment_analyzer = pipeline('sentiment-analysis', model = 'siebert/sentiment-roberta-large-english', return_all_scores = True)
    
    def extract_all_features(self, df):
        """Extract all features from restaurant data"""
        print('Extracting text features..')
        text_features = self._extract_text_features(df)

        print('Extracting name features..')
        name_features = self._extract_name_features(df)

        print("Extracting cuisine features..")
        cuisine_features = self._extract_cuisine_features(df)
        
        print("Extracting URL features..")
        url_features = self._extract_url_features(df)
        
        print("Creating missing data indicators..")
        missing_indicators = self._create_missing_indicators(df)

        all_features = np.hstack([
            text_features,
            name_features,
            cuisine_features,
            url_features,
            missing_indicators
        ])

        print(f'Total Feature Dimensions: {all_features.shape[1]}')
        return all_features
    
    def _extract_text_features(self, df):
        """Extract features from notes using Sentence-BERT embeddings"""
        text_embeddings = []
        text_stats = []
        for notes in df['Notes']:
            if pd.isna(notes) or str(notes).strip() == 'No Notes':
                embedding = np.zeroes(384)
                stats = [0] * 5
            else:
                notes_string = str(notes)
                embedding = self.text_model([notes_string])[0]

                sentiment = self.sentiment_analyzer(notes_string, truncation = True)[0]
                pos_score = 

                stats = [
                len(notes_string),
                len(notes_string.split()),
                
                ]
            text_embeddings.append(embedding)
            text_stats.append(stats)
        self.feature_names.extend([f'embedding_{i}' for i in range(384)])
        self.feature_names.extend(text_stats)

