In [84]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt



## Parsing the TXT File into a List

In [85]:
# Read the raw export in one go. The encoding is pinned so the parse does not
# depend on the platform's default locale encoding (e.g. cp1252 on Windows).
# NOTE(review): assumes the export is UTF-8 — confirm against the source file.
with open('text-E1B5DBE1DEA1-1.txt', "r", encoding="utf-8") as file:
    lines = file.read()  # one str holding the whole file, despite the plural name

In [86]:
# Entries are separated by blank lines. The first and last chunks are dropped
# (presumably a header and a footer — verify against the raw file).
# `lines` is already a single string, so it is split directly.
restaurants = lines.split("\n\n")[1:-1]

## TXT File Contents to a Dictionary

In [87]:
# Column-oriented accumulator: one independent list per future DataFrame column.
restaurant_details = {field: [] for field in ('Name', 'URL', 'Notes')}

In [88]:
# Each entry is a small block of lines: name, then URL, then an optional
# third line holding the notes.
for entry in restaurants:
    fields = entry.split('\n')
    restaurant_details['Name'].append(fields[0])
    restaurant_details['URL'].append(fields[1])
    # Two-line entries carry no notes line; otherwise drop the first 7
    # characters of the third line (presumably a "Notes: " label — confirm).
    note = 'No Notes' if len(fields) == 2 else fields[2][7:]
    restaurant_details['Notes'].append(note)

## DataFrame of Restaurants and Making Columns for Location + Cuisines

In [89]:
# One row per restaurant; the columns are the dict keys (Name, URL, Notes).
df = pd.DataFrame(data=restaurant_details)

In [90]:
def cuisines_details(val):
    """Extract the cuisine list from a raw restaurant name.

    The cuisines are read from the first parenthesised group in ``val``: a
    ``', '``-separated list, optionally followed by `` in <location>``.

    Parameters
    ----------
    val : str
        Raw name, e.g. ``"So Saap (Laotian, Thai in City Heights)"``.

    Returns
    -------
    list of str or str
        The cuisines as a list, or the sentinel string ``'No Cuisine'`` when
        ``val`` contains no ``(``.
    """
    start = val.find('(')
    if start == -1:
        # Return the sentinel explicitly so callers never receive None.
        return 'No Cuisine'

    # Search for the closing parenthesis after the opening one; an unclosed
    # group runs to the end of the string instead of losing its last character.
    end = val.find(')', start)
    if end == -1:
        end = len(val)
    detail = val[start + 1:end]

    # Anything from " in " onwards is the location, not a cuisine.
    in_pos = detail.find(' in ')
    if in_pos != -1:
        detail = detail[:in_pos].strip()
    return detail.split(', ')

In [91]:
def location_details(val):
    """Extract the location from a raw restaurant name.

    The location is whatever follows `` in `` inside the first parenthesised
    group of ``val``.

    Parameters
    ----------
    val : str
        Raw name, e.g. ``"So Saap (Laotian, Thai in City Heights)"``.

    Returns
    -------
    str
        The stripped location text, or the sentinel ``'No Detail'`` when
        ``val`` has no parenthesised group or the group has no `` in `` part.
    """
    start = val.find('(')
    if start == -1:
        # Without this guard the slice below would cover the name itself, and
        # a name containing " in " would be misread as carrying a location.
        return 'No Detail'

    # An unclosed group runs to the end of the string.
    end = val.find(')', start)
    if end == -1:
        end = len(val)
    detail = val[start + 1:end]

    in_pos = detail.find(' in ')
    if in_pos == -1:
        return 'No Detail'
    # Skip the 4 characters of " in " itself.
    return detail[in_pos + 4:].strip()

In [92]:
# Derive the cuisine column from the still-unparsed name (must run before the
# name itself is trimmed).
df['Cuisines'] = df['Name'].map(cuisines_details)

In [93]:
# Derive the location column from the still-unparsed name (must run before the
# name itself is trimmed).
df['Location'] = df['Name'].map(location_details)

In [94]:
# Keep only the text in front of the parenthesised detail, minus surrounding
# whitespace. Names without a "(...)" group pass through unchanged.
name_pieces = df['Name'].str.split(r'\((.*)\)')
df['Name'] = name_pieces.str[0].str.strip()

In [95]:
# Index labels of the rows whose location could not be parsed from the name.
missing_locations = df.index[df['Location'] == 'No Detail'].tolist()

In [96]:
# Hand-entered locations, keyed by row position.
# NOTE(review): the positions presumably correspond to `missing_locations`
# and are only valid for this exact input file — verify if the export changes.
manual_locations = {
    2: 'Pleasanton',
    4: 'Chula Vista',
    24: 'Captain Cook',
    31: 'La Jolla',
    33: 'Dublin',
    38: 'Del Mar',
    39: 'San Diego',
    41: 'Dublin',
    45: 'Livermore',
    51: 'Food Truck',
    52: 'Pleasanton',
    59: 'Kailua-Kona',
    66: 'Kailua-Kona',
    69: 'Pleasanton',
    77: 'Avalon',
    78: 'San Ramon',
    79: 'Pleasanton',
    81: 'Pleasanton',
    82: 'Dublin',
    83: 'Pleasanton',
    86: 'Livermore',
}

# Write through a single .iloc call on the frame itself. The chained form
# `df['Location'].iloc[i] = ...` assigns into an intermediate Series, which is
# not guaranteed to update `df` (and never does under Copy-on-Write).
location_col = df.columns.get_loc('Location')
for position, city in manual_locations.items():
    df.iloc[position, location_col] = city

In [97]:
# Start the column as NaN rather than None so it is float64 from the outset;
# a None placeholder makes it an object column even after floats are filled in.
df['Ratings'] = np.nan

In [98]:
# Rating tiers as (first_row, last_row, rating), both ends inclusive to match
# .loc label slicing. The tiers do not overlap, so each row is written once.
rating_tiers = [
    (0, 2, 10.0),
    (3, 6, 9.9),
    (7, 13, 9.7),
    (14, 14, 9.5),
    (15, 16, 9.4),
    (17, 19, 9.3),
    (20, 32, 9.2),
    (33, 34, 8.7),
    (35, 38, 8.6),
    (39, 39, 8.5),
    (40, 43, 8.4),
    (44, 45, 8.3),
    (46, 49, 8.2),
    (50, 50, 8.1),
    (51, 52, 8.0),
    (53, 55, 7.9),
    (56, 57, 7.8),
    (58, 64, 7.7),
    (65, 65, 7.5),
    (66, 68, 7.4),
    (69, 70, 7.3),
    (71, 73, 7.2),
    (74, 75, 7.1),
    (76, 78, 7.0),
    (79, 81, 6.9),
    (82, 83, 6.8),
    (84, 85, 6.7),
    (86, 86, 6.2),
    (87, 87, 5.6),
    (88, 89, 5.0),
    (90, 90, 4.0),
    (91, 91, 3.4),
]

for first_row, last_row, score in rating_tiers:
    # Single-row tiers are addressed by label, multi-row tiers by label slice.
    rows = first_row if first_row == last_row else slice(first_row, last_row)
    df.loc[rows, 'Ratings'] = score

In [99]:
# Rows whose parsed "cuisine" is actually the place name La Jolla are relabeled
# as Mediterranean. The mask is computed once and reused on both sides.
parsed_as_la_jolla = df['Cuisines'].apply(lambda x: x == ['La Jolla'])
mediterranean = df.loc[parsed_as_la_jolla]['Cuisines'].apply(lambda _: ['Mediterranean'])
df.loc[parsed_as_la_jolla, 'Cuisines'] = mediterranean

In [100]:
# Frequency of each distinct cuisine combination. Every cell is a whole list,
# so e.g. [Japanese, Sushi] and [Japanese] are counted as separate values.
df['Cuisines'].value_counts()

Cuisines
[Pizza]                                7
[Mexican]                              5
[American]                             5
[Thai]                                 4
[Italian]                              4
[Indian]                               3
[Japanese, Sushi]                      3
[Chicken, Fast Food]                   2
[Mediterranean]                        2
[Breakfast]                            2
[Chinese]                              2
[Ramen, Japanese]                      2
[Japanese]                             2
[Ramen]                                2
[Barbecue, Bar]                        1
[Kebab, Mediterranean]                 1
[Caribbean]                            1
[Wine Bar, Italian]                    1
[American, Burgers, Fast Food]         1
[Hawaiian, Poke]                       1
[Mediterranean, Greek]                 1
[Laotian, Thai]                        1
[Sushi ,Grill and Chinese Cuisine]     1
[Breakfast, Brunch, American]          1
[Americ

In [101]:
def moving_notes(row):
    """Move a note out of the URL field when the entry has no real URL.

    A value in ``row['URL']`` that does not contain ``'http'`` is treated as a
    notes line: its first 7 characters are dropped (presumably a ``"Notes: "``
    label — confirm against the raw file), the remainder is stored in
    ``row['Notes']`` and ``row['URL']`` is set to ``'No URL'``.

    Rows already marked ``'No URL'`` are returned untouched, so applying the
    function a second time (e.g. re-running the cell) does not overwrite the
    recovered notes with an empty string.

    Parameters
    ----------
    row : mapping with ``'URL'`` and ``'Notes'`` keys
        A DataFrame row (or any dict-like); modified in place.

    Returns
    -------
    The same ``row`` object.
    """
    url_field = row['URL']
    if url_field == 'No URL' or 'http' in url_field:
        return row
    row['Notes'] = url_field[7:]
    row['URL'] = 'No URL'
    return row

In [102]:
# Row-wise pass: each row is handed to moving_notes and the returned rows are
# reassembled into the frame.
df = df.apply(moving_notes, axis='columns')

### Final Data Frame

In [103]:
# Preview the first three rows of the cleaned frame.
df.iloc[:3]

Unnamed: 0,Name,URL,Notes,Cuisines,Location,Ratings
0,So Saap,https://sosaap.com/?utm_source=google,i forgot wat spice level i ordered but it was ...,"[Laotian, Thai]",City Heights,10.0
1,Kura Revolving Sushi Bar,https://kurasushi.com/locations/sandiegocaconvoy/,i am in love. sushi was great. 10/10,"[Japanese, Sushi]",Kearny Mesa,10.0
2,Kura Revolving Sushi Bar,https://kurasushi.com/locations/pleasantonca/,"we ate good. i got snow crab this time, didnt ...","[Japanese, Sushi]",Pleasanton,10.0


In [104]:
# Persist the cleaned table: column header row included, index omitted.
df.to_csv('cleaned_beli.csv', index=False, header=True)

### Missing Values + General Data Frame Statistics

In [105]:
# Number of rows with no cuisine information. cuisines_details() can leave a
# missing value (None) for names without a parenthesised group, so nulls are
# counted together with the 'No Cuisine' sentinel.
no_cuisine = df['Cuisines'].isna() | (df['Cuisines'] == 'No Cuisine')
int(no_cuisine.sum())

0

In [106]:
# Number of restaurants that were saved without any notes.
without_notes = df['Notes'] == 'No Notes'
df.loc[without_notes, 'Notes'].count()

30

In [107]:
# Number of rows still lacking a location. 'No Detail' is the sentinel that
# location_details() writes; no code in this notebook ever produces
# 'No Location', so comparing against it could only ever yield 0.
df[df['Location'] == 'No Detail']['Location'].count()

0

In [108]:
# Number of restaurants whose entry had no URL.
without_url = df['URL'] == 'No URL'
df.loc[without_url, 'URL'].count()

5

In [109]:
# (number of rows, number of columns) of the cleaned frame.
(len(df.index), len(df.columns))

(92, 6)

## Graphs and EDA

In [74]:
# Bar chart: how many restaurants fall in each location.
location_counts = df['Location'].value_counts()
fig = px.bar(
    location_counts,
    title='Counts of Location of Restaurants',
    labels={'Location': 'Locations', 'value': 'Frequency'},
)
fig.show()

In [75]:
# One row per (restaurant, cuisine) pair, so a multi-cuisine restaurant is
# counted once under each of its cuisines.
df_cuisines = df.explode('Cuisines')

# Tally per cuisine as a two-column frame, largest count first.
cuisine_counts = df_cuisines['Cuisines'].value_counts().reset_index()
cuisine_counts.columns = ['Cuisine', 'Count']
cuisine_counts = (
    cuisine_counts
    .sort_values('Count', ascending=False)
    .reset_index(drop=True)
)

# Pie of cuisine shares; slice text is turned off.
fig1 = px.pie(
    cuisine_counts,
    names='Cuisine',
    values='Count',
    title='Frequency of Cuisines',
)
fig1.update_traces(textinfo='none')
fig1.show()

In [76]:
# Same cuisine tallies as a bar chart, with the count printed inside each bar.
fig2 = px.bar(
    cuisine_counts,
    x='Cuisine',
    y='Count',
    title='Frequency of Cuisines',
    labels={'Count': 'Count of Cuisine'},
    text_auto=True,
)
fig2.update_traces(textposition='inside')
fig2.update_layout(uniformtext_minsize=8)
fig2.show()

In [77]:
# Tally of restaurants per distinct rating, most common rating first.
ratings_count = df['Ratings'].value_counts().reset_index()
ratings_count.columns = ['Rating', 'Count']
ratings_count = (
    ratings_count
    .sort_values('Count', ascending=False)
    .reset_index(drop=True)
)

# Histogram over the rating values, with the pre-computed counts passed as y.
fig3 = px.histogram(
    ratings_count,
    x='Rating',
    y='Count',
    title='Distribution of Ratings of Restaurants',
    labels={'value': 'Frequency'},
)
fig3.update_traces(textposition='inside')
fig3.update_layout(uniformtext_minsize=8)
fig3.show()

In [80]:
# One box per location showing the spread of its ratings.
fig7 = px.box(
    df,
    x='Location',
    y='Ratings',
    title='Distribution of Ratings Per Location',
)
fig7.show()

### La Jolla Data

In [81]:
# Subset of the restaurants located in La Jolla.
in_la_jolla = df['Location'] == 'La Jolla'
la_jolla = df.loc[in_la_jolla]

In [82]:
# Single-column frame holding only the La Jolla ratings, shown as one box.
rating_la_jolla = la_jolla.loc[:, ['Ratings']]
fig4 = px.box(
    rating_la_jolla,
    y='Ratings',
    title='Distribution of Ratings in La Jolla',
)
fig4.show()

In [83]:
# Per-cuisine tally restricted to La Jolla: explode the cuisine lists so each
# cuisine of a restaurant is counted on its own.
lj_exploded = la_jolla.explode('Cuisines')
lj_cuisines = lj_exploded['Cuisines'].value_counts().reset_index()
lj_cuisines.columns = ['Cuisine', 'Count']

# Bar chart of those tallies, with the count printed inside each bar.
fig6 = px.bar(
    lj_cuisines,
    x='Cuisine',
    y='Count',
    title='Frequency of Cuisines in La Jolla',
    labels={'Count': 'Count of Cuisine'},
    text_auto=True,
)
fig6.update_traces(textposition='inside')
fig6.update_layout(uniformtext_minsize=8)
fig6.show()
