Data: 1000 restaurants for each city

* Which cities have the greatest number of restaurants per capita?
* Which cities have the greatest concentration of each cuisine (Mexican, Ethiopian, etc.)?

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import os

### Load cities

In [None]:
# Read the per-city table; the first CSV column is used as the row index
cities_csv_path = '/gh/data2/yelp/city_pop.csv'
df_cities = pd.read_csv(cities_csv_path, index_col=0)
df_cities.head()

Unnamed: 0,city,state,population
0,New York,New York,8537673
1,Los Angeles,California,3976322
2,Chicago,Illinois,2704958
3,Houston,Texas,2303482
4,Phoenix,Arizona,1615017


# Reformat dataframes of restaurants
1. Main characteristics of each restaurant
2. Binary (0/1) indicator matrix of the categories of each restaurant (mostly zeros)

In [None]:
# Load one dataframe per city from '<city>.json' under df_path_base,
# tagging every row with the city it was loaded for
all_cities = df_cities['city']
df_path_base = '/gh/data2/yelp/food_by_city/places/'
dfs_places = []
for city in all_cities:
    json_path = df_path_base + city + '.json'
    if not os.path.isfile(json_path):
        # Cities without a JSON file are reported and skipped
        print('No data for', city)
        continue
    df_temp = pd.read_json(json_path).assign(city=city)
    dfs_places.append(df_temp)

# Stack the per-city frames into one table. reset_index() gives a fresh
# 0..N-1 index and keeps the old per-file row labels as an 'index' column.
df_places = pd.concat(dfs_places).reset_index()

# Create custom columns
# Each comprehension walks the column values directly (in row order) instead
# of looking rows up one at a time with df_places.loc[i], which builds a full
# row Series per access.
df_places['all_aliases'] = [[category['alias'] for category in categories]
                            for categories in df_places['categories']]
df_places['latitude'] = [coords['latitude'] for coords in df_places['coordinates']]
df_places['longitude'] = [coords['longitude'] for coords in df_places['coordinates']]

# Cost is the length of the price value's string form
# (presumably Yelp's '$'..'$$$$' scale -- TODO confirm).
# A missing price (NaN/None) must not go through str(): len('nan') == 3 would
# silently label unpriced restaurants as cost 3. Missing prices become 0.
df_places['cost'] = [0 if pd.isnull(price) else len(str(price))
                     for price in df_places['price']]

# Flags for the transaction types listed for each restaurant
df_places['has_delivery'] = ['delivery' in transactions
                             for transactions in df_places['transactions']]
df_places['has_pickup'] = ['pickup' in transactions
                           for transactions in df_places['transactions']]

# Determine which columns to keep
cols_keep = ['id', 'name', 'city', 'rating', 'review_count', 'cost', 'all_aliases',
             'latitude', 'longitude', 'has_delivery', 'has_pickup', 'url']
df_places_final = df_places[cols_keep]

# Determine all categories and their indices
# Flatten the per-restaurant alias lists in pure Python, so a restaurant with
# an empty alias list simply contributes nothing. Sorting the unique aliases
# gives the same ordering np.unique would produce for strings.
all_categories = np.array(sorted({alias
                                  for aliases in df_places['all_aliases']
                                  for alias in aliases}))
# Map each category alias to its column position in the indicator matrix
idx_by_category = {category: col for col, category in enumerate(all_categories)}

# Make a dataframe indicating if each restaurant is each category
N_categories = len(all_categories)
matrix_categories = np.zeros((len(df_places), N_categories), dtype=int)
# enumerate() yields the positional row number, so the matrix row never
# depends on df_places' index labels; it also avoids iterrows(), which
# builds a Series for every row.
for row_pos, aliases in enumerate(df_places['all_aliases']):
    for alias_name in aliases:
        # Mark alias as present
        matrix_categories[row_pos, idx_by_category[alias_name]] = 1
df_places_categories = pd.DataFrame(matrix_categories, columns=all_categories)

# Write both tables to CSV.
# The restaurant table is the full df_places minus the list-valued
# 'all_aliases' column; the category table is the 0/1 indicator matrix.
# NOTE(review): df_places_final (the cols_keep subset) is built earlier in this
# cell but is not what gets saved here -- confirm whether the trimmed table was
# intended for df_restaurants.csv.
df_restaurants_out = df_places.drop('all_aliases', axis=1)
df_restaurants_out.to_csv('/gh/data2/yelp/food_by_city/df_restaurants.csv')
df_places_categories.to_csv('/gh/data2/yelp/food_by_city/df_categories.csv')