In [1]:
import glob
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('ggplot')

In [2]:
# Setup some constants for use in the analysis:

# Goal is to filter out some of the chain coffee shops to only count independent shops
# This should be less of a problem with the bikeshops and breweries
BAD_COFFEE = ['donut', 'starbucks', 'dunkin', 'caribou', 'peet', 'mcdonald', 'tea']

# Population data is taken from the city wikipedia pages for cities only
POPULATIONS = {"denver": 704621, "portland": 639863, "sacramento": 495234}

# Just keeping this to filter out the file
API_CONFIG_FILE = "api_config.json"

In [3]:
# Loop through the files to collect aggregated information into a single dataframe
files = glob.glob("data/*.json")
files = [file for file in files if file != API_CONFIG_FILE]

# Set up a dataframe to collect results
columns_list = ['category', 'city', 'total', 'per_capita_rate']
results = pd.DataFrame(columns=columns_list)

# Then loop through each of the files to get the information we want
for file in files:
    name = file.split('.')[0].split('/')[1]
    cat = name.split('_')[1].lower()
    city = name.split('_')[0].lower()
    df = pd.read_json(file)

    # Looking to only keep small/independent coffee shops and not major chains
    if 'coffee' in name:
        for bad in BAD_COFFEE:
            df = df[df.name.str.lower().str.contains(bad)==False]
    
    result = {'category': [cat], 'city': [city]}
    result['total'] = [df.name.count()]
    result['per_capita_rate'] = [df.name.count() /  POPULATIONS[city]]
    pd.DataFrame.from_dict(result)
    results = pd.concat([results, pd.DataFrame.from_dict(result)])

In [4]:
results

Unnamed: 0,category,city,per_capita_rate,total
0,coffee,sacramento,0.000248,123
0,coffee,portland,0.000677,433
0,breweries,sacramento,4.2e-05,21
0,bikes,portland,0.000131,84
0,bikes,denver,7.7e-05,54
0,bikes,sacramento,6.7e-05,33
0,coffee,denver,0.000324,228
0,breweries,portland,0.000156,100
0,breweries,denver,0.000131,92


In [11]:
results_cof = results[results.category=="coffee"]
results_cof.set_index('city', inplace=True)
#results[results.category=="coffee"].per_capita_rate.plot(kind='bar')