# Starbucks Stores Analysis

In [1]:
# Housekeeping
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Datasets

Data Constraints:
- Both Starbucks and US datasets published in 2017.
- Starbucks store locations limited to US country. 
- Starbucks store limited to Starbucks brand (no Teavana)
- Exclude Puerto Rico from US datasets

In [2]:
starbucks = pd.read_csv('data/directory.csv')
starbucks = starbucks.query("Brand == 'Starbucks'").query("Country == 'US'")
starbucks = starbucks.drop(columns=["Brand", "Store Name", "Ownership Type", "Street Address","Phone Number","Timezone", "Postcode", "Country"])
starbucks = starbucks.rename(columns={'State/Province' : 'State'})

In [3]:
cities = pd.read_csv('data/uscities.csv')
cities = cities[["city", "state_id", "state_name", "county_name"]]

In [4]:
demographic = pd.read_csv('data/demo.csv', encoding='cp1252')
demographic = demographic[demographic['State'] != 'Puerto Rico']
demographic["County"] = demographic["County"].apply(lambda x: ' '.join(x.split()[0:-1]))

### Merge Data 

In [5]:
mapping = pd.merge(starbucks, cities, left_on=["City", "State"], right_on=["city", "state_id"])
mapping = mapping.drop(columns=["state_id", "city", "State"])
mapping = mapping.rename(columns={"state_name":"State", "county_name":"County"})
mapping

Unnamed: 0,Store Number,City,Longitude,Latitude,State,County
0,3513-125945,Anchorage,-149.78,61.21,Alaska,Anchorage
1,74352-84449,Anchorage,-149.84,61.14,Alaska,Anchorage
2,12449-152385,Anchorage,-149.85,61.11,Alaska,Anchorage
3,24936-233524,Anchorage,-149.89,61.13,Alaska,Anchorage
4,8973-85630,Anchorage,-149.86,61.14,Alaska,Anchorage
...,...,...,...,...,...,...
12119,22353-220004,Lander,-108.75,42.84,Wyoming,Fremont
12120,74385-87621,Laramie,-105.59,41.32,Wyoming,Albany
12121,73320-24375,Laramie,-105.56,41.31,Wyoming,Albany
12122,22425-219024,Laramie,-105.56,41.31,Wyoming,Albany


***



## Data Analysis

In [6]:
storecount = mapping.groupby(['County', 'State'])['Store Number'].count().to_frame().reset_index()
storecount = storecount.rename(columns={"Store Number":"Count"})
storecount

Unnamed: 0,County,State,Count
0,Ada,Idaho,32
1,Adair,Missouri,1
2,Adams,Colorado,62
3,Adams,Illinois,2
4,Adams,Pennsylvania,1
...,...,...,...
1026,York,Pennsylvania,10
1027,York,South Carolina,8
1028,York,Virginia,2
1029,Yuba,California,1


In [30]:
df = storecount.merge(demographic, how='right', left_on=['County', 'State'], right_on=['County', 'State']).drop(columns=["Unnamed: 0", "CountyId", "VotingAgeCitizen"])
df['Count'] = df['Count'].fillna(0)
df['Men'] = (df['Men']/df['TotalPop'])*100
df['Women'] = (df['Women']/df['TotalPop'])*100
df['Employed'] = (df['Employed']/df['TotalPop'])*100
df['Non White'] = (df['Hispanic']+df['Black']+df['Native']+df['Asian']+df['Pacific'])/5
df

Unnamed: 0,County,State,Count,TotalPop,Men,Women,Hispanic,White,Black,Native,...,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,Non White
0,Ada,Idaho,32.0,435117,50.101237,49.898763,7.9,85.2,1.2,0.4,...,2.8,6.9,20.4,49.408320,78.3,15.0,6.6,0.1,4.3,2.44
1,Adair,Missouri,1.0,25437,47.226481,52.773519,2.3,90.5,2.4,0.2,...,2.6,4.0,17.1,44.321264,73.6,20.9,5.3,0.2,5.5,1.46
2,Adams,Colorado,62.0,487850,50.392539,49.607461,39.3,51.1,3.0,0.5,...,1.1,5.0,29.2,50.517577,83.6,11.2,5.1,0.1,5.1,9.34
3,Adams,Illinois,2.0,66787,48.958630,51.041370,1.5,92.0,3.9,0.2,...,1.5,4.5,17.0,48.277359,83.1,10.6,6.1,0.1,5.5,1.30
4,Adams,Pennsylvania,1.0,101589,49.272067,50.727933,6.8,89.5,1.4,0.0,...,0.9,3.8,27.6,49.770152,83.4,10.7,5.6,0.2,4.9,1.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,Sheridan,Wyoming,0.0,29964,49.446002,50.553998,4.2,91.6,0.9,1.0,...,0.7,5.6,16.0,48.935389,71.3,22.1,6.3,0.2,3.1,1.36
3138,Sublette,Wyoming,0.0,10037,54.269204,45.730796,7.3,89.2,0.0,0.1,...,0.9,7.0,20.9,53.761084,72.6,23.9,3.4,0.2,4.2,1.50
3139,Uinta,Wyoming,0.0,20758,51.030928,48.969072,9.1,87.7,0.1,0.9,...,1.3,2.0,19.9,45.900376,71.5,21.5,6.6,0.4,6.4,2.04
3140,Washakie,Wyoming,0.0,8253,49.897007,50.102993,14.2,82.2,0.3,0.4,...,1.3,4.4,14.3,46.443717,69.8,22.0,8.1,0.2,6.1,3.00


### Correlation

In [33]:
var = ['Count', 'TotalPop', 'Men', 'Women', 'White', 'Non White',\
       'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty', 'ChildPoverty', \
       'Professional', 'Service', 'Office', 'Construction', 'Production', 'Drive', 'Carpool', 'Transit', 'Walk', \
       'OtherTransp', 'WorkAtHome', 'MeanCommute', 'Employed','PrivateWork', 'PublicWork', 'SelfEmployed', \
       'FamilyWork', 'Unemployment']
corr = df[var].corr().drop('Count')[['Count']]
corr = corr[abs(corr["Count"])>.19]
corr

Unnamed: 0,Count
TotalPop,0.896795
White,-0.202117
Non White,0.194375
IncomePerCap,0.256303
Professional,0.247451
Construction,-0.212778
Transit,0.327334


***

## Data Visualization