# Starbucks Stores Analysis

In [73]:
# Housekeeping
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [74]:
# Import datasets
popURL = 'https://www2.census.gov/programs-surveys/popest/datasets/2010-2017/cities/totals/sub-est2017_all.csv'

starbucks = pd.read_csv('data/directory.csv')
income = pd.read_csv('data/archive/kaggle_income.csv', encoding='cp1252')
population = pd.read_csv(popURL, encoding='cp1252')
demographic = pd.read_csv('data/demo.csv', encoding='cp1252')
cities = pd.read_csv('data/uscities.csv')

***

## Data Cleaning

Data Constraints:
- Both Starbucks and US datasets published in 2017.
- Starbucks store locations limited to US country. 
- Starbucks store limited to Starbucks brand (no Teavana)
- Exclude Puerto Rico from US datasets

### Starbucks Dataset

In [75]:
starbucks = starbucks.query("Brand == 'Starbucks'").query("Country == 'US'")
starbucks = starbucks.drop(columns=["Brand", "Store Name", "Ownership Type", "Street Address","Phone Number","Timezone", "Postcode", "Country"])
starbucks = starbucks.rename(columns={'State/Province' : 'State'})
starbucks

Unnamed: 0,Store Number,City,State,Longitude,Latitude
11964,3513-125945,Anchorage,AK,-149.78,61.21
11965,74352-84449,Anchorage,AK,-149.84,61.14
11966,12449-152385,Anchorage,AK,-149.85,61.11
11967,24936-233524,Anchorage,AK,-149.89,61.13
11968,8973-85630,Anchorage,AK,-149.86,61.14
...,...,...,...,...,...
25567,74385-87621,Laramie,WY,-105.59,41.32
25568,73320-24375,Laramie,WY,-105.56,41.31
25569,22425-219024,Laramie,WY,-105.56,41.31
25570,10849-103163,Rock Springs,WY,-109.25,41.58


### US County Income Dataset

In [76]:
# income = income[["State_Name","State_ab","County","City","Median"]]
income

Unnamed: 0,id,State_Code,State_Name,State_ab,County,City,Place,Type,Primary,Zip_Code,Area_Code,ALand,AWater,Lat,Lon,Mean,Median,Stdev,sum_w
0,1011000,1,Alabama,AL,Mobile County,Chickasaw,Chickasaw city,City,place,36611,251,10894952,909156,30.771450,-88.079697,38773,30506,33101,1638.260513
1,1011010,1,Alabama,AL,Barbour County,Louisville,Clio city,City,place,36048,334,26070325,23254,31.708516,-85.611039,37725,19528,43789,258.017685
2,1011020,1,Alabama,AL,Shelby County,Columbiana,Columbiana city,City,place,35051,205,44835274,261034,33.191452,-86.615618,54606,31930,57348,926.031000
3,1011030,1,Alabama,AL,Mobile County,Satsuma,Creola city,City,place,36572,251,36878729,2374530,30.874343,-88.009442,63919,52814,47707,378.114619
4,1011040,1,Alabama,AL,Mobile County,Dauphin Island,Dauphin Island,Town,place,36528,251,16204185,413605152,30.250913,-88.171268,77948,67225,54270,282.320328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32521,720296,72,Puerto Rico,PR,Adjuntas Municipio,Guaynabo,Adjuntas,Track,Track,970,787,589417,1691,18.397925,-66.130633,30649,13729,37977,1321.278082
32522,7202966,72,Puerto Rico,PR,Adjuntas Municipio,Aguada,Adjuntas,Track,Track,602,787,1801613,795887,18.385424,-67.203310,15520,9923,15541,238.813450
32523,7202976,72,Puerto Rico,PR,Adjuntas Municipio,Aguada,Adjuntas,Track,Track,602,787,11031227,0,18.356565,-67.180686,41933,34054,31539,313.551070
32524,7202986,72,Puerto Rico,PR,Adjuntas Municipio,Aguada,Adjuntas,Track,Track,602,787,0,33597561,18.412041,-67.213413,0,0,0,0.000000


### US County Population Dataset

In [77]:
population = population.loc[population['SUMLEV'] == 162]
population = population[["NAME", "STNAME", "POPESTIMATE2017"]]
population["NAME"] = population["NAME"].apply(lambda x: ' '.join(x.split()[0:-1]))

### US City <--> County Mapping

In [78]:
cities = cities[["city", "state_id", "state_name", "county_name"]]
cities

Unnamed: 0,city,state_id,state_name,county_name
0,New York,NY,New York,New York
1,Los Angeles,CA,California,Los Angeles
2,Chicago,IL,Illinois,Cook
3,Miami,FL,Florida,Miami-Dade
4,Dallas,TX,Texas,Dallas
...,...,...,...,...
28333,Gross,NE,Nebraska,Boyd
28334,Lotsee,OK,Oklahoma,Tulsa
28335,The Ranch,MN,Minnesota,Mahnomen
28336,Shamrock,OK,Oklahoma,Creek


### US County Demographic Dataset

In [79]:
demographic

Unnamed: 0.1,Unnamed: 0,CountyId,State,County,TotalPop,Men,Women,Hispanic,White,Black,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,0,1001,Alabama,Autauga County,55036,26899,28137,2.7,75.4,18.9,...,0.6,1.3,2.5,25.8,24112,74.1,20.2,5.6,0.1,5.2
1,1,1003,Alabama,Baldwin County,203360,99527,103833,4.4,83.1,9.5,...,0.8,1.1,5.6,27.0,89527,80.7,12.9,6.3,0.1,5.5
2,2,1005,Alabama,Barbour County,26201,13976,12225,4.2,45.7,47.8,...,2.2,1.7,1.3,23.4,8878,74.1,19.1,6.5,0.3,12.4
3,3,1007,Alabama,Bibb County,22580,12251,10329,2.4,74.6,22.0,...,0.3,1.7,1.5,30.0,8171,76.0,17.4,6.3,0.3,8.2
4,4,1009,Alabama,Blount County,57667,28490,29177,9.0,87.4,1.5,...,0.4,0.4,2.1,35.0,21380,83.9,11.9,4.0,0.1,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,3215,72145,Puerto Rico,Vega Baja Municipio,54754,26269,28485,96.7,3.1,0.1,...,1.4,0.6,0.9,31.6,14234,76.2,19.3,4.3,0.2,16.8
3216,3216,72147,Puerto Rico,Vieques Municipio,8931,4351,4580,95.7,4.0,0.0,...,5.0,0.0,1.7,14.9,2927,40.7,40.9,18.4,0.0,12.8
3217,3217,72149,Puerto Rico,Villalba Municipio,23659,11510,12149,99.7,0.2,0.1,...,2.1,0.0,2.8,28.4,6873,59.2,30.2,10.4,0.2,24.8
3218,3218,72151,Puerto Rico,Yabucoa Municipio,35025,16984,18041,99.9,0.1,0.0,...,1.4,1.8,0.1,30.5,7878,62.7,30.9,6.3,0.0,25.4


### Merge Data 

In [80]:
#data = pd.merge(income, population, left_on=["State_Name","City"], right_on=["STNAME", "NAME"]).drop(columns=["NAME","STNAME"])
#data = pd.merge(starbucks, data, how='right', left_on=["State", "City"], right_on=["State_ab","City"]).drop(columns=['State_Name','State'])
#data = data.rename(columns={"Longitude":"Starbucks Lon", "Latitude":"Starbucks Lat",\
#                            "State_ab":"State",\
#                            "POPESTIMATE2017":"Population"})
#data = data[~data['Store Number'].duplicated() | data['Store Number'].isnull()]
#data

mapping = demographic
mapping["County"] = mapping["County"].apply(lambda x: ' '.join(x.split()[0:-1]))
mapping = pd.merge(demographic, cities, left_on=["County"], right_on=["county_name"]).drop(columns=["state_name"]).drop_duplicates()
data = pd.merge(starbucks, mapping, how='right', left_on=["State", "City"], right_on=["state_id","city"]).drop(columns=['state_id','city','county_name'])
data = data.rename(columns={"Longitude":"Starbucks Lon", "Latitude":"Starbucks Lat",\
                            "State_ab":"State",\
                            "POPESTIMATE2017":"Population"})
data = data[~data['Store Number'].duplicated() | data['Store Number'].isnull()]
data

Unnamed: 0.1,Store Number,City,State_x,Starbucks Lon,Starbucks Lat,Unnamed: 0,CountyId,State_y,County,TotalPop,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,3513-125945,Anchorage,AK,-149.78,61.21,69,2020,Alaska,Anchorage,298225,...,3.4,3.5,3.8,18.9,152751,73.2,21.1,5.5,0.1,5.8
1,74352-84449,Anchorage,AK,-149.84,61.14,69,2020,Alaska,Anchorage,298225,...,3.4,3.5,3.8,18.9,152751,73.2,21.1,5.5,0.1,5.8
2,12449-152385,Anchorage,AK,-149.85,61.11,69,2020,Alaska,Anchorage,298225,...,3.4,3.5,3.8,18.9,152751,73.2,21.1,5.5,0.1,5.8
3,24936-233524,Anchorage,AK,-149.89,61.13,69,2020,Alaska,Anchorage,298225,...,3.4,3.5,3.8,18.9,152751,73.2,21.1,5.5,0.1,5.8
4,8973-85630,Anchorage,AK,-149.86,61.14,69,2020,Alaska,Anchorage,298225,...,3.4,3.5,3.8,18.9,152751,73.2,21.1,5.5,0.1,5.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168419,,,,,,3218,72151,Puerto Rico,Yabucoa,35025,...,1.4,1.8,0.1,30.5,7878,62.7,30.9,6.3,0.0,25.4
168420,,,,,,3218,72151,Puerto Rico,Yabucoa,35025,...,1.4,1.8,0.1,30.5,7878,62.7,30.9,6.3,0.0,25.4
168421,,,,,,3218,72151,Puerto Rico,Yabucoa,35025,...,1.4,1.8,0.1,30.5,7878,62.7,30.9,6.3,0.0,25.4
168422,,,,,,3219,72153,Puerto Rico,Yauco,37585,...,1.7,0.1,5.0,24.4,8995,66.4,28.7,5.0,0.0,24.0


***



## Data Analysis

After merging, 2530 out of 13311 Starbucks store data was lost due to no income and/or population data

In [56]:
data[~data['Store Number'].isnull()]

Unnamed: 0,Store Number,City,Starbucks Lon,Starbucks Lat,id,State_Code,State,County,Place,Type,...,Area_Code,ALand,AWater,Lat,Lon,Mean,Median,Stdev,sum_w,Population
8,17221-169985,Cullman,-86.84,34.15,1011050,1,AL,Cullman County,Dodge City,Town,...,256,8913021,26837,34.045414,-86.882670,50715,42643,35886,173.325959,15385
26,23672-230991,Gadsden,-86.08,33.99,1011110,1,AL,Etowah County,Gadsden city,City,...,256,96637272,3060329,34.009563,-86.015643,42251,28811,41397,10716.139237,35409
66,19891-192724,Mobile,-88.18,30.70,1011280,1,AL,Mobile County,Mobile city,City,...,251,361044263,105325210,30.668426,-88.100226,53330,38231,50855,51243.648369,190265
67,9970-97273,Mobile,-88.12,30.68,1011280,1,AL,Mobile County,Mobile city,City,...,251,361044263,105325210,30.668426,-88.100226,53330,38231,50855,51243.648369,190265
68,11673-105401,Mobile,-88.12,30.67,1011280,1,AL,Mobile County,Mobile city,City,...,251,361044263,105325210,30.668426,-88.100226,53330,38231,50855,51243.648369,190265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427522,21239-210245,Casper,-106.33,42.86,5602364,56,WY,Albany County,Laramie city,Track,...,307,2836752,0,42.844880,-106.309837,62251,54419,42997,480.343657,57814
427523,9619-97642,Casper,-106.26,42.85,5602364,56,WY,Albany County,Laramie city,Track,...,307,2836752,0,42.844880,-106.309837,62251,54419,42997,480.343657,57814
427524,79668-108627,Casper,-106.27,42.85,5602364,56,WY,Albany County,Laramie city,Track,...,307,2836752,0,42.844880,-106.309837,62251,54419,42997,480.343657,57814
427525,11862-103659,Casper,-106.37,42.82,5602364,56,WY,Albany County,Laramie city,Track,...,307,2836752,0,42.844880,-106.309837,62251,54419,42997,480.343657,57814


### Number of Starbucks per City

In [57]:
data['Location'] = data['City'] + ', ' + data['State']
storesPerCity = data.groupby('Location')['Store Number'].count()
storesPerCity = storesPerCity.to_frame().rename(columns={'Store Number':'Total'}).reset_index()

#### Cities with Starbucks (descending order)

In [58]:
hasStarbucks = storesPerCity[storesPerCity['Total'] > 0].sort_values('Total', ascending = False)
hasStarbucks

Unnamed: 0,Location,Total
5173,"New York, NY",230
1340,"Chicago, IL",179
3993,"Las Vegas, NV",153
3421,"Houston, TX",151
6607,"Seattle, WA",151
...,...,...
3981,"Larchmont, NY",1
3975,"Lapeer, MI",1
3968,"Lansdale, PA",1
3962,"Landrum, SC",1


#### Number of cities without Starbucks

In [59]:
noStarbucks = storesPerCity[storesPerCity["Total"] == 0]
len(noStarbucks.index)

5806

In [60]:
store_data = pd.merge(storesPerCity, data[["Location", "Median", "Population"]], on=["Location"]).drop_duplicates()
store_data.sort_values('Total', ascending = False)

Unnamed: 0,Location,Total,Median,Population
12247,"New York, NY",230,68128,8622698
3141,"Chicago, IL",179,48321,2716450
9431,"Las Vegas, NV",153,44734,641676
8002,"Houston, TX",151,300000,2312717
16648,"Seattle, WA",151,28154,724745
...,...,...,...,...
7187,"Greenville, MO",0,53626,485
7188,"Greenville, MS",0,12655,30686
7189,"Greenville, MS",0,42369,30686
7190,"Greenville, MS",0,56999,30686


In [61]:
store_data.sort_values('Median', ascending = False)

Unnamed: 0,Location,Total,Median,Population
14512,"Portland, TN",0,300000,12697
17374,"Spring Lake Heights, NJ",0,300000,4618
19931,"Wyoming, OH",0,300000,8536
4176,"Covington, WA",4,300000,20916
7803,"Highland Park, NJ",0,300000,14164
...,...,...,...,...
17009,"Sikeston, MO",0,0,16155
10635,"Manteo, NC",0,0,1435
13623,"Patterson, LA",0,0,5982
6506,"Fulton, MO",0,0,12844


In [62]:
store_data['Total'].corr(store_data['Median'])

0.04472559925275821

In [63]:
store_data['Total'].corr(store_data['Population'])

0.8099224080130096

***

## Data Visualization

### Distribution of Number of Starbucks Locations per County

### Starbucks Location vs. County Income

### Starbucks Location vs. County Population