# Starbucks Stores Analysis

In [525]:
# Housekeeping
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [526]:
# Import datasets
demoURL = 'https://storage.googleapis.com/kagglesdsdata/datasets/7001/312628/acs2017_county_data.csv?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20211112%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20211112T041604Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=36991b214fbd2eaa350a196e3ba92eedf89209b288630a3a84157a58d7ee14a3749d5b94748ee93465ebc5386cd6d5ebe9cd7a72bec9e9ddf03c7a7217e357aa7965b2060acd66834edd1147c111e61eb050a9f31f6382591ab9683434ec1949682b4703908fa4fd209def11513f8a5cde172ae68077f69131e2bdefc746ddcb0fa85c43d9c3391c76da17ae959169657cf0f833a0a3de80644ddf72c257f9a7b63a0a7b9849e579fab907b2629d36070d0eee2c17cfaa6a38c7c368846c1a5b9cf2c8c82a6a1e531a329d6176095e42098c7318901414afcc82423c7fb091ad7e172f212a59f2ec655e2f01bea8d0bdd0ab48447a81302de3f7c5379362ab4d'
popURL = 'https://www2.census.gov/programs-surveys/popest/datasets/2010-2017/cities/totals/sub-est2017_all.csv'

starbucks = pd.read_csv('data/directory.csv')
income = pd.read_csv('data/archive/kaggle_income.csv', encoding='cp1252')
population = pd.read_csv(popURL, encoding='cp1252')
demographic = pd.read_csv(demoURL, encoding='cp1252')

***

## Data Cleaning

Data Constraints:
- Both Starbucks and US datasets published in 2017.
- Starbucks store locations limited to US country. 
- Starbucks store limited to Starbucks brand (no Teavana)
- Exclude Puerto Rico from US datasets

### Starbucks Dataset

In [527]:
starbucks = starbucks.query("Brand == 'Starbucks'").query("Country == 'US'")
starbucks = starbucks.drop(columns=["Brand", "Store Name", "Ownership Type", "Street Address","Phone Number","Timezone", "Postcode", "Country"])
starbucks = starbucks.rename(columns={'State/Province' : 'State'})

### US County Income Dataset

In [548]:
income = income[["State_Name","State_ab","County","City","Median"]]

### US County Population Dataset

In [529]:
population = population.loc[population['SUMLEV'] == 162]
population = population[["NAME", "STNAME", "POPESTIMATE2017"]]
population["NAME"] = population["NAME"].apply(lambda x: ' '.join(x.split()[0:-1]))

### Merge Data 

In [530]:
data = pd.merge(income, population, left_on=["State_Name","City"], right_on=["STNAME", "NAME"]).drop(columns=["NAME","STNAME"])
data = pd.merge(starbucks, data, how='right', left_on=["State", "City"], right_on=["State_ab","City"]).drop(columns=['State_Name','State'])
data = data.rename(columns={"Longitude":"Starbucks Lon", "Latitude":"Starbucks Lat",\
                            "State_ab":"State",\
                            "POPESTIMATE2017":"Population"})
data = data[~data['Store Number'].duplicated() | data['Store Number'].isnull()]
data

Unnamed: 0,Store Number,City,Starbucks Lon,Starbucks Lat,State,County,Median,Population
0,,Chickasaw,,,AL,Mobile County,30506,5794
1,,Chickasaw,,,AL,Autauga County,20116,5794
2,,Louisville,,,AL,Barbour County,19528,468
3,,Louisville,,,AL,Autauga County,27636,468
4,,Columbiana,,,AL,Shelby County,31930,4647
...,...,...,...,...,...,...,...,...
375049,,Torrington,,,WY,Albany County,42475,6691
375050,,Clearmont,,,WY,Albany County,73921,147
375051,,Pinedale,,,WY,Albany County,94509,1890
375052,,Lyman,,,WY,Albany County,70155,2067


### US County Demographic Dataset

***



## Data Analysis

After merging, 2530 out of 13311 Starbucks store data was lost due to no income and/or population data

### Number of Starbucks per City

In [531]:
data['Location'] = data['City'] + ', ' + data['State']
storesPerCity = data.groupby('Location')['Store Number'].count()
storesPerCity = storesPerCity.to_frame().rename(columns={'Store Number':'Total'}).reset_index()

#### Cities with Starbucks (descending order)

In [532]:
hasStarbucks = storesPerCity[storesPerCity['Total'] > 0].sort_values('Total', ascending = False)
hasStarbucks

Unnamed: 0,Location,Total
4889,"New York, NY",230
1274,"Chicago, IL",179
3775,"Las Vegas, NV",153
6230,"Seattle, WA",151
3234,"Houston, TX",151
...,...,...
3794,"Lawndale, CA",1
3780,"Lauderhill, FL",1
3777,"Latrobe, PA",1
3766,"Larkspur, CA",1


#### Number of cities without Starbucks

In [533]:
noStarbucks = storesPerCity[storesPerCity["Total"] == 0]
len(noStarbucks.index)

5438

In [546]:
store_data = pd.merge(storesPerCity, data[["Location", "Median", "Population"]], on=["Location"]).drop_duplicates()
store_data.sort_values('Total', ascending = False)

Unnamed: 0,Location,Total,Median,Population
11579,"New York, NY",230,68128,8622698
2983,"Chicago, IL",179,48321,2716450
8925,"Las Vegas, NV",153,44734,641676
7559,"Houston, TX",151,57194,2312717
15793,"Seattle, WA",151,28154,724745
...,...,...,...,...
6812,"Greenville, MS",0,31224,30686
6813,"Greenville, MS",0,22030,30686
6823,"Greenville, OH",0,52711,12771
6824,"Greenville, OH",0,25856,12771


In [547]:
store_data.sort_values('Median', ascending = False)

Unnamed: 0,Location,Total,Median,Population
14334,"Ridgewood, NJ",2,229111,25692
6504,"Globe, AZ",0,226983,7356
11340,"Narberth, PA",0,224616,4353
1168,"Bay Harbor Islands, FL",0,223921,6006
2970,"Chevy Chase, MD",2,218711,3014
...,...,...,...,...
3813,"Coon Rapids, IA",0,0,1264
9754,"Los Fresnos, TX",0,0,7830
13073,"Petersburg, NE",0,0,323
12999,"Pentwater, MI",0,0,850


In [542]:
store_data['Total'].corr(store_data['Median'])

0.04130264128747358

In [541]:
store_data['Total'].corr(store_data['Population'])

0.8078199103079486

***

## Data Visualization

### Distribution of Number of Starbucks Locations per County

### Starbucks Location vs. County Income

### Starbucks Location vs. County Population