# Starbucks Stores Analysis

In [31]:
# Housekeeping
!pip install tensorflow
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from tensorflow.python.keras import models
from tensorflow.python.keras import layers
import math
#from sklearn.metris import accuarcy_score, confusion_matrix

Collecting tensorflow
  Downloading tensorflow-2.7.0-cp38-cp38-win_amd64.whl (430.8 MB)
Collecting libclang>=9.0.1
  Downloading libclang-12.0.0-py2.py3-none-win_amd64.whl (13.1 MB)
Collecting flatbuffers<3.0,>=1.12
  Downloading flatbuffers-2.0-py2.py3-none-any.whl (26 kB)
Collecting grpcio<2.0,>=1.24.3
  Downloading grpcio-1.42.0-cp38-cp38-win_amd64.whl (3.3 MB)
Collecting keras-preprocessing>=1.1.1
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
Collecting absl-py>=0.4.0
  Downloading absl_py-1.0.0-py3-none-any.whl (126 kB)
Collecting keras<2.8,>=2.7.0rc0
  Downloading keras-2.7.0-py2.py3-none-any.whl (1.3 MB)
Collecting tensorflow-estimator<2.8,~=2.7.0rc0
  Downloading tensorflow_estimator-2.7.0-py2.py3-none-any.whl (463 kB)
Collecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting tensorboard~=2.6
  Downloading tensorboard-2.7.0-py3-none-any.whl (5.8 MB)
Collecting protobuf>=3.9.2
  Downloading protobuf-3.19.1-cp38-cp3

## Datasets

Data Constraints:
- Both Starbucks and US datasets published in 2017.
- Starbucks store locations limited to US country. 
- Starbucks store limited to Starbucks brand (no Teavana)
- Exclude Puerto Rico from US datasets

In [19]:
starbucks = pd.read_csv('data/directory.csv')
starbucks = starbucks.query("Brand == 'Starbucks'").query("Country == 'US'")
starbucks = starbucks.drop(columns=["Brand", "Store Name", "Ownership Type", "Street Address","Phone Number","Timezone", "Postcode", "Country"])
starbucks = starbucks.rename(columns={'State/Province' : 'State'})

In [20]:
cities = pd.read_csv('data/uscities.csv')
cities = cities[["city", "state_id", "state_name", "county_name"]]

In [21]:
demographic = pd.read_csv('data/demo.csv', encoding='cp1252')
demographic = demographic[demographic['State'] != 'Puerto Rico']
demographic["County"] = demographic["County"].apply(lambda x: ' '.join(x.split()[0:-1]))

### Merge Data 

In [22]:
mapping = pd.merge(starbucks, cities, left_on=["City", "State"], right_on=["city", "state_id"])
mapping = mapping.drop(columns=["state_id", "city", "State"])
mapping = mapping.rename(columns={"state_name":"State", "county_name":"County"})
mapping

Unnamed: 0,Store Number,City,Longitude,Latitude,State,County
0,3513-125945,Anchorage,-149.78,61.21,Alaska,Anchorage
1,74352-84449,Anchorage,-149.84,61.14,Alaska,Anchorage
2,12449-152385,Anchorage,-149.85,61.11,Alaska,Anchorage
3,24936-233524,Anchorage,-149.89,61.13,Alaska,Anchorage
4,8973-85630,Anchorage,-149.86,61.14,Alaska,Anchorage
...,...,...,...,...,...,...
12119,22353-220004,Lander,-108.75,42.84,Wyoming,Fremont
12120,74385-87621,Laramie,-105.59,41.32,Wyoming,Albany
12121,73320-24375,Laramie,-105.56,41.31,Wyoming,Albany
12122,22425-219024,Laramie,-105.56,41.31,Wyoming,Albany


***



## Data Analysis

In [23]:
storecount = mapping.groupby(['County', 'State'])['Store Number'].count().to_frame().reset_index()
storecount = storecount.rename(columns={"Store Number":"Count"})
storecount

Unnamed: 0,County,State,Count
0,Ada,Idaho,32
1,Adair,Missouri,1
2,Adams,Colorado,62
3,Adams,Illinois,2
4,Adams,Pennsylvania,1
...,...,...,...
1026,York,Pennsylvania,10
1027,York,South Carolina,8
1028,York,Virginia,2
1029,Yuba,California,1


In [24]:
df = storecount.merge(demographic, how='right', left_on=['County', 'State'], right_on=['County', 'State']).drop(columns=["Unnamed: 0", "CountyId", "VotingAgeCitizen"])
df['Count'] = df['Count'].fillna(0)
df['Men'] = (df['Men']/df['TotalPop'])*100
df['Women'] = (df['Women']/df['TotalPop'])*100
df['Employed'] = (df['Employed']/df['TotalPop'])*100
df['Non White'] = (df['Hispanic']+df['Black']+df['Native']+df['Asian']+df['Pacific'])
df

Unnamed: 0,County,State,Count,TotalPop,Men,Women,Hispanic,White,Black,Native,...,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,Non White
0,Ada,Idaho,32.0,435117,50.101237,49.898763,7.9,85.2,1.2,0.4,...,2.8,6.9,20.4,49.408320,78.3,15.0,6.6,0.1,4.3,12.2
1,Adair,Missouri,1.0,25437,47.226481,52.773519,2.3,90.5,2.4,0.2,...,2.6,4.0,17.1,44.321264,73.6,20.9,5.3,0.2,5.5,7.3
2,Adams,Colorado,62.0,487850,50.392539,49.607461,39.3,51.1,3.0,0.5,...,1.1,5.0,29.2,50.517577,83.6,11.2,5.1,0.1,5.1,46.7
3,Adams,Illinois,2.0,66787,48.958630,51.041370,1.5,92.0,3.9,0.2,...,1.5,4.5,17.0,48.277359,83.1,10.6,6.1,0.1,5.5,6.5
4,Adams,Pennsylvania,1.0,101589,49.272067,50.727933,6.8,89.5,1.4,0.0,...,0.9,3.8,27.6,49.770152,83.4,10.7,5.6,0.2,4.9,8.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,Sheridan,Wyoming,0.0,29964,49.446002,50.553998,4.2,91.6,0.9,1.0,...,0.7,5.6,16.0,48.935389,71.3,22.1,6.3,0.2,3.1,6.8
3138,Sublette,Wyoming,0.0,10037,54.269204,45.730796,7.3,89.2,0.0,0.1,...,0.9,7.0,20.9,53.761084,72.6,23.9,3.4,0.2,4.2,7.5
3139,Uinta,Wyoming,0.0,20758,51.030928,48.969072,9.1,87.7,0.1,0.9,...,1.3,2.0,19.9,45.900376,71.5,21.5,6.6,0.4,6.4,10.2
3140,Washakie,Wyoming,0.0,8253,49.897007,50.102993,14.2,82.2,0.3,0.4,...,1.3,4.4,14.3,46.443717,69.8,22.0,8.1,0.2,6.1,15.0


### Correlation

In [25]:
var = ['Count', 'TotalPop', 'Men', 'Women', 'White', 'Non White',\
       'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty', 'ChildPoverty', \
       'Professional', 'Service', 'Office', 'Construction', 'Production', 'Drive', 'Carpool', 'Transit', 'Walk', \
       'OtherTransp', 'WorkAtHome', 'MeanCommute', 'Employed','PrivateWork', 'PublicWork', 'SelfEmployed', \
       'FamilyWork', 'Unemployment']
corr = df[var].corr().drop('Count')[['Count']]
corr = corr[abs(corr["Count"])>.19]
corr

Unnamed: 0,Count
TotalPop,0.896795
White,-0.202117
Non White,0.194375
IncomePerCap,0.256303
Professional,0.247451
Construction,-0.212778
Transit,0.327334


### Preprocessing

***

In [None]:
features = df[["TotalPop","White","Non White","IncomePerCap","Professional","Construction","Transit"]]
features.loc[:,"White"] = -1*features.loc[:,"White"]
features.loc[:,"Construction"] = -1*features.loc[:,"Construction"]
features = features.apply(lambda x: stats.zscore(x))
target = df[["Count"]]
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

r2s = [0,0,0,0,0,0,0,0,0,0]
i = 1
while i < 11:
    j = 0
    r_sq = 0
    while j < 10:
        x_train_temp, x_test_temp, y_train_temp, y_test_temp = train_test_split(x_train, y_train, test_size=0.1)
        poly_model = make_pipeline(PolynomialFeatures(i), LinearRegression())
        poly_model.fit(x_train_temp, y_train_temp)
        predicts = poly_model.predict(x_test_temp)
        predicts = pd.DataFrame(predicts, columns = ['Prediction'])
        predicts.loc[:,"Prediction"] = predicts["Prediction"].apply(lambda x: 0 if x < 0 else math.floor(x))
        r_sq = r_sq + r2_score(predicts.loc[:,"Prediction"].to_numpy(),y_test_temp.loc[:,"Count"].to_numpy())
        j = j + 1
    r_sq = r_sq/10
    r2s[i-1] = r_sq
    i=i+1
print(r2s)
max_r2 = max(r2s)
best_degree = r2s.index(max_r2)+1
print("Best degree for polynomial:")
print(best_degree)

### Neural Network

In [None]:
features = df[["TotalPop","White","Non White","IncomePerCap","Professional","Construction","Transit"]]
features.loc[:,"White"] = -1*features.loc[:,"White"]
features.loc[:,"Construction"] = -1*features.loc[:,"Construction"]
features = features.apply(lambda x: stats.zscore(x))
target = df[["Count"]]
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(7,)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(1, activation='softmax'))
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
model.fit(x_train, y_train, epochs = 100, batch_size=32)
results = model.evaluate(x_test, y_test)
results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
 1/79 [..............................] - ETA: 0s - loss: 422.0312 - accuracy: 0.0625

## Data Visualization