# Starbucks Stores Analysis

In [92]:
# Housekeeping
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

## Datasets

Data Constraints:
- Both Starbucks and US datasets published in 2017.
- Starbucks store locations limited to US country. 
- Starbucks store limited to Starbucks brand (no Teavana)
- Exclude Puerto Rico from US datasets

In [93]:
starbucks = pd.read_csv('data/directory.csv')
starbucks = starbucks.query("Brand == 'Starbucks'").query("Country == 'US'")
starbucks = starbucks.drop(columns=["Brand", "Store Name", "Ownership Type", "Street Address","Phone Number","Timezone", "Postcode", "Country"])
starbucks = starbucks.rename(columns={'State/Province' : 'State'})

In [94]:
cities = pd.read_csv('data/uscities.csv')
cities = cities[["city", "state_id", "state_name", "county_name"]]

In [95]:
demographic = pd.read_csv('data/demo.csv', encoding='cp1252')
demographic = demographic[demographic['State'] != 'Puerto Rico']
demographic["County"] = demographic["County"].apply(lambda x: ' '.join(x.split()[0:-1]))

### Data Clean Up

In [96]:
mapping = pd.merge(starbucks, cities, left_on=["City", "State"], right_on=["city", "state_id"])
mapping = mapping.drop(columns=["state_id", "city", "State"])
mapping = mapping.rename(columns={"state_name":"State", "county_name":"County"})

In [97]:
storecount = mapping.groupby(['County', 'State'])['Store Number'].count().to_frame().reset_index()
storecount = storecount.rename(columns={"Store Number":"Count"})

In [98]:
df = storecount.merge(demographic, how='right', left_on=['County', 'State'], right_on=['County', 'State']).drop(columns=["Unnamed: 0", "CountyId", "VotingAgeCitizen"])
df['Count'] = df['Count'].fillna(0)
df['Men'] = (df['Men']/df['TotalPop'])*100
df['Women'] = (df['Women']/df['TotalPop'])*100
df['Employed'] = (df['Employed']/df['TotalPop'])*100

### Correlation

In [99]:
var = ['Count', 'TotalPop', 'Men', 'Women', 'Hispanic',
       'White', 'Black', 'Native', 'Asian', 'Pacific', 'Income', 'IncomeErr',
       'IncomePerCap', 'IncomePerCapErr', 'Poverty', 'ChildPoverty',
       'Professional', 'Service', 'Office', 'Construction', 'Production',
       'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome',
       'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork', 'SelfEmployed',
       'FamilyWork', 'Unemployment']
corr = df[var].corr().drop('Count')[['Count']]
best_corr = corr[abs(corr["Count"])>.20]
print(best_corr)

                 Count
TotalPop      0.896795
White        -0.202117
Asian         0.450766
Income        0.233709
IncomePerCap  0.256303
Professional  0.247451
Construction -0.212778
Transit       0.327334


### Preprocessing

In [100]:
# Train test split
feature_names = ['TotalPop', 'Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific', 'IncomePerCap', 'Poverty',\
                 'Professional', 'Service', 'Office', 'Construction', 'Production', 'Drive', 'Carpool', 'Transit',\
                 'Walk', 'OtherTransp']
features = df[feature_names].fillna(0)
features = features.apply(lambda x: stats.zscore(x))
target = df[["Count"]].fillna(0)
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

In [101]:
# Cross validation
for i in range(1,5):
    x_train_temp, x_test_temp, y_train_temp, y_test_temp = train_test_split(x_train, y_train, test_size=0.2, random_state=i)
    poly_model = make_pipeline(PolynomialFeatures(i), LinearRegression())
    d = pd.DataFrame(cross_validate(poly_model, x_train_temp, y_train_temp, cv=5, scoring=('r2', 'neg_mean_squared_error')))
    print(d.mean())

fit_time                        0.005454
score_time                      0.003966
test_r2                         0.760518
test_neg_mean_squared_error   -78.506952
dtype: float64
fit_time                         0.019534
score_time                       0.003925
test_r2                          0.422705
test_neg_mean_squared_error   -114.892441
dtype: float64
fit_time                       6.657817e-01
score_time                     8.738041e-03
test_r2                       -3.173584e+03
test_neg_mean_squared_error   -1.902187e+06
dtype: float64
fit_time                       1.150140e+00
score_time                     2.017798e-02
test_r2                       -8.768581e+03
test_neg_mean_squared_error   -3.235406e+06
dtype: float64


### Linear Regression

In [102]:
model = LinearRegression()
model.fit(x_train, y_train)

map_ = np.vectorize(lambda x: 0 if x < 0 else math.floor(x))
y_pred = np.array([map_(y) for y in model.predict(x_test)])

print(r2_score(y_test, y_pred))

0.6306102723265778


## Data Visualization

In [103]:
def normalize(df, columns):
    result = df[columns]
    result = (result - result.mean())/result.std()
    return result

data = df[["TotalPop","White","Non White","IncomePerCap","Professional","Construction","Transit"]]
columns = ["TotalPop","White","Non White","IncomePerCap","Professional","Construction","Transit"]
data = normalize(data, columns)
data = data.join(df['Count'])

# Feature = TotalPop
data.plot.scatter(x="TotalPop", y="Count",s=1)
plt.ylabel("# of Starbucks")
plt.xlabel("Total Population")
plt.title("Number of Starbucks by Feature")

# Feature = White Population
data.plot.scatter(x="White", y="Count",s=1)
plt.ylabel("# of Starbucks")
plt.xlabel("Proportiona of Population that is White")
plt.title("Number of Starbucks by Feature")

# Feature = Non White Population
data.plot.scatter(x="Non White", y="Count",s=1)
plt.ylabel("# of Starbucks")
plt.xlabel("Proportion of Population that is Non White")
plt.title("Number of Starbucks by Feature")

# Feature = Income Per Capita
data.plot.scatter(x="IncomePerCap", y="Count",s=1)
plt.ylabel("# of Starbucks")
plt.xlabel("Income Per Capita")
plt.title("Number of Starbucks by Feature")

# Feature = Professional Population
data.plot.scatter(x="TotalPop", y="Count",s=1)
plt.ylabel("# of Starbucks")
plt.xlabel("Proportion of Population that are Professionals")
plt.title("Number of Starbucks by Feature")

# Feature = Construction Population
data.plot.scatter(x="Construction", y="Count",s=1)
plt.ylabel("# of Starbucks")
plt.xlabel("Proportion of Population that work in Construction")
plt.title("Number of Starbucks by Feature")

# Feature = White Population
data.plot.scatter(x="Transit", y="Count",s=1)
plt.ylabel("# of Starbucks")
plt.xlabel("Proportion of Population that work in Transit")
plt.title("Number of Starbucks by Feature")

KeyError: "['Non White'] not in index"