In [None]:
%matplotlib inline

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
plt.style.use('ggplot')

from sklearn.neighbors import DistanceMetric
from math import radians
import re
from googletrans import Translator
import plotly.express as px

In [2]:
# Load the Starbucks store directory and keep only the columns used below.
# .copy() detaches the subset from the raw frame so that adding a column
# afterwards cannot trigger pandas' SettingWithCopyWarning.
starbucks = pd.read_csv('directory.csv')
starbucks = starbucks[['store_name', 'City', 'State/Province', 'Country', 'Longitude', 'Latitude']].copy()

# Every row is one store, so a constant 1 lets later groupby(...).sum() calls
# count stores.
starbucks['store_count'] = 1

# Country lookup table; merged onto the stores via the two-letter code later on.
country_code = pd.read_csv('country_code.csv')

# World population per country (only the two columns needed for the merge).
world_population = pd.read_csv('world_population.csv')
world_population = world_population[['country', 'population']]

# GDP per country.
gdp = pd.read_csv('gdp.csv')

# Only the last expression of a cell is rendered, so the preview goes here.
starbucks.sample(6)


Unnamed: 0,store_name,City,State/Province,Country,Longitude,Latitude,store_count
21371,Bohemia,Oakdale,NY,US,-73.12,40.75,1
14413,Graton Hotel & Casino,Rohnert Park,CA,US,-122.72,38.36,1
3288,苏州久光百货店,江苏省,32,CN,120.71,31.32,1
8663,Youngin Gu-office,Geonggi-do,41,KR,127.12,37.15,1
8196,Yangcheon Hyanggyo,Seoul,11,KR,126.84,37.57,1
13602,Pavilions-Lakewood #2209,Lakewood,CA,US,-118.12,33.86,1


In [3]:
# Join the country lookup onto every store through the two-letter country code.
# The lookup's key column duplicates 'Country' after the join, so it is dropped.
starbucks2 = (
    starbucks
    .merge(country_code, left_on='Country', right_on='Two_Letter_Country_Code')
    .drop(columns='Two_Letter_Country_Code')
)
starbucks2.sample(5)

Unnamed: 0,store_name,City,State/Province,Country,Longitude,Latitude,store_count,Continent_Name,Country_Name
15779,Ralph's - Ventura #664,Ventura,CA,US,-119.21,34.26,1,North America,United States of America
20307,Harris Teeter-Boone #165,Boone,NC,US,-81.66,36.2,1,North America,United States of America
20425,Super Target-Durham ST-1872,Durham,NC,US,-78.96,35.97,1,North America,United States of America
7442,MARIER Toyama,Toyama,16,JP,137.21,36.7,1,Asia,Japan
18841,Target Lexington T-1094,Lexington,KY,US,-84.42,38.02,1,North America,United States of America


In [4]:
# Normalise country names so they can be matched against the population and
# GDP tables that are merged in the following cells.

# Keep only the text before the first comma (a name such as
# 'Korea, Republic of' becomes 'Korea'). The vectorised .str accessor leaves
# missing names as NaN instead of raising on them.
country_name = starbucks2['Country_Name'].str.split(',').str[0]

# Exact-match renames, applied after the comma split above.
COUNTRY_RENAMES = {
    'United States of America': 'United States',
    'Russian Federation': 'Russia',
    'Korea': 'South Korea',
    'Czech Republic (Czechia)': 'Czech Republic',
}
starbucks2['Country_Name'] = country_name.replace(COUNTRY_RENAMES)

# Change column names to lower case
starbucks2.columns = [name.lower() for name in starbucks2.columns]
starbucks2.sample(5)


Unnamed: 0,store_name,city,state/province,country,longitude,latitude,store_count,continent_name,country_name
3763,厦门厦禾路源通中心分店,厦门市,35,CN,118.08,24.46,1,Asia,China
442,Alameda Santos,Sao Paulo,SP,BR,-46.65,-23.57,1,South America,Brazil
25965,Rib Mountain & Highway 51,Wausau,WI,US,-89.65,44.91,1,North America,United States
17909,"Sandpoint, ID",Sandpoint,ID,US,-116.55,48.27,1,North America,United States
5531,Epping - High Street,Epping,ENG,GB,0.11,51.7,1,Europe,United Kingdom


In [5]:
# Attach each country's population to the store rows (left join keeps every
# store). Both frames carry a 'country' column, so pandas suffixes them:
# 'country_x' comes from the store table and 'country_y' from the population
# table; the latter is redundant with 'country_name' and is dropped.
starbucks3 = (
    starbucks2
    .merge(world_population, how='left', left_on='country_name', right_on='country')
    .drop(columns='country_y')
    .rename(columns={'country_x': 'code', 'continent_name': 'continent', 'country_name': 'country'})
)
starbucks3.sample(5)


Unnamed: 0,store_name,city,state/province,code,longitude,latitude,store_count,continent,country,population
14432,Target Murrieta North T-2499,Murrieta,CA,US,-117.18,33.6,1,North America,United States,331002651
21258,Harrahs Laughlin Club Cappucino,Laughlin,NV,US,-114.58,35.14,1,North America,United States,331002651
18167,Decatur-Rte 51 & Mound Rd,Decatur,IL,US,-88.96,39.89,1,North America,United States,331002651
9036,Public authority for youth and spor,Jahra,KU,KW,47.91,29.33,1,Asia,Kuwait,4270571
12620,Safeway - Bisbee #261,Bisbee,AZ,US,-109.92,31.4,1,North America,United States,331002651


In [6]:
# Attach GDP figures by country name (left join keeps every store).
# 'code' exists on both sides, so the merge produces 'code_x' (store table)
# and 'code_y' (GDP table); only the GDP table's code is kept, renamed to 'code'.
starbucks4 = (
    starbucks3
    .merge(gdp, how='left', left_on='country', right_on='country')
    .drop(columns='code_x')
    .rename(columns={'code_y': 'code'})
)
starbucks4.sample(5)


Unnamed: 0,store_name,city,state/province,longitude,latitude,store_count,continent,country,population,code,gdp_ppp
8672,Bucheon-Sangdong,Bucheon,41,126.75,37.51,1,Asia,South Korea,51269185,KOR,2220000000000.0
9318,Palmas,Mexico City,DIF,-99.22,19.43,1,North America,Mexico,128932753,MEX,2600000000000.0
3150,南通人民路金鹰店,南通市,32,120.87,32.02,1,Asia,China,1439323776,CHN,23500000000000.0
25405,Albertsons - Mt Lake Terrace #414,Mountlake Terrace,WA,-122.29,47.81,1,North America,United States,331002651,USA,21400000000000.0
2614,静安紫苑店,上海市,31,121.45,31.23,1,Asia,China,1439323776,CHN,23500000000000.0


In [7]:
# Sum the per-store counter by country, order from most to fewest stores,
# and keep the first 15 rows.
df_bar = (
    starbucks4[['country', 'store_count']]
    .groupby('country')
    .sum()
    .sort_values(by='store_count', ascending=False)
    .reset_index()
    .head(15)
)



In [None]:
# The 15 countries with the most Starbucks stores, largest first.
df_bar = (
    starbucks4[['country', 'store_count']]
    .groupby('country')
    .sum()
    .sort_values(by='store_count', ascending=False)
    .reset_index()
    .head(15)
)

# Bar chart of the ranking. The title and readable axis labels let the figure
# stand on its own when the notebook is skimmed.
fig = px.bar(
    df_bar,
    x='country',
    y='store_count',
    title='Top 15 countries by number of Starbucks stores',
    labels={'country': 'Country', 'store_count': 'Number of stores'},
)
fig.show("svg")

In [9]:
# Keep only complete rows; .copy() makes pop_df independent of starbucks4 so
# the column assignment below never touches the merged frame.
pop_df = starbucks4.dropna(axis=0).copy()

# Remove thousands separators ('78,100' -> '78100') and convert to float.
# astype(str) keeps this working when the column is already numeric, where the
# .str accessor would otherwise raise; regex=False treats the comma literally.
pop_df['population'] = (
    pop_df['population']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)


In [10]:
# One row per country: population, continent and GDP are carried along as
# grouping keys, and the per-store counter is summed into a store total.
df_scatter = (
    pop_df
    .groupby(['country', 'population', 'continent', 'gdp_ppp'], as_index=False)
    .agg({'store_count': 'sum'})
)
df_scatter.sample(5)

Unnamed: 0,country,population,continent,gdp_ppp,store_count
28,Indonesia,273523615.0,Asia,3330000000000.0,268
21,Finland,5540720.0,Europe,283000000000.0,8
32,Kazakhstan,18776707.0,Asia,508000000000.0,8
23,Germany,83783942.0,Europe,4660000000000.0,160
13,Chile,19116201.0,South America,477000000000.0,96


In [None]:
# Bubble chart of every country: population (log scale) on x, GDP on y,
# bubble size driven by the store count, colour by continent.
df = df_scatter
fig = px.scatter(
    df,
    x="population",
    y="gdp_ppp",
    size="store_count",
    color='continent',
    hover_name="country",
    log_x=True,
    size_max=50,
    # Title and readable labels let the figure stand on its own.
    title='Starbucks stores by country: population vs. GDP (PPP)',
    labels={
        'population': 'Population (log scale)',
        'gdp_ppp': 'GDP (PPP)',
        'store_count': 'Number of stores',
        'continent': 'Continent',
    },
)
fig.show()



In [None]:
# Same chart without the United States, China and India
# (presumably so the remaining countries are easier to compare - confirm intent).
EXCLUDED_COUNTRIES = ['United States', 'China', 'India']
df_scatter2 = df_scatter.loc[~df_scatter['country'].isin(EXCLUDED_COUNTRIES)]

# Graph the new dataframe
df = df_scatter2
fig = px.scatter(
    df,
    x="population",
    y="gdp_ppp",
    size="store_count",
    color='continent',
    hover_name="country",
    log_x=True,
    size_max=20,
    # Title and readable labels let the figure stand on its own.
    title='Starbucks stores by country (excluding United States, China and India)',
    labels={
        'population': 'Population (log scale)',
        'gdp_ppp': 'GDP (PPP)',
        'store_count': 'Number of stores',
        'continent': 'Continent',
    },
)
fig.show('svg')



In [None]:
# Stores in countries whose GDP (PPP) exceeds 5e9, indexed by city.
# The boolean mask is built from starbucks4, so it must also be applied to
# starbucks4: indexing a different frame with it only works while both frames
# happen to share exactly the same index.
city_df = (
    starbucks4
    .loc[starbucks4['gdp_ppp'] > 5e9]
    .dropna(axis=0)
    .set_index('city')
    # Give the most common non-Latin city names an English spelling
    .rename(index={'上海市':'Shanghai', '北京市':'Beijing','서울':'Seoul', '杭州市':'Hangzhou'})
)
city_df.shape



In [None]:
# Store totals per (city, country, continent), largest first.
# 'city' is the index of city_df; reset_index() returns a new frame rather than
# modifying city_df, so its result is used directly in the chain.
city_bar = (
    city_df
    .reset_index()
    .groupby(['city', 'country', 'continent'])
    .agg({'store_count': 'sum'})
    .sort_values(by='store_count', ascending=False)
)
city_bar

In [None]:
def avg_dist(df, radius_miles=3958.8):
    """
    Return the average great-circle distance between stores, in miles.

    df: a pandas dataframe with 'latitude' and 'longitude' columns given in
        degrees. The dataframe is not modified.
    radius_miles: radius of the sphere used to scale the haversine result;
        defaults to the Earth's mean radius in miles.

    Returns the mean haversine distance over every distinct pair of stores.
    A single store gives 0.0; an empty dataframe gives nan.
    """

    # Convert lat and lon degrees to radians on a fresh array, so the caller's
    # dataframe keeps its values in degrees
    coords = np.radians(df[['latitude', 'longitude']].to_numpy(dtype=float))
    n_stores = coords.shape[0]
    if n_stores == 0:
        return float('nan')
    if n_stores == 1:
        return 0.0

    lat = coords[:, 0]
    lon = coords[:, 1]

    # Haversine formula, broadcast over all pairs (n x n matrices)
    dlat = lat[:, None] - lat[None, :]
    dlon = lon[:, None] - lon[None, :]
    a = (
        np.sin(dlat / 2) ** 2
        + np.cos(lat)[:, None] * np.cos(lat)[None, :] * np.sin(dlon / 2) ** 2
    )
    # Clip guards against rounding pushing `a` marginally outside [0, 1]
    central_angle = 2 * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))

    # Upper triangle only: each pair counted once, self-distances excluded
    pair_rows, pair_cols = np.triu_indices(n_stores, k=1)
    return float(np.mean(central_angle[pair_rows, pair_cols]) * radius_miles)


In [None]:
# (city, country) pairs as a two-column array, in city_bar's row order.
city1 = city_bar.reset_index()[['city', 'country']].to_numpy()


In [None]:
# city_df is indexed by city. reset_index() returns a new frame (it does not
# change city_df), so the result is kept in order to filter on a 'city' column.
stores = city_df.reset_index()
city_list = []
distance_list = []

for city, country in city1:
    # All stores of this city; the country separates cities that share a name
    city_stores = stores.loc[(stores['city'] == city) & (stores['country'] == country)]

    # Hand avg_dist a copy so it can never alter the shared `stores` frame
    distance = avg_dist(city_stores.copy())

    city_list.append(city)
    distance_list.append(distance)
    


In [None]:

# Pair every city with the average distance collected for it in the loop above.
distance_df = pd.DataFrame(
    {
        'city': pd.Series(city_list),
        'avg_dist': pd.Series(distance_list),
    }
)
distance_df
    