In [None]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
plt.style.use('ggplot')

from sklearn.neighbors import DistanceMetric
from math import radians
import re
from googletrans import Translator

In [119]:
# Read the directory.csv file into a data frame
starbucks = pd.read_csv('directory.csv')
starbucks= starbucks[['store_name','City', 'State/Province','Country','Longitude', 'Latitude']]


#Create a new column for store count
starbucks['store_count'] = np.ones(starbucks.shape[0]).astype(int).tolist()
starbucks.sample(5)

#Read the country code file into the dateframe.
country_code = pd.read_csv('country_code.csv')
country_code

#Dataframe for the world population
world_population = pd.read_csv('world_population.csv')
world_population = world_population[['country', 'population']]
world_population.loc[world_population.country=='United States']

#DataFrame for the gdp
gdp=pd.read_csv('gdp.csv')


In [118]:
#Merge the starbucks data frame and the country code
starbucks2= pd.merge(starbucks,country_code, left_on='Country', right_on='Two_Letter_Country_Code').drop('Two_Letter_Country_Code', axis=1)


In [5]:
#Modify the name of some countries, removing 'the, republic of'
country_name = starbucks2['Country_Name'].tolist()

country_name = [country.split(',')[0] for country in country_name]

#Assign the modified values in the country_name column
starbucks2['Country_Name'] = country_name


#Change united states of america to united states
starbucks2['Country_Name']= starbucks2['Country_Name'].apply(lambda x : 'United States' if x=='United States of America' else x)

#Change Russian Federation to Russia
starbucks2['Country_Name']= starbucks2['Country_Name'].apply(lambda x : 'Russia' if x=='Russian Federation' else x)


starbucks2['Country_Name']= starbucks2['Country_Name'].apply(lambda x : 'South Korea' if x=='Korea' else x)

#Change Russian Federation to Russia
starbucks2['Country_Name']= starbucks2['Country_Name'].apply(lambda x : 'Czech Republic' if x=='Czech Republic (Czechia)' else x)


# Change column names to lower case
starbucks2.columns = [x.lower()  for x in starbucks2.columns]


In [117]:
#Merge world population data with starbucks data 
starbucks3 = pd.merge(starbucks2, world_population, how='left', left_on='country_name', right_on='country')
starbucks3.drop('country_y', axis=1, inplace=True)

#Rename some columns
starbucks3.rename(columns={'country_x':'code', 'continent_name':'continent', 'country_name':'country'}, inplace=True)

In [116]:
#Merge gdp data with starbucks data 
starbucks4 = pd.merge(starbucks3, gdp, how='left', left_on='country', right_on='country')
starbucks4.drop('code_x', axis=1, inplace=True)


In [112]:
#Top 15 countries in terms of starbucks store count
df_bar= starbucks4[['country','store_count']].groupby('country').sum().sort_values( ascending=False, by='store_count').reset_index().head(15)



In [125]:
import plotly.express as px

fig = px.bar(df_bar, x='country', y ='store_count')
fig.show()

In [113]:
# #Remove the comma in the population 78,100 -->78100
pop_df= starbucks4.copy()
pop_df.dropna(axis=0, inplace=True)
pop_df['population']= pop_df['population'].str.replace(',', '').astype(float)


In [86]:
df_scatter= pop_df.groupby(['country', 'population', 'continent', 'gdp_ppp']).agg({'store_count':'sum'}).reset_index()


In [127]:
df_scatter.sample(5)

import plotly.express as px
df = df_scatter
fig = px.scatter(df, x="population", y="gdp_ppp", size="store_count",color='continent',
           hover_name="country", log_x=True, size_max=60)
fig.show()



In [38]:
city_df = starbucks3[['city', 'country', 'store_count', 'longitude', 'latitude']]
city_df = city_df.dropna(axis=0)
city_df.set_index('city', inplace=True)
city_df.rename(index={'上海市':'Shanghai', '北京市':'Beijing','서울':'Seoul', '杭州市':'Hangzhou'}, inplace=True)


In [115]:
city_df.reset_index()
city_bar = city_df.groupby(['city', 'country']).agg({'store_count':'sum'}).sort_values(by='store_count',ascending=False).head(20)


In [40]:
def avg_dist(df):
    
    """
    df: is a pandas dataframe

    Return the average distance of stores in miles
    """

    #Convert lat and lon degrees in radians
    df['latitude'] = np.radians(df['latitude'])
    df['longitude'] = np.radians(df['longitude'])

    #find the haversine distance
    dist = DistanceMetric.get_metric('haversine')

    #an array of distance of neighboring stores in miles
    array =dist.pairwise(df[['latitude','longitude']].to_numpy())*3798

    return np.mean(array[0])



In [41]:
city1 =city_bar.reset_index().loc[:,['city','country']].to_numpy()

In [128]:
city_df.reset_index(inplace=True)
city_list =[]
distance_list =[]

for ele in city1:
    city=ele[0]
    country= ele[1]
    
    city2 = city_df.loc[(city_df.city==city) & (city_df.country==country)]
    distance = avg_dist(city2)
    
    city_list.append(city)
    distance_list.append(distance)
    




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [44]:

distance_df =pd.DataFrame({'city': pd.Series(city_list), 'avg_dist':pd.Series(distance_list)})
distance_df
    

Unnamed: 0,city,avg_dist
0,Shanghai,9.95683
1,Seoul,9.431684
2,Istanbul,12.080779
3,Beijing,7.992574
4,New York,2.485534
5,London,6.196099
6,Toronto,3.275323
7,Chicago,4.345767
8,Mexico City,10.114578
9,Las Vegas,4.706273
