In [None]:
%matplotlib inline

In [121]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
plt.style.use('ggplot')

from sklearn.neighbors import DistanceMetric
from math import radians
import re

In [91]:
# Read the directory.csv file into a data frame
starbucks = pd.read_csv('directory.csv')
starbucks= starbucks[['store_name','City', 'State/Province','Country','Longitude', 'Latitude']]


#Create a new column for store count
starbucks['store_count'] = np.ones(starbucks.shape[0]).astype(int).tolist()
starbucks.sample(5)

#Read the country code file into the dateframe.
country_code = pd.read_csv('country_code.csv')
country_code

#Dataframe for the world population
world_population = pd.read_csv('world_population.csv')
world_population = world_population[['country', 'population']]
world_population.loc[world_population.country=='United States']
world_population.sample(5)



Unnamed: 0,country,population
21,France,65273511
32,Algeria,43851044
207,Northern Mariana Islands,57559
68,Netherlands,17134872
170,Suriname,586632


In [105]:
#Merge the starbucks data frame and the country code
starbucks2= pd.merge(starbucks,country_code, left_on='Country', right_on='Two_Letter_Country_Code').drop('Two_Letter_Country_Code', axis=1)
starbucks2.sample(3)

starbucks2.sample(3)

Unnamed: 0,store_name,City,State/Province,Country,Longitude,Latitude,store_count,Continent_Name,Country_Name
26052,Rex,Thành Phố Hồ Chí Minh,SG,VN,106.7,10.78,1,Asia,"Vietnam, Socialist Republic of"
16609,The Portals,Washington,DC,US,-77.03,38.88,1,North America,United States of America
16504,Vernon,Vernon,CT,US,-72.49,41.83,1,North America,United States of America


In [106]:
#Modify the name of some countries, removing 'the, republic of'
country_name = starbucks2['Country_Name'].tolist()

country_name = [country.split(',')[0] for country in country_name]

#Assign the modified values in the country_name column
starbucks2['Country_Name'] = country_name


#Change united states of america to united states
starbucks2['Country_Name']= starbucks2['Country_Name'].apply(lambda x : 'United States' if x=='United States of America' else x)

#Change Russian Federation to Russia
starbucks2['Country_Name']= starbucks2['Country_Name'].apply(lambda x : 'Russia' if x=='Russian Federation' else x)

# Change column names to lower case
starbucks2.columns = [x.lower()  for x in starbucks2.columns]


(26057, 9)

In [113]:
#Merge world population data with starbucks data 
starbucks3 = pd.merge(starbucks2, world_population, how='left', left_on='country_name', right_on='country')
starbucks3.drop('country_y', axis=1, inplace=True)

#Rename some columns
starbucks3.rename(columns={'country_x':'code', 'continent_name':'continent', 'country_name':'country'}, inplace=True)



In [None]:
df= starbucks3.loc[starbucks3.country=='China']
df.sample(5)


In [None]:
from googletrans import Translator
translator=Translator()

df["city"] = df["city"].map(lambda x: translator.translate(x, src="zh-TW", dest="en").text)
df.sample(5)

In [None]:
#Top 15 countries in terms of starbucks store count
df_bar= starbucks2[['Country_Name','store_count']].groupby('Country_Name').sum().sort_values( ascending=False, by='store_count')

#Bar graph
df_bar.head(15).plot.bar(color='g')


In [None]:


# #Remove the comma in the population 78,100 -->78100
starbucks3['population'] = starbucks3['population'].str.replace(',', '').astype(int)



In [None]:
df_scatter= starbucks3.groupby(['Country_Name', 'population']).agg({'store_count':'sum'}).reset_index()
df_scatter.plot.scatter('population', 'store_count')


In [None]:
#removing the outliers
df_scatter = df_scatter[['population', 'store_count']]
df_scatter = df_scatter.loc[df_scatter.apply(lambda x: np.abs(x - x.mean()) / x.std() < 3).all(axis=1)]
df_scatter.plot.scatter('population', 'store_count')

In [None]:
city_df = starbucks3[['City', 'Country_Name', 'store_count', 'Longitude', 'Latitude']]
city_bar = city_df.groupby(['City', 'Country_Name']).agg({'store_count':'sum'}).sort_values(by='store_count',ascending=False).head(17)
city_bar.plot.bar()

city_bar

In [None]:
def avg_dist(city_df):
     """
     df: is a pandas dataframe
    
     Return the average distance of stores in miles
     """
    
    #Convert lat and lon degrees in radians
    city_df['Latitude'] = np.radians(city_df['Latitude'])
    city_df['Longitude'] = np.radians(city_df['Longitude'])

    #find the haversine distance
    dist = DistanceMetric.get_metric('haversine')
    
    #an array of distance of stores
    array =dist.pairwise(city_df[['Latitude','Longitude']].to_numpy())*3798
    
    return np.mean(array[0])



In [None]:
city1 =city_bar.reset_index()[['City', 'Country_Name']].to_numpy()

city_list =[]
distance_list =[]


for ele in city1:
    city=ele[0]
    country= ele[1]
    city_df = starbucks3.loc[(starbucks3.City==city) & (starbucks3.Country_Name==country)]
    distance = avg_dist(city_df)
    city_list.append(city)
    distance_list.append(distance)
    

pd.DataFrame({'city': pd.Series(city_list), 'avg_dist':pd.Series(distance_list)})
    
    



In [None]:
city_df = starbucks.loc[(starbucks.City=='London') & (starbucks.Country=='GB')]
city_df

city_bar

