### Capstone project -  Opening a new dessert shop in Budapest

In [4]:
import json
import os
import urllib
import urllib.request

import geocoder
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

### 1. Collecting data

#### 1.1 Extract Budapest neighbourhood information 

In [6]:
# Download the Wikipedia page that lists the districts of Budapest.
url = 'https://en.wikipedia.org/wiki/List_of_districts_in_Budapest'
with urllib.request.urlopen(url, timeout=30) as response:
    html = response.read()

# Name the parser explicitly so the parse does not depend on which optional
# parser libraries happen to be installed (and bs4 does not warn about it).
bs = BeautifulSoup(html, 'html.parser')

# The district table is taken from the first <tbody>; its first row is the header.
tables = bs.find_all("tbody")
storeTable = tables[0].find_all("tr")

nlist = []
for row in storeTable[1:]:
    # Splitting the row text on newlines puts the district numeral at index 1
    # and the comma-separated neighbourhood names at index 5.
    r = row.get_text().split('\n')
    # Strip every name: splitting "A, B, C" on ',' alone leaves a leading space
    # on all names but the first, which then leaks into geocoding queries,
    # sorting and grouping further down.
    names = [name.strip() for name in r[5].split(',') if name.strip()]
    nlist.append((r[1], names))

budapest_neighborhoods = pd.DataFrame(nlist, columns=['District', 'Neighborhood'])
budapest_neighborhoods.head()

Unnamed: 0,District,Neighborhood
0,I.,"[Buda Castle, Tabán, Gellérthegy, Krisztina..."
1,II.,"[Adyliget, Budakeszierdő, Budaliget, Csatár..."
2,III.,"[Óbuda, Aquincum, Aranyhegy, Békásmegyer, ..."
3,IV.,"[Újpest, Megyer, Káposztásmegyer, Székesdűl..."
4,V.,"[Inner City, Lipótváros]"


In [7]:
# Example of the "<neighborhood>,<city>,<country>" query string used for geocoding.
neighborhood = 'Buda Castle'
city = 'Budapest'
country = 'Hungary'
','.join((neighborhood, city, country))


'Buda Castle,Budapest,Hungary'

#### 1.2 Extract location  information 

In [44]:
# define a function to get coordinates
def get_latlng(neighborhood, city, country, max_retries=10):
    """Look up the coordinates of a neighbourhood with the ArcGIS geocoder.

    The query sent to the geocoder is "<neighborhood>,<city>,<country>".

    Parameters
    ----------
    neighborhood, city, country : str
        Parts of the address to geocode.
    max_retries : int, optional
        Maximum number of lookups to attempt before giving up (default 10).

    Returns
    -------
    The `latlng` value reported by the geocoder for the first lookup that
    does not come back as None.

    Raises
    ------
    RuntimeError
        If every attempt returned None (also when `max_retries` is < 1).
    """
    query = '{0},{1},{2}'.format(neighborhood, city, country)
    # Retry a bounded number of times: an unbounded loop would hang the whole
    # notebook on a name the geocoder can never resolve or when the service is down.
    for _ in range(max_retries):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
    raise RuntimeError(
        'Could not geocode {!r} after {} attempt(s)'.format(query, max_retries))

# districts whose neighbourhoods are analysed
selected = ['I.', 'V.', 'VI.', 'VII.']
in_selected_district = budapest_neighborhoods['District'].isin(selected)
selected_neighborhoods = budapest_neighborhoods[in_selected_district]

# flatten the per-district name lists into one list of neighbourhood names
neighborhoods = [
    name
    for district_names in selected_neighborhoods["Neighborhood"].tolist()
    for name in district_names
]

# geocode every neighbourhood and put names and coordinates side by side
coords = [get_latlng(name, 'Budapest', 'Hungary') for name in neighborhoods]
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
bp_df = pd.DataFrame({
    'Neighborhood': neighborhoods,
    'Latitude': df_coords['Latitude'],
    'Longitude': df_coords['Longitude'],
})

In [45]:
# print (rows, columns), then display the table itself
print((len(bp_df.index), len(bp_df.columns)))
bp_df

(9, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Buda Castle,47.49539,19.03982
1,Tabán,47.49257,19.04241
2,Gellérthegy,47.48707,19.04305
3,Krisztinaváros,47.50062,19.02423
4,southern Víziváros,47.50261,19.0381
5,Inner City,47.49972,19.05508
6,Lipótváros,47.50312,19.05066
7,Terézváros,47.50488,19.06283
8,Erzsébetváros,47.50091,19.06936


In [46]:
# write the neighbourhood coordinates to CSV (index column omitted), then display them
csv_path = "bp_df.csv"
bp_df.to_csv(csv_path, index=False)
bp_df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Buda Castle,47.49539,19.03982
1,Tabán,47.49257,19.04241
2,Gellérthegy,47.48707,19.04305
3,Krisztinaváros,47.50062,19.02423
4,southern Víziváros,47.50261,19.0381
5,Inner City,47.49972,19.05508
6,Lipótváros,47.50312,19.05066
7,Terézváros,47.50488,19.06283
8,Erzsébetváros,47.50091,19.06936


In [62]:
# get the coordinates of Budapest (used below as the centre of the maps)
address = 'Budapest, Hungary'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message here
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Budapest, Hungary {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Budapest, Hungary 47.48138955, 19.14607278448202.


In [48]:
# draw a map centred on Budapest with one circle marker per neighbourhood
import folium
map_bp = folium.Map(location=[latitude, longitude], zoom_start=11)

# each marker shows the neighbourhood name in its popup
marker_rows = zip(bp_df['Latitude'], bp_df['Longitude'], bp_df['Neighborhood'])
for lat, lng, neighborhood in marker_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup('{}'.format(neighborhood), parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7)
    marker.add_to(map_bp)

map_bp

In [49]:
# write the neighbourhood map to a standalone HTML file
html_path = 'map_bp.html'
map_bp.save(html_path)

#### 1.3 Extract venue information via Foursquare API

In [50]:
# define Foursquare Credentials and Version
# The keys are read from the environment so that they are never stored in the
# notebook source or echoed into its saved output.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# committed in plain text and should be revoked and re-issued.
def env_credential(name, env=None):
    """Return the value of variable `name` with surrounding whitespace removed.

    `env` is the mapping to read from (defaults to `os.environ`).
    An unset variable yields the empty string.
    """
    source = os.environ if env is None else env
    return source.get(name, '').strip()


CLIENT_ID = env_credential('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = env_credential('FOURSQUARE_CLIENT_SECRET')
VERSION = '20180605' # Foursquare API version
LIMIT = 100
radius = 500

# Report only whether the credentials are present, never their values.
if CLIENT_ID and CLIENT_SECRET:
    print('Foursquare credentials loaded from the environment.')
else:
    print('WARNING: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
          'before running the Foursquare cells.')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [51]:
# search settings for this run (radius replaces the value set with the credentials)
radius = 2000
LIMIT = 100

EXPLORE_URL = "https://api.foursquare.com/v2/venues/explore"

# one tuple per venue:
# (neighborhood, neighborhood lat, neighborhood lng, venue name, venue lat, venue lng, venue category)
venues = []

for lat, lng, neighborhood in zip(bp_df['Latitude'], bp_df['Longitude'], bp_df['Neighborhood']):

    # Let requests build and encode the query string instead of formatting the
    # credentials into the URL by hand, and bound the wait with a timeout.
    response = requests.get(
        EXPLORE_URL,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30,
    )
    # Surface HTTP errors (bad credentials, quota, ...) here rather than as a
    # confusing KeyError while digging through the JSON below.
    response.raise_for_status()

    groups = response.json()["response"].get('groups') or []
    results = groups[0]['items'] if groups else []

    # keep only the relevant information for each nearby venue
    for item in results:
        details = item['venue']
        # A venue without any category is kept with category None instead of
        # crashing on categories[0].
        categories = details.get('categories') or []
        venues.append((
            neighborhood,
            lat,
            lng,
            details['name'],
            details['location']['lat'],
            details['location']['lng'],
            categories[0]['name'] if categories else None))

In [63]:
# convert the venues list into a new DataFrame
# Passing the column names to the constructor (rather than assigning them
# afterwards) also works when no venues were collected: assigning seven names
# to an empty, column-less frame raises a length-mismatch error.
venues_df = pd.DataFrame(
    venues,
    columns=['Neighborhood', 'Latitude', 'Longitude', 'VenueName',
             'VenueLatitude', 'VenueLongitude', 'VenueCategory'])
print(venues_df.shape)
venues_df.head(20)

(900, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Buda Castle,47.49539,19.03982,Budavári Palota,47.496198,19.039543,Castle
1,Buda Castle,47.49539,19.03982,Zhao Zhou Teashop & Lab,47.497354,19.041026,Tea Room
2,Buda Castle,47.49539,19.03982,Magyar Nemzeti Galéria | Hungarian National Ga...,47.496082,19.039468,Art Museum
3,Buda Castle,47.49539,19.03982,Várhegy,47.49757,19.038747,Scenic Lookout
4,Buda Castle,47.49539,19.03982,Várkert Bazár,47.494343,19.042807,Historic Site
5,Buda Castle,47.49539,19.03982,Hotel Clark,47.498507,19.040412,Hotel
6,Buda Castle,47.49539,19.03982,Bortársaság,47.497441,19.04109,Wine Shop
7,Buda Castle,47.49539,19.03982,Asztalka,47.492193,19.044231,Dessert Shop
8,Buda Castle,47.49539,19.03982,Oxygen Wellness Naphegy,47.491025,19.0379,Gym / Fitness Center
9,Buda Castle,47.49539,19.03982,Dísz tér,47.4991,19.036163,Plaza


In [64]:
# count the distinct venue categories
# (a per-neighbourhood groupby().count() used to be computed first, but its
# result was neither stored nor displayed, so it has been removed)
categories = venues_df['VenueCategory'].unique()
print('There are {} uniques categories.'.format(len(categories)))

# print out the list of categories (first 50 only)
categories[:50]

There are 115 uniques categories.


array(['Castle', 'Tea Room', 'Art Museum', 'Scenic Lookout',
       'Historic Site', 'Hotel', 'Wine Shop', 'Dessert Shop',
       'Gym / Fitness Center', 'Plaza', 'Sculpture Garden',
       'Soccer Stadium', 'Coffee Shop', 'Bistro', 'Outdoor Sculpture',
       'Chocolate Shop', 'Eastern European Restaurant',
       'Hungarian Restaurant', 'Pub', 'Waterfront', 'Lounge', 'Wine Bar',
       'Café', 'Grocery Store', 'Italian Restaurant',
       'Mediterranean Restaurant', 'French Restaurant',
       'Sushi Restaurant', 'Vietnamese Restaurant', 'Trail', 'Restaurant',
       'Burger Joint', 'History Museum', 'Bookstore', 'Salad Place',
       'Gift Shop', 'Spa', 'Playground', 'Mountain', 'Theater',
       'Cosmetics Shop', 'Ice Cream Shop', 'Thai Restaurant', 'Church',
       'Park', 'Cocktail Bar', 'Greek Restaurant', 'Monument / Landmark',
       'Bakery', 'Capitol Building'], dtype=object)

In [65]:
# check whether any venue category is literally named "Neighborhood"
# NOTE(review): presumably a guard against a category name colliding with the
# neighborhood column added after one-hot encoding — confirm
"Neighborhood" in venues_df['VenueCategory'].unique()

False

In [66]:
# one hot encoding: one indicator column per venue category, plus the
# neighborhood each venue row belongs to
bp_onehot = (
    pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")
    .assign(Neighborhoods=venues_df['Neighborhood'])
)

# rotate the last column (the neighborhood one just added) to the front
column_order = bp_onehot.columns.tolist()
fixed_columns = column_order[-1:] + column_order[:-1]
bp_onehot = bp_onehot[fixed_columns]

print(bp_onehot.shape)
bp_onehot.head()

(900, 116)


Unnamed: 0,Neighborhoods,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Bank,...,Toy / Game Store,Track,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio
0,Buda Castle,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Buda Castle,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Buda Castle,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Buda Castle,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Buda Castle,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
# average the indicator columns within each neighborhood: every value becomes
# the share of that neighborhood's venues falling into the category
venues_by_neighborhood = bp_onehot.groupby(["Neighborhoods"])
bp_grouped = venues_by_neighborhood.mean().reset_index()
print(bp_grouped.shape)
bp_grouped

(9, 116)


Unnamed: 0,Neighborhoods,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Bank,...,Toy / Game Store,Track,Trail,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio
0,Gellérthegy,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.03,0.0,...,0.01,0.0,0.01,0.02,0.01,0.02,0.0,0.01,0.01,0.0
1,Krisztinaváros,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.1,0.0,...,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0
2,Lipótváros,0.0,0.01,0.0,0.0,0.0,0.01,0.01,0.02,0.01,...,0.01,0.0,0.0,0.0,0.01,0.01,0.01,0.03,0.01,0.01
3,Tabán,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.02,0.01,0.01
4,southern Víziváros,0.0,0.0,0.01,0.0,0.01,0.0,0.01,0.03,0.01,...,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.02,0.02,0.0
5,Buda Castle,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.02,0.0,...,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.02,0.02,0.0
6,Erzsébetváros,0.0,0.01,0.0,0.01,0.0,0.01,0.0,0.03,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.03,0.0,0.01
7,Inner City,0.0,0.01,0.0,0.0,0.0,0.01,0.01,0.02,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.02,0.01,0.01
8,Terézváros,0.0,0.01,0.0,0.0,0.0,0.02,0.01,0.03,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.02,0.0,0.0


In [74]:
# get dessert shop venue data: the dessert-shop share for each neighborhood
# (the count of neighborhoods with a non-zero share used to be computed here
# but was never stored or displayed, so it has been removed)
# .copy() makes bp_dessert an independent frame rather than a slice of bp_grouped.
bp_dessert = bp_grouped[["Neighborhoods", "Dessert Shop"]].copy()
bp_dessert.head(20)

Unnamed: 0,Neighborhoods,Dessert Shop
0,Gellérthegy,0.02
1,Krisztinaváros,0.02
2,Lipótváros,0.01
3,Tabán,0.03
4,southern Víziváros,0.02
5,Buda Castle,0.03
6,Erzsébetváros,0.02
7,Inner City,0.02
8,Terézváros,0.03


### 2. Clustering - cluster neighbourhoods with KMeans

In [80]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 3

# cluster on the dessert-shop share only
# (columns= keyword: the positional axis argument of drop() was removed in pandas 2.0)
bp_clustering = bp_dessert.drop(columns=["Neighborhoods"])

# run k-means clustering
# n_init is pinned so results do not change with the scikit-learn default
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(bp_clustering)

# create a new dataframe that holds each neighborhood's dessert-shop share
# together with the cluster it was assigned to
bp_merged = bp_dessert.copy()

# add clustering labels (kmeans.labels_ is in the same row order as bp_dessert)
bp_merged["Cluster Labels"] = kmeans.labels_

bp_merged = bp_merged.rename(columns={"Neighborhoods": "Neighborhood"})
bp_merged.head()

Unnamed: 0,Neighborhood,Dessert Shop,Cluster Labels
0,Gellérthegy,0.02,1
1,Krisztinaváros,0.02,1
2,Lipótváros,0.01,2
3,Tabán,0.03,0
4,southern Víziváros,0.02,1


In [81]:
# add latitude/longitude from bp_df to each neighborhood in bp_merged
# Coordinates left over from a previous run of this cell are dropped first:
# otherwise re-running the cell makes join() fail on the overlapping
# Latitude/Longitude columns.
bp_merged = (
    bp_merged
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .join(bp_df.set_index("Neighborhood"), on="Neighborhood")
)

print(bp_merged.shape)
bp_merged.head() # check the last columns!

(9, 5)


Unnamed: 0,Neighborhood,Dessert Shop,Cluster Labels,Latitude,Longitude
0,Gellérthegy,0.02,1,47.48707,19.04305
1,Krisztinaváros,0.02,1,47.50062,19.02423
2,Lipótváros,0.01,2,47.50312,19.05066
3,Tabán,0.03,0,47.49257,19.04241
4,southern Víziváros,0.02,1,47.50261,19.0381


In [82]:
# order the rows by cluster label so that members of a cluster are listed together
print(bp_merged.shape)
bp_merged = bp_merged.sort_values(by="Cluster Labels")
bp_merged

(9, 5)


Unnamed: 0,Neighborhood,Dessert Shop,Cluster Labels,Latitude,Longitude
3,Tabán,0.03,0,47.49257,19.04241
5,Buda Castle,0.03,0,47.49539,19.03982
8,Terézváros,0.03,0,47.50488,19.06283
0,Gellérthegy,0.02,1,47.48707,19.04305
1,Krisztinaváros,0.02,1,47.50062,19.02423
4,southern Víziváros,0.02,1,47.50261,19.0381
6,Erzsébetváros,0.02,1,47.50091,19.06936
7,Inner City,0.02,1,47.49972,19.05508
2,Lipótváros,0.01,2,47.50312,19.05066


In [83]:
# create map
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
for lat, lon, poi, cluster in zip(bp_merged['Latitude'], bp_merged['Longitude'], bp_merged['Neighborhood'], bp_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the colour list directly;
    # the former `cluster-1` only worked for cluster 0 through negative-index
    # wrap-around to the last colour.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters