# Final Report
Opening a New Hotel in Ha Noi, Viet Nam
- Build a dataframe of neighborhoods in Ha Noi by web scraping the data from Wikipedia page
- Get the geographical coordinates of the neighborhoods
- Obtain the venue data for the neighborhoods from Foursquare API
- Explore and cluster the neighborhoods
- Select the best cluster to open a new hotel

### 1. Import libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

!pip install geocoder
import geocoder # to get coordinates

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium
import folium # map rendering library

print("Libraries imported.")

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 15.3MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6
Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 18.8MB/s eta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
I

### 2. Scrap data from Wikipedia page into a DataFrame

In [2]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Districts_of_Hanoi").text

In [3]:
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

In [4]:
# create a list to store neighborhood data
neighborhoodList = []

In [5]:
# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

In [6]:
# create a new DataFrame from the list
df = pd.DataFrame({"Neighborhood": neighborhoodList})
df.head()

Unnamed: 0,Neighborhood
0,Ba Đình District
1,Ba Vì District
2,Bắc Từ Liêm District
3,Cầu Giấy District
4,Chương Mỹ District


### 3. Get the geographical coordinates

In [7]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Ha Noi, Vietnam'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [8]:
# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in df["Neighborhood"].tolist() ]
coords

[[21.033520000000067, 105.81404000000003],
 [21.19966000000005, 105.42270000000008],
 [21.062170000000037, 105.76941000000005],
 [21.029840000000036, 105.79953000000006],
 [20.923640000000034, 105.70268000000004],
 [21.083210000000065, 105.67281000000008],
 [21.168130000000076, 105.84818000000007],
 [21.020410000000027, 105.83082000000007],
 [21.019790000000057, 105.93751000000009],
 [20.973820000000046, 105.77916000000005],
 [21.009910000000048, 105.85076000000004],
 [21.066100000000063, 105.70758000000006],
 [21.02902000000006, 105.85622000000006],
 [21.007130000000075, 105.83491000000004],
 [21.045650000000023, 105.86964000000006],
 [21.18208000000004, 105.72061000000008],
 [20.685970000000054, 105.74276000000003],
 [21.00813000000005, 105.76650000000006],
 [20.75451000000004, 105.92102000000006],
 [21.107110000000034, 105.53787000000005],
 [20.992210000000057, 105.64124000000004],
 [21.25732000000005, 105.84826000000004],
 [21.032795923976675, 105.83013720540957],
 [21.074180000000

In [9]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

In [10]:
# merge the coordinates into the original dataframe
df['Latitude'] = df_coords['Latitude']
df['Longitude'] = df_coords['Longitude']

In [11]:
# check the neighborhoods and the coordinates
print(df.shape)
df

(30, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Ba Đình District,21.03352,105.81404
1,Ba Vì District,21.19966,105.4227
2,Bắc Từ Liêm District,21.06217,105.76941
3,Cầu Giấy District,21.02984,105.79953
4,Chương Mỹ District,20.92364,105.70268
5,Đan Phượng District,21.08321,105.67281
6,Đông Anh District,21.16813,105.84818
7,Đống Đa District,21.02041,105.83082
8,Gia Lâm District,21.01979,105.93751
9,Hà Đông District,20.97382,105.77916


In [12]:
# save the DataFrame as CSV file
df.to_csv("df.csv", index=False)

### 4. Create a map of Ha Noi with neighborhoods superimposed on top

In [13]:
# get the coordinates of Ha Noi
address = 'Ha Noi, Vietnam'
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Ha Noi, Vietnam {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Ha Noi, Vietnam 21.0294498, 105.8544441.


In [14]:
# create map of Ha Noi using latitude and longitude values
map_hn = folium.Map(location=[latitude, longitude], zoom_start=11)
# add markers to map
for lat, lng, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=6,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_hn)  

map_hn

In [15]:
# save the map as HTML file
map_hn.save('map_hn.html')

### 5. Use the Foursquare API to explore the neighborhoods

In [16]:
# define Foursquare Credentials and Version
CLIENT_ID = 'EXL4HEJIXBUND5ZSMSMI3Z45ONSCROGWZCGWD0KY3T4UMVKH' # your Foursquare ID
CLIENT_SECRET = 'FF1FC0HM0NQYK3JRUTFKTXVDXPVX0YUKALAGFWT3ZQTWXKVM' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: EXL4HEJIXBUND5ZSMSMI3Z45ONSCROGWZCGWD0KY3T4UMVKH
CLIENT_SECRET:FF1FC0HM0NQYK3JRUTFKTXVDXPVX0YUKALAGFWT3ZQTWXKVM


**Now, let's get the top 1000 venues that are within a radius of 10000 meters.**

In [18]:
radius = 10000
LIMIT = 1000
venues = []
for lat, long, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [20]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)
# define the column names
venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
print(venues_df.shape)
venues_df.head()

(1650, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Ba Đình District,21.03352,105.81404,Pizza 4P,21.034181,105.812777,Pizza Place
1,Ba Đình District,21.03352,105.81404,Pizza 4Ps Lotte Hanoi,21.032145,105.813378,Pizza Place
2,Ba Đình District,21.03352,105.81404,Lotte Hotel Hanoi,21.032241,105.812817,Hotel
3,Ba Đình District,21.03352,105.81404,Sky Walk Lotte Centre,21.032131,105.812428,Scenic Lookout
4,Ba Đình District,21.03352,105.81404,Pepe la Poule,21.055677,105.821286,French Restaurant


**Let's check how many venues were returned for each neighorhood**

In [21]:
venues_df.groupby(["Neighborhood"]).count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ba Vì District,4,4,4,4,4,4
Ba Đình District,100,100,100,100,100,100
Bắc Từ Liêm District,100,100,100,100,100,100
Chương Mỹ District,8,8,8,8,8,8
Cầu Giấy District,100,100,100,100,100,100
Gia Lâm District,100,100,100,100,100,100
Hai Bà Trưng District,100,100,100,100,100,100
Hoài Đức District,32,32,32,32,32,32
Hoàn Kiếm District,100,100,100,100,100,100
"Hoàng Mai District, Hanoi",100,100,100,100,100,100


**Let's find out how many unique categories can be curated from all the returned venues**

In [22]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

There are 90 uniques categories.


In [23]:
# print out the list of categories
venues_df['VenueCategory'].unique()[:50]

array(['Pizza Place', 'Hotel', 'Scenic Lookout', 'French Restaurant',
       'Park', 'Confucian Temple', 'Beer Bar', 'Vietnamese Restaurant',
       'Noodle House', 'Italian Restaurant', 'Museum', 'Brewery',
       'Supermarket', 'Sandwich Place', 'Coffee Shop',
       'Hotpot Restaurant', 'Café', 'Massage Studio', 'Bistro',
       'Fried Chicken Joint', 'Japanese Restaurant', 'Temple', 'Spa',
       'Mexican Restaurant', 'BBQ Joint', 'Lounge', 'Cocktail Bar',
       'Climbing Gym', 'Dessert Shop', 'Chocolate Shop', 'Multiplex',
       'Hotel Bar', 'Buddhist Temple', 'Vegetarian / Vegan Restaurant',
       'Beer Garden', 'Modern European Restaurant', 'Mobile Phone Shop',
       'Steakhouse', 'Gym / Fitness Center', 'Hostel', 'Historic Site',
       'Cemetery', 'Resort', 'Campground', 'Golf Course',
       'Asian Restaurant', 'Sports Bar', 'Bakery',
       'Furniture / Home Store', 'Rock Club'], dtype=object)

In [24]:
# check if the results contain "Hotel"
"Neighborhood" in venues_df['VenueCategory'].unique()

False

### 6. Analyze Each Neighborhood

In [25]:
# one hot encoding
onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
onehot['Neighborhoods'] = venues_df['Neighborhood'] 
# move neighborhood column to the first column
fixed_columns = [onehot.columns[-1]] + list(onehot.columns[:-1])
onehot = onehot[fixed_columns]
print(onehot.shape)
onehot.head()

(1650, 91)


Unnamed: 0,Neighborhoods,Airport,Airport Lounge,Art Gallery,Asian Restaurant,BBQ Joint,Bakery,Beer Bar,Beer Garden,Bistro,Boat or Ferry,Bowling Alley,Brewery,Buddhist Temple,Buffet,Burger Joint,Cable Car,Café,Campground,Cantonese Restaurant,Cemetery,Chocolate Shop,Climbing Gym,Cocktail Bar,Coffee Shop,Confucian Temple,Dessert Shop,Dim Sum Restaurant,Duty-free Shop,Electronics Store,Farm,Fast Food Restaurant,Food & Drink Shop,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Golf Course,Gym / Fitness Center,Historic Site,History Museum,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,IT Services,Ice Cream Shop,Italian Restaurant,Japanese Restaurant,Jazz Club,Juice Bar,Korean Restaurant,Lake,Liquor Store,Lounge,Malay Restaurant,Market,Massage Studio,Mexican Restaurant,Mobile Phone Shop,Modern European Restaurant,Movie Theater,Multiplex,Museum,Noodle House,Park,Pizza Place,Pub,Resort,Restaurant,River,Rock Club,Salad Place,Sandwich Place,Scenic Lookout,Shopping Mall,Snack Place,Soccer Field,Souvenir Shop,Spa,Sports Bar,Stadium,Steakhouse,Supermarket,Temple,Theme Park,Tiki Bar,Toll Booth,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Village,Wine Bar
0,Ba Đình District,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Ba Đình District,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Ba Đình District,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Ba Đình District,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Ba Đình District,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category**

In [26]:
grouped = onehot.groupby(["Neighborhoods"]).mean().reset_index()
print(grouped.shape)
grouped

(30, 91)


Unnamed: 0,Neighborhoods,Airport,Airport Lounge,Art Gallery,Asian Restaurant,BBQ Joint,Bakery,Beer Bar,Beer Garden,Bistro,Boat or Ferry,Bowling Alley,Brewery,Buddhist Temple,Buffet,Burger Joint,Cable Car,Café,Campground,Cantonese Restaurant,Cemetery,Chocolate Shop,Climbing Gym,Cocktail Bar,Coffee Shop,Confucian Temple,Dessert Shop,Dim Sum Restaurant,Duty-free Shop,Electronics Store,Farm,Fast Food Restaurant,Food & Drink Shop,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Golf Course,Gym / Fitness Center,Historic Site,History Museum,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,IT Services,Ice Cream Shop,Italian Restaurant,Japanese Restaurant,Jazz Club,Juice Bar,Korean Restaurant,Lake,Liquor Store,Lounge,Malay Restaurant,Market,Massage Studio,Mexican Restaurant,Mobile Phone Shop,Modern European Restaurant,Movie Theater,Multiplex,Museum,Noodle House,Park,Pizza Place,Pub,Resort,Restaurant,River,Rock Club,Salad Place,Sandwich Place,Scenic Lookout,Shopping Mall,Snack Place,Soccer Field,Souvenir Shop,Spa,Sports Bar,Stadium,Steakhouse,Supermarket,Temple,Theme Park,Tiki Bar,Toll Booth,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Village,Wine Bar
0,Ba Vì District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ba Đình District,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.01,0.0,0.0,0.02,0.01,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.01,0.01,0.03,0.13,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.02,0.15,0.01,0.01,0.0,0.0,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.02,0.01,0.01,0.01,0.0,0.01,0.01,0.06,0.01,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.02,0.01,0.0,0.0,0.0,0.01,0.13,0.0,0.0
2,Bắc Từ Liêm District,0.0,0.0,0.0,0.02,0.01,0.01,0.01,0.01,0.01,0.0,0.0,0.02,0.01,0.0,0.01,0.0,0.04,0.0,0.0,0.0,0.0,0.01,0.01,0.12,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.01,0.01,0.0,0.0,0.0,0.15,0.01,0.01,0.0,0.0,0.02,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.02,0.02,0.01,0.01,0.0,0.01,0.01,0.05,0.01,0.04,0.0,0.0,0.0,0.0,0.01,0.0,0.02,0.01,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.13,0.0,0.01
3,Chương Mỹ District,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.125,0.0
4,Cầu Giấy District,0.0,0.0,0.0,0.01,0.01,0.0,0.02,0.01,0.01,0.0,0.0,0.02,0.01,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.01,0.01,0.03,0.11,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.01,0.01,0.0,0.0,0.0,0.14,0.01,0.01,0.0,0.0,0.02,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.02,0.01,0.02,0.01,0.0,0.01,0.01,0.05,0.01,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.02,0.01,0.0,0.0,0.0,0.01,0.14,0.0,0.0
5,Gia Lâm District,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.01,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.05,0.0,0.0,0.0,0.01,0.0,0.03,0.14,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.04,0.14,0.01,0.0,0.0,0.01,0.02,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.07,0.0,0.02,0.02,0.0,0.01,0.0,0.0,0.0,0.04,0.0,0.01,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.01,0.14,0.01,0.0
6,Hai Bà Trưng District,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.04,0.0,0.0,0.0,0.01,0.0,0.03,0.15,0.01,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.16,0.01,0.01,0.0,0.01,0.02,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.07,0.01,0.02,0.01,0.0,0.01,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.01,0.17,0.0,0.0
7,Hoài Đức District,0.0,0.0,0.0,0.0,0.0625,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.03125,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.03125,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.03125,0.03125,0.0,0.0,0.0,0.0,0.125,0.0,0.03125,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.03125,0.0,0.0,0.03125,0.03125,0.03125,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.03125,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0
8,Hoàn Kiếm District,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.05,0.0,0.0,0.0,0.01,0.0,0.03,0.14,0.01,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.16,0.01,0.01,0.0,0.01,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.02,0.02,0.0,0.01,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.01,0.17,0.0,0.0
9,"Hoàng Mai District, Hanoi",0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.04,0.0,0.0,0.0,0.01,0.0,0.03,0.15,0.01,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.02,0.17,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.07,0.02,0.04,0.01,0.0,0.0,0.0,0.0,0.0,0.03,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.01,0.16,0.0,0.0


In [27]:
len(grouped[grouped["Hotel"] > 0])

21

**Create a new DataFrame for Hotel data only**

In [28]:
hotel = grouped[["Neighborhoods","Hotel"]]
hotel.head()

Unnamed: 0,Neighborhoods,Hotel
0,Ba Vì District,0.0
1,Ba Đình District,0.15
2,Bắc Từ Liêm District,0.15
3,Chương Mỹ District,0.125
4,Cầu Giấy District,0.14


### 7. Cluster Neighborhoods
Run k-means to cluster the neighborhoods in Ha Noi into 6 clusters.

In [51]:
# set number of clusters
clusters = 6
clustering = hotel.drop(["Neighborhoods"], 1)
# run k-means clustering
kmeans = KMeans(n_clusters=clusters, random_state=0).fit(clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 1, 1, 4, 1, 1, 5, 4, 5, 3], dtype=int32)

In [52]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
merged = hotel.copy()
# add clustering labels
merged["Cluster Labels"] = kmeans.labels_

In [53]:
merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
merged.head()

Unnamed: 0,Neighborhood,Hotel,Cluster Labels
0,Ba Vì District,0.0,2
1,Ba Đình District,0.15,1
2,Bắc Từ Liêm District,0.15,1
3,Chương Mỹ District,0.125,4
4,Cầu Giấy District,0.14,1


In [54]:
# merge grouped with data to add latitude/longitude for each neighborhood
merged = merged.join(df.set_index("Neighborhood"), on="Neighborhood")
print(merged.shape)
merged.head() # check the last columns!

(30, 5)


Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
0,Ba Vì District,0.0,2,21.19966,105.4227
1,Ba Đình District,0.15,1,21.03352,105.81404
2,Bắc Từ Liêm District,0.15,1,21.06217,105.76941
3,Chương Mỹ District,0.125,4,20.92364,105.70268
4,Cầu Giấy District,0.14,1,21.02984,105.79953


In [55]:
# sort the results by Cluster Labels
print(merged.shape)
merged.sort_values(["Cluster Labels"], inplace=True)
merged

(30, 5)


Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
12,Mê Linh District,0.05,0,21.18208,105.72061
1,Ba Đình District,0.15,1,21.03352,105.81404
2,Bắc Từ Liêm District,0.15,1,21.06217,105.76941
4,Cầu Giấy District,0.14,1,21.02984,105.79953
5,Gia Lâm District,0.14,1,21.01979,105.93751
19,"Sơn Tây, Hanoi",0.15,1,21.032796,105.830137
10,Hà Đông District,0.15,1,20.97382,105.77916
16,Phúc Thọ District,0.142857,1,21.10711,105.53787
0,Ba Vì District,0.0,2,21.19966,105.4227
26,Đan Phượng District,0.0,2,21.08321,105.67281


**Finally, let's visualize the resulting clusters**

In [56]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(clusters)
ys = [i+x+(i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(merged['Latitude'], merged['Longitude'], merged['Neighborhood'], merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=7,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

In [57]:
# save the map as HTML file
map_clusters.save('map_clusters.html')

### 8. Examine Clusters

#### Cluster 0

In [58]:
merged.loc[merged['Cluster Labels'] == 0]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
12,Mê Linh District,0.05,0,21.18208,105.72061


#### Cluster 1

In [59]:
merged.loc[merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
1,Ba Đình District,0.15,1,21.03352,105.81404
2,Bắc Từ Liêm District,0.15,1,21.06217,105.76941
4,Cầu Giấy District,0.14,1,21.02984,105.79953
5,Gia Lâm District,0.14,1,21.01979,105.93751
19,"Sơn Tây, Hanoi",0.15,1,21.032796,105.830137
10,Hà Đông District,0.15,1,20.97382,105.77916
16,Phúc Thọ District,0.142857,1,21.10711,105.53787


#### Cluster 2

In [60]:
merged.loc[merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
0,Ba Vì District,0.0,2,21.19966,105.4227
26,Đan Phượng District,0.0,2,21.08321,105.67281
24,Thạch Thất District,0.0,2,21.05825,105.57495
23,Thường Tín District,0.0,2,20.87161,105.86508
20,Thanh Oai District,0.0,2,20.85282,105.76893
17,Quốc Oai District,0.0,2,20.99221,105.64124
15,Phú Xuyên District,0.0,2,20.75451,105.92102
29,Ứng Hòa District,0.0,2,20.73055,105.7714
13,Mỹ Đức District,0.0,2,20.68597,105.74276


#### Cluster 3

In [61]:
merged.loc[merged['Cluster Labels'] == 3]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
11,Long Biên District,0.17,3,21.04565,105.86964
18,Sóc Sơn District,0.166667,3,21.25732,105.84826
9,"Hoàng Mai District, Hanoi",0.17,3,21.00713,105.83491
22,Thanh Xuân District,0.17,3,20.99774,105.79883


#### Cluster 4

In [62]:
merged.loc[merged['Cluster Labels'] == 4]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
7,Hoài Đức District,0.125,4,21.0661,105.70758
25,Tây Hồ District,0.11,4,21.07418,105.81237
3,Chương Mỹ District,0.125,4,20.92364,105.70268
27,Đông Anh District,0.125,4,21.16813,105.84818
14,Nam Từ Liêm District,0.12,4,21.00813,105.7665


#### Cluster 5

In [63]:
merged.loc[merged['Cluster Labels'] == 5]

Unnamed: 0,Neighborhood,Hotel,Cluster Labels,Latitude,Longitude
8,Hoàn Kiếm District,0.16,5,21.02902,105.85622
21,Thanh Trì District,0.16,5,20.95123,105.84621
6,Hai Bà Trưng District,0.16,5,21.00991,105.85076
28,Đống Đa District,0.16,5,21.02041,105.83082
