# Coursera Capstone Project

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
print('Libraries imported')

Libraries imported


### 
### 
### Get a list of HK districts.

OK, so there is a list of HK neighbourhoods at: https://en.wikipedia.org/wiki/Districts_of_Hong_Kong which I will use which has District and Neighbourhood colums.

In [3]:
# Read the file in from my local drive (, removing the thousands separator)
df = pd.read_csv('~/dev/GitHub/Coursera_Capstone/hk-districts.csv', thousands=',')
df.shape

(19, 4)

In [4]:
df.head(2)

Unnamed: 0,District,Population,PopulationGrowth,Density
0,Central and Western,243266,-2.7%,19391
1,Eastern,555034,-2.8%,30861


In [5]:
# Remove % sign in PopulationGrowth column
df['PopulationGrowth'] = (df['PopulationGrowth'].str.strip('%').astype(float))

In [6]:
print("\n", "Here's a sample of the results (from df) ...", "\n")

df.head(2)


 Here's a sample of the results (from df) ... 



Unnamed: 0,District,Population,PopulationGrowth,Density
0,Central and Western,243266,-2.7,19391
1,Eastern,555034,-2.8,30861


###
###
### Add latlong values to the districts data

I'll need some lat-long values from GeoCoder.

In [7]:
import geocoder
print('Libraries imported')

Libraries imported


In [8]:
# Define a retrieval function for geocoder.
def LatiLong(district):
    LatiLong_Coords = None
    
    while(LatiLong_Coords is None):
        g = geocoder.arcgis('{}, Hong Kong'.format(district))
        LatiLong_Coords = g.latlng
    return LatiLong_Coords

# And test it with a single district value.
LatiLong('Eastern')

[22.272090400000025, 114.22139600000003]

In [9]:
# Now work through all the districts in df to retrieve coordinates from geocoder

# Lists to hold the parsed data
coords = []
row = []

# How many rows are there?
numrows = df.shape[0]

# Set loop limits (for testing while loop).
startrow = 0
endrow = numrows

# Now loop through selected rows in df.
i = startrow
while(i < endrow):

    # Retrieve data for this postcode from geocoder.
    coord = LatiLong(df['District'].loc[i])

    # piece together a data row from the various components
    row = df['District'].loc[i], \
        df['Population'].loc[i], \
        df['PopulationGrowth'].loc[i], \
        df['Density'].loc[i], \
        coord[0], \
        coord[1]
    print(row)
    
    # Append this row to the list of coordinates.
    coords.append(row)
    
    i=i+1

('Central and Western', 243266, -2.7, 19391, 22.28219000000007, 114.14486000000011)
('Eastern', 555034, -2.8, 30861, 22.272090400000025, 114.22139600000003)
('Southern', 274994, -0.6, 7080, 22.25801000000007, 114.15308000000005)
('Wan Chai', 180123, -0.1, 17137, 22.277100582414448, 114.17383672856147)
('Sham Shui Po', 405869, 11.0, 43381, 22.329350805367028, 114.15917854227246)
('Kowloon City', 418732, 15.5, 41802, 22.31113000000005, 114.18354000000011)
('Kwun Tong', 648541, 10.4, 57530, 22.31423591741776, 114.22662473746914)
('Wong Tai Sin', 425235, 0.4, 45711, 22.336087900859354, 114.19172572859588)
('Yau Tsim Mong', 342970, 22.3, 49046, 22.30973890000007, 114.16852090000009)
('Islands', 156801, 14.4, 886, 22.314680000000067, 113.93243000000007)
('Kwai Tsing', 520572, -0.5, 22307, 22.31423591741776, 114.22662473746914)
('North', 315270, 12.3, 2310, 22.513688157514594, 114.20695567996347)
('Sai Kung', 461864, 13.6, 3563, 22.400011962251483, 114.31379699737874)
('Sha Tin', 659794, 8.6,

In [10]:
# Put it all into a dataframe
coordsdf = pd.DataFrame(coords, columns=['District', 'Population', 'PopulationGrowth', 'Density', 'Latitude', 'Longitude'])

print("\n", "Here's a sample of the results (from coordsdf) ...", "\n")
coordsdf.tail(20)


 Here's a sample of the results (from coordsdf) ... 



Unnamed: 0,District,Population,PopulationGrowth,Density,Latitude,Longitude
0,Central and Western,243266,-2.7,19391,22.28219,114.14486
1,Eastern,555034,-2.8,30861,22.27209,114.221396
2,Southern,274994,-0.6,7080,22.25801,114.15308
3,Wan Chai,180123,-0.1,17137,22.277101,114.173837
4,Sham Shui Po,405869,11.0,43381,22.329351,114.159179
5,Kowloon City,418732,15.5,41802,22.31113,114.18354
6,Kwun Tong,648541,10.4,57530,22.314236,114.226625
7,Wong Tai Sin,425235,0.4,45711,22.336088,114.191726
8,Yau Tsim Mong,342970,22.3,49046,22.309739,114.168521
9,Islands,156801,14.4,886,22.31468,113.93243


In [11]:
# Change datatypes to float
coordsdf['Population'] = coordsdf['Population'].astype(float)
coordsdf['PopulationGrowth'] = coordsdf['PopulationGrowth'].astype(float)
coordsdf['Density'] = coordsdf['Density'].astype(float)
coordsdf.dtypes

District             object
Population          float64
PopulationGrowth    float64
Density             float64
Latitude            float64
Longitude           float64
dtype: object

###
###
### And print a map of HK districts

In [12]:
from geopy.geocoders import Nominatim # convert an address into latlong

In [15]:
# Use geopy library to get the latitude and longitude values of Hong Kong.
address = 'Hong Kong'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Hong Kong are {}, {}.'.format(latitude, longitude))

The geographical coordinate of Hong Kong are 22.2793278, 114.1628131.


In [16]:
import folium # map rendering library

In [17]:
# Create a map with districts superimposed on top.
# using latitude and longitude values
map_to = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for district, lat, lng in zip( \
                        coordsdf['District'], \
                        coordsdf['Latitude'], \
                        coordsdf['Longitude']):
    label = '{}'.format(district)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_to)  
    
map_to

###
###
### Use Foursquare to get Venues data for ALL Districts in HK...

In [18]:
# Define Foursquare Credentials and Version
CLIENT_ID = 'N0ICIYH1JETTJIRN43TOFSXSRFLBOA40BD4W0ROCHJOMJFW5' # your Foursquare ID
CLIENT_SECRET = 'REMOVED' # your Foursquare Secret
VERSION = '20180605' 						# Foursquare API version
LIMIT = 100 							# A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: N0ICIYH1JETTJIRN43TOFSXSRFLBOA40BD4W0ROCHJOMJFW5


In [19]:
# Create a function to get venues for all districts
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['District', 
                  'District Latitude', 
                  'District Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [20]:
# Now run the above function on each district 
# to create a new dataframe called hk_venues.
hk_venues = getNearbyVenues(names=coordsdf['District'],
                            latitudes=coordsdf['Latitude'],
                           longitudes=coordsdf['Longitude']
                                  )

Central and Western
Eastern
Southern
Wan Chai
Sham Shui Po
Kowloon City
Kwun Tong
Wong Tai Sin
Yau Tsim Mong
Islands
Kwai Tsing
North
Sai Kung
Sha Tin
Tai Po
Tsuen Wan
Tuen Mun
Yuen Long
Marine


In [21]:
print("\n", \
    "Foursquare returned {} venues in HK. Here's a sample (from hk_venues) ...".format( \
    hk_venues.shape[0]), "\n")

hk_venues.head(2)


 Foursquare returned 634 venues in HK. Here's a sample (from hk_venues) ... 



Unnamed: 0,District,District Latitude,District Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Central and Western,22.28219,114.14486,Caine Road Garden (堅道花園),22.283009,114.14793,Garden
1,Central and Western,22.28219,114.14486,Yoga Bam Bam,22.284453,114.147278,Yoga Studio


In [22]:
# I'm gonna save that as a CSV.
path='~/dev/GitHub/Coursera_Capstone/hk-venues.csv'
hk_venues.to_csv(path)

###
###
### Examine the whole of HK for Venue Types, particularly restaurants

In [23]:
# Create a dataframe of restaurant categories and their counts 
# for the whole of HK.
counts = pd.DataFrame(hk_venues['Venue Category'].value_counts())
counts.head(2)

Unnamed: 0,Venue Category
Chinese Restaurant,31
Coffee Shop,30


In [24]:
# I need to restructure the above df.

# Copy the categories into an actual category column
counts['Category'] = counts.index

# Copy the counts into a count column
counts['Count'] = counts['Venue Category']

# drop original columns
counts.drop("Venue Category", axis = 1, inplace=True)

# Rename categories column
counts.rename(columns={'Category':'Venue Category'}, inplace=True )

# Reset index
counts.reset_index(drop=True, inplace=True)

counts.head(2)

Unnamed: 0,Venue Category,Count
0,Chinese Restaurant,31
1,Coffee Shop,30


In [25]:
# Now create a df of only the counts of venues which are restaurants
restcounts = counts[ counts['Venue Category'].str.contains('Restaurant') ]
restcounts.head(100)

Unnamed: 0,Venue Category,Count
0,Chinese Restaurant,31
2,Cantonese Restaurant,23
3,Hong Kong Restaurant,22
5,Japanese Restaurant,21
7,Fast Food Restaurant,19
10,Italian Restaurant,15
14,Thai Restaurant,13
17,Sushi Restaurant,10
20,Hotpot Restaurant,8
23,Dumpling Restaurant,7


In [26]:
# I'm gonna save that as a CSV.
path='~/dev/GitHub/Coursera_Capstone/restaurant-type-counts.csv'
restcounts.to_csv(path)

###
###
### Examine Venue Types in each District

In [27]:
# one hot encoding
hk_onehot = pd.get_dummies(hk_venues[['Venue Category']], prefix="", prefix_sep="")

# add District column back to dataframe
hk_onehot['District'] = hk_venues['District'] 

# move neighborhood column to the first column
fixed_columns = [hk_onehot.columns[-1]] + list(hk_onehot.columns[:-1])
hk_onehot = hk_onehot[fixed_columns]

print("\n", 'One-hot encoding gives this...')
print()
hk_onehot.head()


 One-hot encoding gives this...



Unnamed: 0,District,Airport,Airport Food Court,Airport Gate,Airport Lounge,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Waterfall,Wine Bar,Women's Store,Yoga Studio,Yunnan Restaurant,Zhejiang Restaurant
0,Central and Western,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Central and Western,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,Central and Western,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Central and Western,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Central and Western,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
print("\n", 'Grouping this by neighborhood, using the mean of the frequency of occurrence of each category...')

hk_grouped = hk_onehot.groupby('District').mean().reset_index()
hk_grouped


 Grouping this by neighborhood, using the mean of the frequency of occurrence of each category...


Unnamed: 0,District,Airport,Airport Food Court,Airport Gate,Airport Lounge,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Waterfall,Wine Bar,Women's Store,Yoga Studio,Yunnan Restaurant,Zhejiang Restaurant
0,Central and Western,0.0,0.0,0.0,0.0,0.017241,0.017241,0.0,0.0,0.0,...,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.034483,0.0,0.0
1,Islands,0.023256,0.023256,0.023256,0.232558,0.023256,0.0,0.023256,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.023256,0.023256,0.0,0.0,0.0
2,Kowloon City,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Kwai Tsing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.019231,0.0,0.0,0.0,0.0,0.0,0.0
4,Kwun Tong,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.019231,0.0,0.0,0.0,0.0,0.0,0.0
5,Marine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010309,...,0.0,0.0,0.020619,0.020619,0.0,0.0,0.0,0.010309,0.0,0.0
6,North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
7,Sha Tin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Sham Shui Po,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0
9,Southern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0


In [29]:
# Write a function to sort the venues in descending order.
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [30]:
# Now create the new dataframe and display the top 10 venue types for each neighborhood.

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['District']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
districts_venues_sorted = pd.DataFrame(columns=columns)
districts_venues_sorted['District'] = hk_grouped['District']

for ind in np.arange(hk_grouped.shape[0]):
    districts_venues_sorted.iloc[ind, 1:] = \
    return_most_common_venues(hk_grouped.iloc[ind, :], num_top_venues)

In [31]:
print('\n', 'Here is a sample of the top 10 venue types ',
      '\n', 'for each District (from districts_venues_sorted) ...')

print()
print(districts_venues_sorted.shape)
districts_venues_sorted.head(20)


 Here is a sample of the top 10 venue types  
 for each District (from districts_venues_sorted) ...

(17, 11)


Unnamed: 0,District,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central and Western,Tapas Restaurant,Mexican Restaurant,Café,Coffee Shop,Italian Restaurant,Thai Restaurant,Hong Kong Restaurant,French Restaurant,Tea Room,Garden
1,Islands,Airport Lounge,Coffee Shop,Bakery,Clothing Store,Airport,Fujian Restaurant,Chocolate Shop,Cocktail Bar,Convenience Store,Dumpling Restaurant
2,Kowloon City,Hotpot Restaurant,Fast Food Restaurant,Cha Chaan Teng,Theater,Dessert Shop,Noodle House,Airport,Perfume Shop,Paper / Office Supplies Store,Park
3,Kwai Tsing,Chinese Restaurant,Cha Chaan Teng,Café,Japanese Restaurant,Fast Food Restaurant,Restaurant,Coffee Shop,Sushi Restaurant,Supermarket,Cantonese Restaurant
4,Kwun Tong,Chinese Restaurant,Cha Chaan Teng,Café,Japanese Restaurant,Fast Food Restaurant,Restaurant,Coffee Shop,Sushi Restaurant,Supermarket,Cantonese Restaurant
5,Marine,Cantonese Restaurant,Café,Coffee Shop,Italian Restaurant,Japanese Restaurant,Hong Kong Restaurant,Hotel,Thai Restaurant,Chinese Restaurant,Bakery
6,North,Waterfall,Airport,Pharmacy,Noodle House,Organic Grocery,Paper / Office Supplies Store,Park,Performing Arts Venue,Perfume Shop,Pizza Place
7,Sha Tin,Chinese Restaurant,Fast Food Restaurant,Bus Station,Coffee Shop,Noodle House,Convenience Store,Karaoke Bar,Recreation Center,Dumpling Restaurant,Café
8,Sham Shui Po,Noodle House,Italian Restaurant,Dessert Shop,Shopping Mall,Hong Kong Restaurant,Snack Place,Indonesian Restaurant,Vietnamese Restaurant,Cha Chaan Teng,Burger Joint
9,Southern,Waterfall,Grocery Store,Trail,Airport,Performing Arts Venue,Multiplex,Noodle House,Organic Grocery,Paper / Office Supplies Store,Park


###
###
### CLUSTER the Districts using k-means...

In [32]:
from sklearn.cluster import KMeans

In [33]:
# Run k-means to cluster the neighborhood into 5 clusters.

# set number of clusters
kclusters = 5

hk_grouped_clustering = hk_grouped.drop('District', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(hk_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 0, 3, 2, 2, 2, 1, 2, 2, 4], dtype=int32)

In [34]:
# Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

# add clustering labels in column 1
districts_venues_sorted.insert(1, 'Cluster Labels', kmeans.labels_)

In [35]:
print('\n', 'Here is a sample of the top 10 venue types for each District,', \
    '\n', 'with the Cluster label added (in districts_venues_sorted)...')

print()
print(districts_venues_sorted.shape)
districts_venues_sorted.head(20)


 Here is a sample of the top 10 venue types for each District, 
 with the Cluster label added (in districts_venues_sorted)...

(17, 12)


Unnamed: 0,District,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central and Western,2,Tapas Restaurant,Mexican Restaurant,Café,Coffee Shop,Italian Restaurant,Thai Restaurant,Hong Kong Restaurant,French Restaurant,Tea Room,Garden
1,Islands,0,Airport Lounge,Coffee Shop,Bakery,Clothing Store,Airport,Fujian Restaurant,Chocolate Shop,Cocktail Bar,Convenience Store,Dumpling Restaurant
2,Kowloon City,3,Hotpot Restaurant,Fast Food Restaurant,Cha Chaan Teng,Theater,Dessert Shop,Noodle House,Airport,Perfume Shop,Paper / Office Supplies Store,Park
3,Kwai Tsing,2,Chinese Restaurant,Cha Chaan Teng,Café,Japanese Restaurant,Fast Food Restaurant,Restaurant,Coffee Shop,Sushi Restaurant,Supermarket,Cantonese Restaurant
4,Kwun Tong,2,Chinese Restaurant,Cha Chaan Teng,Café,Japanese Restaurant,Fast Food Restaurant,Restaurant,Coffee Shop,Sushi Restaurant,Supermarket,Cantonese Restaurant
5,Marine,2,Cantonese Restaurant,Café,Coffee Shop,Italian Restaurant,Japanese Restaurant,Hong Kong Restaurant,Hotel,Thai Restaurant,Chinese Restaurant,Bakery
6,North,1,Waterfall,Airport,Pharmacy,Noodle House,Organic Grocery,Paper / Office Supplies Store,Park,Performing Arts Venue,Perfume Shop,Pizza Place
7,Sha Tin,2,Chinese Restaurant,Fast Food Restaurant,Bus Station,Coffee Shop,Noodle House,Convenience Store,Karaoke Bar,Recreation Center,Dumpling Restaurant,Café
8,Sham Shui Po,2,Noodle House,Italian Restaurant,Dessert Shop,Shopping Mall,Hong Kong Restaurant,Snack Place,Indonesian Restaurant,Vietnamese Restaurant,Cha Chaan Teng,Burger Joint
9,Southern,4,Waterfall,Grocery Store,Trail,Airport,Performing Arts Venue,Multiplex,Noodle House,Organic Grocery,Paper / Office Supplies Store,Park


In [36]:
# We only need latlong values to add to the dataset, so let's
# reduce the number of columns we take from the coordsdf dataset.
hk_merged = coordsdf[['District','Latitude','Longitude']]

print()
print(hk_merged.shape)
hk_merged.head(2)


(19, 3)


Unnamed: 0,District,Latitude,Longitude
0,Central and Western,22.28219,114.14486
1,Eastern,22.27209,114.221396


In [37]:
# merge datasets to add latitude/longitude for each district.
hk_merged = hk_merged.join( \
        districts_venues_sorted.set_index('District'), on='District')

In [38]:
# Look at the results.
print('\n', 'Here is a sample of the top 10 venue types for each District,', \
    '\n', 'with LatLong added (in hk_merged)...')

print()
print(hk_merged.shape)
hk_merged.head(2)


 Here is a sample of the top 10 venue types for each District, 
 with LatLong added (in hk_merged)...

(19, 14)


Unnamed: 0,District,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central and Western,22.28219,114.14486,2.0,Tapas Restaurant,Mexican Restaurant,Café,Coffee Shop,Italian Restaurant,Thai Restaurant,Hong Kong Restaurant,French Restaurant,Tea Room,Garden
1,Eastern,22.27209,114.221396,,,,,,,,,,,


In [39]:
# I notice that "Eastern" and "Sai Kung" districts have no data, 

# simply drop whole row with NaN in "price" column
hk_merged.dropna(subset=["Cluster Labels"], axis=0, inplace=True)

# reset index, because we droped two rows
hk_merged.reset_index(drop=True, inplace=True)

# Also that the Cluster Label column has changed to float,
# so change it back to int.
hk_merged["Cluster Labels"] = hk_merged["Cluster Labels"].astype("int")

In [40]:
# Look at the results.
print('\n', 'Here is a sample of the top 10 venue types for each District,', \
    '\n', 'with rows removed where Cluster Labels = Nan (in hk_merged)...')

print()
print(hk_merged.shape)
hk_merged.head(2) # check the last columns!


 Here is a sample of the top 10 venue types for each District, 
 with rows removed where Cluster Labels = Nan (in hk_merged)...

(17, 14)


Unnamed: 0,District,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central and Western,22.28219,114.14486,2,Tapas Restaurant,Mexican Restaurant,Café,Coffee Shop,Italian Restaurant,Thai Restaurant,Hong Kong Restaurant,French Restaurant,Tea Room,Garden
1,Southern,22.25801,114.15308,4,Waterfall,Grocery Store,Trail,Airport,Performing Arts Venue,Multiplex,Noodle House,Organic Grocery,Paper / Office Supplies Store,Park


###
###
### Visualise the Clusters...

In [41]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [42]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip( \
                    hk_merged['Latitude'], \
                    hk_merged['Longitude'], \
                    hk_merged['District'], \
                    hk_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

###
###
### Examine each cluster to determine the venue categories that distinguish each cluster

In [43]:
print('\n', '    Cluster 1', '\n')
hk_merged.loc[hk_merged['Cluster Labels'] == 0, hk_merged.columns[[0] + list(range(4, hk_merged.shape[1]))]]


     Cluster 1 



Unnamed: 0,District,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Islands,Airport Lounge,Coffee Shop,Bakery,Clothing Store,Airport,Fujian Restaurant,Chocolate Shop,Cocktail Bar,Convenience Store,Dumpling Restaurant


In [44]:
print('\n', '    Cluster 2', '\n')
hk_merged.loc[hk_merged['Cluster Labels'] == 1, hk_merged.columns[[0] + list(range(4, hk_merged.shape[1]))]]


     Cluster 2 



Unnamed: 0,District,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,North,Waterfall,Airport,Pharmacy,Noodle House,Organic Grocery,Paper / Office Supplies Store,Park,Performing Arts Venue,Perfume Shop,Pizza Place


In [45]:
print('\n', '    Cluster 3', '\n')
hk_merged.loc[hk_merged['Cluster Labels'] == 2, hk_merged.columns[[0] + list(range(4, hk_merged.shape[1]))]]


     Cluster 3 



Unnamed: 0,District,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central and Western,Tapas Restaurant,Mexican Restaurant,Café,Coffee Shop,Italian Restaurant,Thai Restaurant,Hong Kong Restaurant,French Restaurant,Tea Room,Garden
2,Wan Chai,Cantonese Restaurant,Coffee Shop,Japanese Restaurant,Hong Kong Restaurant,Hotel,Chinese Restaurant,Café,Italian Restaurant,Bakery,Thai Restaurant
3,Sham Shui Po,Noodle House,Italian Restaurant,Dessert Shop,Shopping Mall,Hong Kong Restaurant,Snack Place,Indonesian Restaurant,Vietnamese Restaurant,Cha Chaan Teng,Burger Joint
5,Kwun Tong,Chinese Restaurant,Cha Chaan Teng,Café,Japanese Restaurant,Fast Food Restaurant,Restaurant,Coffee Shop,Sushi Restaurant,Supermarket,Cantonese Restaurant
6,Wong Tai Sin,Cha Chaan Teng,Shopping Mall,Supermarket,Market,Pizza Place,Noodle House,Café,Cantonese Restaurant,Chinese Restaurant,Park
7,Yau Tsim Mong,Hotel,Dessert Shop,Chinese Restaurant,Hong Kong Restaurant,Cantonese Restaurant,Seafood Restaurant,Indian Restaurant,Noodle House,Café,Dim Sum Restaurant
9,Kwai Tsing,Chinese Restaurant,Cha Chaan Teng,Café,Japanese Restaurant,Fast Food Restaurant,Restaurant,Coffee Shop,Sushi Restaurant,Supermarket,Cantonese Restaurant
11,Sha Tin,Chinese Restaurant,Fast Food Restaurant,Bus Station,Coffee Shop,Noodle House,Convenience Store,Karaoke Bar,Recreation Center,Dumpling Restaurant,Café
12,Tai Po,Convenience Store,Bus Stop,Supermarket,Furniture / Home Store,Bus Station,Butcher,Fast Food Restaurant,Cantonese Restaurant,Pizza Place,Market
13,Tsuen Wan,Auto Garage,Bakery,Bus Station,Dim Sum Restaurant,Cha Chaan Teng,Market,Hong Kong Restaurant,Performing Arts Venue,Perfume Shop,Pharmacy


In [46]:
print('\n', '    Cluster 4', '\n')
hk_merged.loc[hk_merged['Cluster Labels'] == 3, hk_merged.columns[[0] + list(range(4, hk_merged.shape[1]))]]


     Cluster 4 



Unnamed: 0,District,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Kowloon City,Hotpot Restaurant,Fast Food Restaurant,Cha Chaan Teng,Theater,Dessert Shop,Noodle House,Airport,Perfume Shop,Paper / Office Supplies Store,Park


In [47]:
print('\n', '    Cluster 5', '\n')
hk_merged.loc[hk_merged['Cluster Labels'] == 4, hk_merged.columns[[0] + list(range(4, hk_merged.shape[1]))]]


     Cluster 5 



Unnamed: 0,District,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Southern,Waterfall,Grocery Store,Trail,Airport,Performing Arts Venue,Multiplex,Noodle House,Organic Grocery,Paper / Office Supplies Store,Park
