# IBM Data Science Capstone Project

## Clustering Hong Kong MTR Stations

#### By Rafael Lobo

---

## Gathering MTR Station Coordinates and Routes

In [3]:
import numpy as np 
import pandas as pd
import json
#!pip install geopy
from geopy.geocoders import Nominatim
#!pip install geopandas
import geopandas as gpd
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
#! pip install folium==0.5.0
import folium 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_colwidth',None)

Collecting folium==0.5.0
  Downloading folium-0.5.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 5.5 MB/s eta 0:00:011
[?25hCollecting branca
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25ldone
[?25h  Created wheel for folium: filename=folium-0.5.0-py3-none-any.whl size=76240 sha256=dc3068036afb6c7c784344630636d7751331e21b2d799fd580284a8db2557da2
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/b2/2f/2c/109e446b990d663ea5ce9b078b5e7c1a9c45cca91f377080f8
Successfully built folium
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.5.0


### Used Overpass Turbo to generate a map of the HK MTR stations and their routes. Overpass Turbo is a web front end for the Overpass API, which runs queries against OpenStreetMap elements.

In [8]:
# The Overpass Turbo query result was exported as GeoJSON and committed to the
# author's GitHub repo; this is the raw-file URL of that export.
github='https://raw.githubusercontent.com/prtlobo/Coursera_Capstone/master/export.geojson'
# GeoPandas reads the GeoJSON straight from the URL (network access required).
mtr_geo=gpd.read_file(github)

In [5]:
# The Overpass export contains several nodes per station and gives no way to
# tell which one is the "real" station location, so the station list is scraped
# from Wikipedia instead (coordinates are added by geocoding in a later cell).
pd.set_option('display.max_rows', None)
wiki='https://en.wikipedia.org/wiki/List_of_MTR_stations'
response=requests.get(wiki)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
site=response.text
stations=pd.read_html(site,flavor='bs4')

# Livery colour and name of each line, in the order the per-line tables are
# expected to appear on the Wikipedia page.
# NOTE(review): `colors` shadows the `matplotlib.colors as colors` import from the
# first cell; the name is kept because later cells may rely on this list.
colors=['#53B7E8','#00AB4E','#ED1D24','#007DC5','#F7943E','#00888A','#7D499D','#A3238F','#923011','#F173AC','#BAC429']
lines=['East Rail line','Kwun Tong line','Tsuen Wan line','Island line','Tung Chung line','Airport Express','Tseung Kwan O line','West Rail line','Tuen Ma line Phase 1','Disneyland Resort line','South Island line']
assert len(colors) == len(lines), 'every line needs exactly one colour'
assert len(stations) >= len(lines), 'Wikipedia page returned fewer tables than expected'

# Tag every per-line table with its line name and colour before concatenating.
for line_table, line_name, line_color in zip(stations, lines, colors):
    line_table.insert(2, 'Line', line_name)
    line_table.insert(8, 'Line Color', line_color)

mtr = (
    pd.concat(stations[:len(lines)])
    .drop(columns=['Livery','Photo','Connections','Opened','Notes'])
    .reset_index(drop=True)
)

# Clean up station names, keyed by row position in the concatenated table
# (column 0 is 'Name').
# NOTE(review): these positions depend on the current layout of the Wikipedia
# tables; re-check them if the page changes.
name_fixes = {
    6: 'University',
    12: 'Mong Kok East',
    13: 'Hung Hom',
    16: 'Yau Ma Tei',
    17: 'Mong Kok',
    36: 'Mei Foo',
    41: 'Mong Kok',
    42: 'Yau Ma Tei',
    46: 'Central',
}
for row, clean_name in name_fixes.items():
    mtr.iloc[row, 0] = clean_name

mtr.head()

Unnamed: 0,Name,Line,District,Code,Line Color
0,Lo Wu,East Rail line,North,LOW,#53B7E8
1,Lok Ma Chau,East Rail line,Yuen Long,LMC,#53B7E8
2,Sheung Shui,East Rail line,North,SHS,#53B7E8
3,Fanling,East Rail line,North,FAN,#53B7E8
4,Tai Wo,East Rail line,Tai Po,TWO,#53B7E8


In [6]:
# Use the Nominatim geocoder to add latitude and longitude coordinates.
geolocator = Nominatim(user_agent="HK")

def LatLong(station):
    """Geocode one MTR station name.

    Parameters
    ----------
    station : str
        Station name without the "Station" suffix, e.g. ``'Mong Kok'``.

    Returns
    -------
    pandas.Series
        ``[latitude, longitude]`` of the first geocoder match, or
        ``[NaN, NaN]`` when the geocoder finds nothing for the station.
    """
    address='{} Station, Hong Kong'.format(station)
    location = geolocator.geocode(address)
    # geocode() returns None when there is no match; without this guard a single
    # unknown station would abort the whole apply() with an AttributeError.
    if location is None:
        return pd.Series([np.nan, np.nan])
    return pd.Series([location.latitude, location.longitude])

# One geocoder request per row (network access required).
mtr[['Latitude','Longitude']]=mtr['Name'].apply(LatLong)
mtr.head()

Unnamed: 0,Name,Line,District,Code,Line Color,Latitude,Longitude
0,Lo Wu,East Rail line,North,LOW,#53B7E8,22.528975,114.113267
1,Lok Ma Chau,East Rail line,Yuen Long,LMC,#53B7E8,22.515505,114.066628
2,Sheung Shui,East Rail line,North,SHS,#53B7E8,22.501223,114.12795
3,Fanling,East Rail line,North,FAN,#53B7E8,22.491573,114.140308
4,Tai Wo,East Rail line,Tai Po,TWO,#53B7E8,22.45101,114.160773


In [11]:
# Keep only the features whose id contains 'relation'.
is_relation = mtr_geo['id'].str.contains('relation')
routes = mtr_geo.loc[is_relation]
routes.head()

Unnamed: 0,id,@id,brand,brand:en,brand:zh,colour,from,from:en,from:zh,name,...,maxspeed,voltage,electrified,passenger_lines,train,opening_hours,name:zh-Hans,name:zh-Hant,@relations,geometry
0,relation/269669,relation/269669,港鐵 MTR,港鐵,港鐵 MTR,#ed1d24,荃灣 Tsuen Wan,Tsuen Wan,荃灣,港鐵荃灣綫 MTR Tsuen Wan Line (南行 Southbound),...,,,,,,,,,,"MULTILINESTRING ((114.10905 22.37924, 114.1096..."
1,relation/269672,relation/269672,,,,#7d499d,北角 North Point,,,將軍澳綫 Tseung Kwan O Line (北行 Northbound),...,,,,,,,,,,"MULTILINESTRING ((114.25686 22.32397, 114.2579..."
2,relation/272125,relation/272125,港鐵 MTR,港鐵,港鐵 MTR,#00ab4e,調景嶺 Tiu Keng Leng,Tiu Keng Leng,調景嶺,觀塘綫 Kwun Tong Line (調景嶺 Tiu Keng Leng → 黃埔 Wha...,...,,,,,,,,,,"LINESTRING (114.25233 22.30411, 114.25126 22.3..."
3,relation/4248589,relation/4248589,,,,#53b7e8,落馬洲 Lok Ma Chau,Lok Ma Chau,落馬洲,東鐵綫 East Rail Line (落馬洲 Lok Ma Chau → 紅磡 Hung ...,...,,,,,,,,,,"MULTILINESTRING ((114.06335 22.51444, 114.0637..."
4,relation/4248590,relation/4248590,,,,#53b7e8,羅湖 Lo Wu,,,東鐵綫 East Rail Line (羅湖 Lo Wu → 紅磡 Hung Hom),...,,,,,,,,,,"MULTILINESTRING ((114.11351 22.52818, 114.1135..."


In [12]:
# Dark base map using the same centre and zoom as the other maps in this notebook.
map_hk = folium.Map(
    location=[22.3193,114.1694],
    tiles='Cartodb dark_matter',
    zoom_start=11,
)

# Overlay the route geometries as a GeoJSON layer.
gjson = folium.features.GeoJson(routes)
gjson.add_to(map_hk)

map_hk

## Map of MTR stations and Routes

In [None]:
map_hk = folium.Map(location=[22.3193,114.1694], tiles='Cartodb dark_matter', zoom_start=11)

# Add one circle marker per station, coloured with the livery colour of its line.
# The colour column is created as 'Line Color' in the scraping cell (it was
# previously read here as 'Color', which raises a KeyError).
# Rows without coordinates are skipped because a marker cannot be placed at NaN.
located = mtr.dropna(subset=['Latitude', 'Longitude'])
for lat, lng, station, clr in zip(located['Latitude'], located['Longitude'], located['Name'], located['Line Color']):
    label = '{} Station'.format(station)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=clr,
        fill=True,
        fill_color=clr,
        fill_opacity=0.7).add_to(map_hk)

map_hk

In [99]:
# Node features come from the Overpass GeoJSON export (`mtr_geo`).
# `mtr` is the Wikipedia station table and has no 'id' column, so filtering it
# here only worked with stale kernel state.
nodes=mtr_geo[mtr_geo['id'].str.contains('node')]
nodes.head()

Unnamed: 0,id,@id,brand,brand:en,brand:zh,colour,from,from:en,from:zh,name,...,maxspeed,voltage,electrified,passenger_lines,train,opening_hours,name:zh-Hans,name:zh-Hant,@relations,geometry
55,node/24357989,node/24357989,,,,,,,,,...,,,,,,,,,"[ { ""role"": ""stop_entry_only"", ""rel"": 5317239,...",POINT (114.10742 22.35834)
56,node/243844812,node/243844812,,,,,,,,,...,,,,,,,,,"[ { ""role"": ""stop"", ""rel"": 4248592, ""reltags"":...",POINT (114.11314 22.52820)
57,node/243844834,node/243844834,,,,,,,,,...,,,,,,,,,"[ { ""role"": ""stop"", ""rel"": 4248591, ""reltags"":...",POINT (114.12805 22.50112)
58,node/243844839,node/243844839,,,,,,,,,...,,,,,,,,,"[ { ""role"": ""stop"", ""rel"": 4248591, ""reltags"":...",POINT (114.13867 22.49204)
59,node/243844884,node/243844884,,,,,,,,,...,,,,,,,,,"[ { ""role"": ""stop"", ""rel"": 4248591, ""reltags"":...",POINT (114.17077 22.44451)


In [22]:
map_hk = folium.Map(location=[22.3193,114.1694], tiles='Cartodb dark_matter', zoom_start=11)

# Draw every feature of the Overpass export. The GeoJSON data is in `mtr_geo`;
# `mtr` is the plain Wikipedia table and has no geometry to render.
gjson = folium.features.GeoJson(mtr_geo)

map_hk.add_child(gjson)

map_hk

## Create Map of Current Stations