# Capstone Project - The Battle of the Neighborhoods (Week 2)

## Code

Lets start by downloading the required dependencies.

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from urllib.request import urlopen # library to open URLs
from bs4 import BeautifulSoup # Package to extract data from HTML files

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # Package to obtain coordinates

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

# import k-means from clustering stage
from sklearn.cluster import KMeans

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0           conda-forge
    geopy:          

Defining the URL of the wikipedia page we are scraping the data from and opening it to obtain html.

In [2]:
url = "https://en.wikipedia.org/wiki/G_postcode_area#Coverage"
html = urlopen(url)

Using BeautifulSoup to parse the HTML into the correct format and looking at the underlying code.

In [3]:
soup = BeautifulSoup(html, "lxml")
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   G postcode area - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"72856667-b55c-49f1-ae8c-0348dff1b509","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"G_postcode_area","wgTitle":"G postcode area","wgCurRevisionId":962677243,"wgRevisionId":962677243,"wgArticleId":10109112,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Use dmy dates from May 2017","Use British English from May 2017","Articles using KML from Wikidata","Glasgow",

Locating the table amongst the html code.

In [4]:
right_table=soup.find('table', class_='wikitable sortable')
right_table

<table class="wikitable sortable" style="font-size:95%">
<tbody><tr style="white-space:nowrap">
<th>Postcode district
</th>
<th><a href="/wiki/Post_town" title="Post town">Post town</a>
</th>
<th>Coverage
</th>
<th>Local authority area
</th></tr>
<tr>
<td>G1
</td>
<td><a href="/wiki/Glasgow" title="Glasgow">GLASGOW</a>
</td>
<td><i>Former C1 district:</i> <a href="/wiki/Merchant_City" title="Merchant City">Merchant City</a>
</td>
<td><a href="/wiki/Politics_of_Glasgow" title="Politics of Glasgow">Glasgow City</a>
</td></tr>
<tr>
<td>G2
</td>
<td>GLASGOW
</td>
<td><i>Former C2 district:</i> <a href="/wiki/Blythswood_Hill" title="Blythswood Hill">Blythswood Hill</a>, <a href="/wiki/Anderston" title="Anderston">Anderston</a> (part)
</td>
<td>Glasgow City
</td></tr>
<tr>
<td>G3
</td>
<td>GLASGOW
</td>
<td><i>Former C3 district:</i> <a href="/wiki/Anderston" title="Anderston">Anderston</a>, <a href="/wiki/Finnieston" title="Finnieston">Finnieston</a>, <a href="/wiki/Garnethill" title="Garne

Using a FOR loop to find all the rows of the table.

In [5]:
A=[]
B=[]
C=[]
D=[]

for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==4:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].findAll(text=True))
        D.append(cells[3].find(text=True))

Converting the list to a pandas dataframe.

In [6]:
df=pd.DataFrame(A,columns=['Postcode district'])
df['Post town']=B
df['Coverage']=C
df['Local authority area']=D
df

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area
0,G1,GLASGOW,"[Former C1 district:, , Merchant City, ]",Glasgow City
1,G2,GLASGOW,"[Former C2 district:, , Blythswood Hill, , , ...",Glasgow City
2,G3,GLASGOW,"[Former C3 district:, , Anderston, , , Finnie...",Glasgow City
3,G4,GLASGOW,"[Former C4 district:, , Calton, (part), , Co...",Glasgow City
4,G5,GLASGOW,"[Former C5 district:, , Gorbals, ]",Glasgow City
5,G9,GLASGOW,"[Newspaper Competitions, ]",non-geographic
6,G11,GLASGOW,"[Former W1 district:, , Broomhill, , , Partic...",Glasgow City
7,G12,GLASGOW,"[Former W2 district:, , West End, (part), Cl...",Glasgow City
8,G13,GLASGOW,"[Former W3 district:, , Anniesland, , , Knigh...",Glasgow City
9,G14,GLASGOW,"[Former W4 district:, , Whiteinch, , , Scotst...",Glasgow City


Converting the Coverage column from object to string datatype to enable string operations to be performed.

In [7]:
df['Coverage'] = df['Coverage'].astype('str')
df

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area
0,G1,GLASGOW,"['Former C1 district:', ' ', 'Merchant City', ...",Glasgow City
1,G2,GLASGOW,"['Former C2 district:', ' ', 'Blythswood Hill'...",Glasgow City
2,G3,GLASGOW,"['Former C3 district:', ' ', 'Anderston', ', '...",Glasgow City
3,G4,GLASGOW,"['Former C4 district:', ' ', 'Calton', ' (part...",Glasgow City
4,G5,GLASGOW,"['Former C5 district:', ' ', 'Gorbals', '\n']",Glasgow City
5,G9,GLASGOW,"['Newspaper Competitions', '\n']",non-geographic
6,G11,GLASGOW,"['Former W1 district:', ' ', 'Broomhill', ', '...",Glasgow City
7,G12,GLASGOW,"['Former W2 district:', ' ', 'West End', ' (pa...",Glasgow City
8,G13,GLASGOW,"['Former W3 district:', ' ', 'Anniesland', ', ...",Glasgow City
9,G14,GLASGOW,"['Former W4 district:', ' ', 'Whiteinch', ', '...",Glasgow City


Removing the square brackets from Coverage column.

In [8]:
df['Coverage'] = df['Coverage'].str.replace('[', '')
df['Coverage'] = df['Coverage'].str.replace(']', '')
df

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area
0,G1,GLASGOW,"'Former C1 district:', ' ', 'Merchant City', '\n'",Glasgow City
1,G2,GLASGOW,"'Former C2 district:', ' ', 'Blythswood Hill',...",Glasgow City
2,G3,GLASGOW,"'Former C3 district:', ' ', 'Anderston', ', ',...",Glasgow City
3,G4,GLASGOW,"'Former C4 district:', ' ', 'Calton', ' (part)...",Glasgow City
4,G5,GLASGOW,"'Former C5 district:', ' ', 'Gorbals', '\n'",Glasgow City
5,G9,GLASGOW,"'Newspaper Competitions', '\n'",non-geographic
6,G11,GLASGOW,"'Former W1 district:', ' ', 'Broomhill', ', ',...",Glasgow City
7,G12,GLASGOW,"'Former W2 district:', ' ', 'West End', ' (par...",Glasgow City
8,G13,GLASGOW,"'Former W3 district:', ' ', 'Anniesland', ', '...",Glasgow City
9,G14,GLASGOW,"'Former W4 district:', ' ', 'Whiteinch', ', ',...",Glasgow City


Merging the G51 postcodes and dropping one row.

In [9]:
df.at[26, "Postcode district"]= "G51"
df.at[26, "Coverage"]= "Govan, Ibrox, Drumoyne, Cessnock, Plantation"
df.drop([27], inplace=True)
df = df.reset_index(drop=True)
df

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area
0,G1,GLASGOW,"'Former C1 district:', ' ', 'Merchant City', '\n'",Glasgow City
1,G2,GLASGOW,"'Former C2 district:', ' ', 'Blythswood Hill',...",Glasgow City
2,G3,GLASGOW,"'Former C3 district:', ' ', 'Anderston', ', ',...",Glasgow City
3,G4,GLASGOW,"'Former C4 district:', ' ', 'Calton', ' (part)...",Glasgow City
4,G5,GLASGOW,"'Former C5 district:', ' ', 'Gorbals', '\n'",Glasgow City
5,G9,GLASGOW,"'Newspaper Competitions', '\n'",non-geographic
6,G11,GLASGOW,"'Former W1 district:', ' ', 'Broomhill', ', ',...",Glasgow City
7,G12,GLASGOW,"'Former W2 district:', ' ', 'West End', ' (par...",Glasgow City
8,G13,GLASGOW,"'Former W3 district:', ' ', 'Anniesland', ', '...",Glasgow City
9,G14,GLASGOW,"'Former W4 district:', ' ', 'Whiteinch', ', ',...",Glasgow City


Reading the .csv file with all UK postcodes coordinates into a pandas dataframe.

In [10]:
coord = pd.read_csv('https://www.freemaptools.com/download/outcode-postcodes/postcode-outcodes.csv')
coord.head()

Unnamed: 0,id,postcode,latitude,longitude
0,2,AB10,57.13514,-2.11731
1,3,AB11,57.13875,-2.09089
2,4,AB12,57.101,-2.1106
3,5,AB13,57.10801,-2.23776
4,6,AB14,57.10076,-2.27073


Selecting only the Glasgow (G) post codes.

In [11]:
coord = coord[coord.postcode.str.contains('G',case=False)]
coord

Unnamed: 0,id,postcode,latitude,longitude
630,632,DG1,55.07353,-3.58045
631,633,DG10,55.31977,-3.43735
632,634,DG11,55.11331,-3.33584
633,635,DG12,54.98928,-3.25044
634,636,DG13,55.17291,-3.02849
635,637,DG14,55.08069,-2.98539
636,638,DG16,54.99731,-3.06777
637,639,DG2,55.06442,-3.65683
638,640,DG3,55.23678,-3.79927
639,641,DG4,55.37555,-3.95236


Sorting by postcode to group the Glasgow postcodes together.

In [12]:
coord = coord.sort_values(by=['postcode'])
coord

Unnamed: 0,id,postcode,latitude,longitude
630,632,DG1,55.07353,-3.58045
631,633,DG10,55.31977,-3.43735
632,634,DG11,55.11331,-3.33584
633,635,DG12,54.98928,-3.25044
634,636,DG13,55.17291,-3.02849
635,637,DG14,55.08069,-2.98539
636,638,DG16,54.99731,-3.06777
637,639,DG2,55.06442,-3.65683
638,640,DG3,55.23678,-3.79927
639,641,DG4,55.37555,-3.95236


Resetting the index.

In [13]:
coord = coord.reset_index(drop=True)
coord

Unnamed: 0,id,postcode,latitude,longitude
0,632,DG1,55.07353,-3.58045
1,633,DG10,55.31977,-3.43735
2,634,DG11,55.11331,-3.33584
3,635,DG12,54.98928,-3.25044
4,636,DG13,55.17291,-3.02849
5,637,DG14,55.08069,-2.98539
6,638,DG16,54.99731,-3.06777
7,639,DG2,55.06442,-3.65683
8,640,DG3,55.23678,-3.79927
9,641,DG4,55.37555,-3.95236


Slicing the Glasgow postcodes.

In [14]:
coord = coord.iloc[15:70]
coord

Unnamed: 0,id,postcode,latitude,longitude
15,905,G1,55.86038,-4.24671
16,906,G11,55.87356,-4.31142
17,907,G12,55.88006,-4.30061
18,908,G13,55.89358,-4.3462
19,909,G14,55.88095,-4.34864
20,910,G15,55.9094,-4.36476
21,911,G2,55.86382,-4.2549
22,912,G20,55.8858,-4.28176
23,913,G21,55.88063,-4.22069
24,914,G22,55.88998,-4.25002


Sorting the postcodes into the correct order.

In [15]:
!pip install natsort
from natsort import natsorted

coord = coord.set_index('postcode').reindex(natsorted(coord.postcode.tolist(), key=lambda y: y.lower())).reset_index()
coord

Collecting natsort
  Downloading https://files.pythonhosted.org/packages/0f/65/81883897f4aaa1e53deaa65137318cfe80b36ce013c2e86f8fd0843cfa02/natsort-7.0.1-py3-none-any.whl
Installing collected packages: natsort
Successfully installed natsort-7.0.1


Unnamed: 0,postcode,id,latitude,longitude
0,G1,905,55.86038,-4.24671
1,G2,911,55.86382,-4.2549
2,G3,916,55.86619,-4.27262
3,G4,921,55.86837,-4.25196
4,G5,929,55.84769,-4.25237
5,G9,2905,55.868635,-4.241903
6,G11,906,55.87356,-4.31142
7,G12,907,55.88006,-4.30061
8,G13,908,55.89358,-4.3462
9,G14,909,55.88095,-4.34864


Dropping the id column as it is not needed.

In [16]:
coord = coord.drop(['id'], axis=1)
coord.head()

Unnamed: 0,postcode,latitude,longitude
0,G1,55.86038,-4.24671
1,G2,55.86382,-4.2549
2,G3,55.86619,-4.27262
3,G4,55.86837,-4.25196
4,G5,55.84769,-4.25237


Renaming postcode column to match the main dataframe.

In [17]:
coord.rename(columns={"postcode": "Postcode district"}, inplace=True)
coord

Unnamed: 0,Postcode district,latitude,longitude
0,G1,55.86038,-4.24671
1,G2,55.86382,-4.2549
2,G3,55.86619,-4.27262
3,G4,55.86837,-4.25196
4,G5,55.84769,-4.25237
5,G9,55.868635,-4.241903
6,G11,55.87356,-4.31142
7,G12,55.88006,-4.30061
8,G13,55.89358,-4.3462
9,G14,55.88095,-4.34864


Setting Postcode district columns equal so we can merge dataframes.

In [18]:
df['Postcode district'] = coord['Postcode district'].copy()
df

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area
0,G1,GLASGOW,"'Former C1 district:', ' ', 'Merchant City', '\n'",Glasgow City
1,G2,GLASGOW,"'Former C2 district:', ' ', 'Blythswood Hill',...",Glasgow City
2,G3,GLASGOW,"'Former C3 district:', ' ', 'Anderston', ', ',...",Glasgow City
3,G4,GLASGOW,"'Former C4 district:', ' ', 'Calton', ' (part)...",Glasgow City
4,G5,GLASGOW,"'Former C5 district:', ' ', 'Gorbals', '\n'",Glasgow City
5,G9,GLASGOW,"'Newspaper Competitions', '\n'",non-geographic
6,G11,GLASGOW,"'Former W1 district:', ' ', 'Broomhill', ', ',...",Glasgow City
7,G12,GLASGOW,"'Former W2 district:', ' ', 'West End', ' (par...",Glasgow City
8,G13,GLASGOW,"'Former W3 district:', ' ', 'Anniesland', ', '...",Glasgow City
9,G14,GLASGOW,"'Former W4 district:', ' ', 'Whiteinch', ', ',...",Glasgow City


Checking that they are in fact equal.

In [19]:
df['Postcode district'].equals(coord['Postcode district'])

True

Joining the dataframes on the postcodes.

In [20]:
df1 = pd.merge(df, coord, on='Postcode district', how='inner')
df1

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area,latitude,longitude
0,G1,GLASGOW,"'Former C1 district:', ' ', 'Merchant City', '\n'",Glasgow City,55.86038,-4.24671
1,G2,GLASGOW,"'Former C2 district:', ' ', 'Blythswood Hill',...",Glasgow City,55.86382,-4.2549
2,G3,GLASGOW,"'Former C3 district:', ' ', 'Anderston', ', ',...",Glasgow City,55.86619,-4.27262
3,G4,GLASGOW,"'Former C4 district:', ' ', 'Calton', ' (part)...",Glasgow City,55.86837,-4.25196
4,G5,GLASGOW,"'Former C5 district:', ' ', 'Gorbals', '\n'",Glasgow City,55.84769,-4.25237
5,G9,GLASGOW,"'Newspaper Competitions', '\n'",non-geographic,55.868635,-4.241903
6,G11,GLASGOW,"'Former W1 district:', ' ', 'Broomhill', ', ',...",Glasgow City,55.87356,-4.31142
7,G12,GLASGOW,"'Former W2 district:', ' ', 'West End', ' (par...",Glasgow City,55.88006,-4.30061
8,G13,GLASGOW,"'Former W3 district:', ' ', 'Anniesland', ', '...",Glasgow City,55.89358,-4.3462
9,G14,GLASGOW,"'Former W4 district:', ' ', 'Whiteinch', ', ',...",Glasgow City,55.88095,-4.34864


Dropping the non-geographic post codes and those that aren't in the city of Glasgow.

In [21]:
df1 = df1[df1['Post town'].str.contains('Glasgow',case=False)]
df1 = df1[~df1['Local authority area'].str.contains("non-geographic")]
df1

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area,latitude,longitude
0,G1,GLASGOW,"'Former C1 district:', ' ', 'Merchant City', '\n'",Glasgow City,55.86038,-4.24671
1,G2,GLASGOW,"'Former C2 district:', ' ', 'Blythswood Hill',...",Glasgow City,55.86382,-4.2549
2,G3,GLASGOW,"'Former C3 district:', ' ', 'Anderston', ', ',...",Glasgow City,55.86619,-4.27262
3,G4,GLASGOW,"'Former C4 district:', ' ', 'Calton', ' (part)...",Glasgow City,55.86837,-4.25196
4,G5,GLASGOW,"'Former C5 district:', ' ', 'Gorbals', '\n'",Glasgow City,55.84769,-4.25237
6,G11,GLASGOW,"'Former W1 district:', ' ', 'Broomhill', ', ',...",Glasgow City,55.87356,-4.31142
7,G12,GLASGOW,"'Former W2 district:', ' ', 'West End', ' (par...",Glasgow City,55.88006,-4.30061
8,G13,GLASGOW,"'Former W3 district:', ' ', 'Anniesland', ', '...",Glasgow City,55.89358,-4.3462
9,G14,GLASGOW,"'Former W4 district:', ' ', 'Whiteinch', ', ',...",Glasgow City,55.88095,-4.34864
10,G15,GLASGOW,"'Former W5 district:', ' ', 'Drumchapel', '\n'",Glasgow City,55.9094,-4.36476


Tidying the Coverage column by removing unwanted symbols.

In [22]:
df1['Coverage'] = df1['Coverage'].str.replace(r"[\"\']", '')
df1['Coverage'] = df1['Coverage'].replace(r'\\n',' ', regex=True)
df1

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area,latitude,longitude
0,G1,GLASGOW,"Former C1 district:, , Merchant City,",Glasgow City,55.86038,-4.24671
1,G2,GLASGOW,"Former C2 district:, , Blythswood Hill, , , A...",Glasgow City,55.86382,-4.2549
2,G3,GLASGOW,"Former C3 district:, , Anderston, , , Finnies...",Glasgow City,55.86619,-4.27262
3,G4,GLASGOW,"Former C4 district:, , Calton, (part), , Cow...",Glasgow City,55.86837,-4.25196
4,G5,GLASGOW,"Former C5 district:, , Gorbals,",Glasgow City,55.84769,-4.25237
6,G11,GLASGOW,"Former W1 district:, , Broomhill, , , Partick...",Glasgow City,55.87356,-4.31142
7,G12,GLASGOW,"Former W2 district:, , West End, (part), Cle...",Glasgow City,55.88006,-4.30061
8,G13,GLASGOW,"Former W3 district:, , Anniesland, , , Knight...",Glasgow City,55.89358,-4.3462
9,G14,GLASGOW,"Former W4 district:, , Whiteinch, , , Scotsto...",Glasgow City,55.88095,-4.34864
10,G15,GLASGOW,"Former W5 district:, , Drumchapel,",Glasgow City,55.9094,-4.36476


Removing extra commas.

In [23]:
df1['Coverage'] = df1['Coverage'].replace(r', , ',' ', regex=True)
df1

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area,latitude,longitude
0,G1,GLASGOW,"Former C1 district:, , Merchant City,",Glasgow City,55.86038,-4.24671
1,G2,GLASGOW,"Former C2 district:, , Blythswood Hill , Ande...",Glasgow City,55.86382,-4.2549
2,G3,GLASGOW,"Former C3 district:, , Anderston , Finnieston...",Glasgow City,55.86619,-4.27262
3,G4,GLASGOW,"Former C4 district:, , Calton, (part) Cowcad...",Glasgow City,55.86837,-4.25196
4,G5,GLASGOW,"Former C5 district:, , Gorbals,",Glasgow City,55.84769,-4.25237
6,G11,GLASGOW,"Former W1 district:, , Broomhill , Partick , ...",Glasgow City,55.87356,-4.31142
7,G12,GLASGOW,"Former W2 district:, , West End, (part), Cle...",Glasgow City,55.88006,-4.30061
8,G13,GLASGOW,"Former W3 district:, , Anniesland , Knightswo...",Glasgow City,55.89358,-4.3462
9,G14,GLASGOW,"Former W4 district:, , Whiteinch , Scotstoun,",Glasgow City,55.88095,-4.34864
10,G15,GLASGOW,"Former W5 district:, , Drumchapel,",Glasgow City,55.9094,-4.36476


Separating Coverage out into individual rows.

In [24]:
from itertools import chain

# return list from series of comma-separated strings
def chainer(s):
    return list(chain.from_iterable(s.str.split(',')))

# calculate lengths of splits
lens = df1['Coverage'].str.split(',').map(len)

# create new dataframe, repeating or chaining as appropriate
df1 = pd.DataFrame({'Postcode district': np.repeat(df1['Postcode district'], lens),
                    'Post town': np.repeat(df1['Post town'], lens),
                    'Coverage': chainer(df1['Coverage']),
                    'Local authority area': np.repeat(df1['Local authority area'], lens),
                    'latitude': np.repeat(df1['latitude'], lens),
                    'longitude': np.repeat(df1['longitude'], lens)})

df1

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area,latitude,longitude
0,G1,GLASGOW,Former C1 district:,Glasgow City,55.86038,-4.24671
0,G1,GLASGOW,,Glasgow City,55.86038,-4.24671
0,G1,GLASGOW,Merchant City,Glasgow City,55.86038,-4.24671
0,G1,GLASGOW,,Glasgow City,55.86038,-4.24671
1,G2,GLASGOW,Former C2 district:,Glasgow City,55.86382,-4.2549
1,G2,GLASGOW,,Glasgow City,55.86382,-4.2549
1,G2,GLASGOW,Blythswood Hill,Glasgow City,55.86382,-4.2549
1,G2,GLASGOW,Anderston,Glasgow City,55.86382,-4.2549
1,G2,GLASGOW,(part),Glasgow City,55.86382,-4.2549
2,G3,GLASGOW,Former C3 district:,Glasgow City,55.86619,-4.27262


Sorting by Coverage and resetting the index.

In [25]:
df1.sort_values(by=['Coverage'], inplace=True)
df1 = df1.reset_index(drop=True)
df1

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area,latitude,longitude
0,G78,GLASGOW,,East Renfrewshire,55.79259,-4.40873
1,G33,GLASGOW,,North Lanarkshire,55.87351,-4.16593
2,G66,GLASGOW,,East Dunbartonshire,55.94024,-4.15364
3,G32,GLASGOW,,Glasgow City,55.8484,-4.16293
4,G68,GLASGOW,,North Lanarkshire,55.95345,-4.01019
5,G32,GLASGOW,,Glasgow City,55.8484,-4.16293
6,G31,GLASGOW,,Glasgow City,55.85748,-4.20819
7,G23,GLASGOW,,Glasgow City,55.90193,-4.28431
8,G23,GLASGOW,,Glasgow City,55.90193,-4.28431
9,G22,GLASGOW,,Glasgow City,55.88998,-4.25002


Removing some unnecessary rows and resetting the index.

In [26]:
df1 = df1[~df1.Coverage.str.contains("(south)")]
df1 = df1[~df1.Coverage.str.contains("Former")]
df1 = df1.reset_index(drop=True)
df1

  if __name__ == '__main__':


Unnamed: 0,Postcode district,Post town,Coverage,Local authority area,latitude,longitude
0,G78,GLASGOW,,East Renfrewshire,55.79259,-4.40873
1,G33,GLASGOW,,North Lanarkshire,55.87351,-4.16593
2,G66,GLASGOW,,East Dunbartonshire,55.94024,-4.15364
3,G32,GLASGOW,,Glasgow City,55.8484,-4.16293
4,G68,GLASGOW,,North Lanarkshire,55.95345,-4.01019
5,G32,GLASGOW,,Glasgow City,55.8484,-4.16293
6,G31,GLASGOW,,Glasgow City,55.85748,-4.20819
7,G23,GLASGOW,,Glasgow City,55.90193,-4.28431
8,G23,GLASGOW,,Glasgow City,55.90193,-4.28431
9,G22,GLASGOW,,Glasgow City,55.88998,-4.25002


Dropping the rows where Coverage is blank.

In [27]:
df1.drop(df1.index[range(0,67)], axis=0, inplace=True)
df1 = df1.reset_index(drop=True)
df1

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area,latitude,longitude
0,G68,GLASGOW,(north) Dullatur,North Lanarkshire,55.95345,-4.01019
1,G74,GLASGOW,(north) Thorntonhall,South Lanarkshire,55.76912,-4.17289
2,G12,GLASGOW,(part),Glasgow City,55.88006,-4.30061
3,G4,GLASGOW,(part),Glasgow City,55.86837,-4.25196
4,G31,GLASGOW,(part),Glasgow City,55.85748,-4.20819
5,G4,GLASGOW,(part),Glasgow City,55.86837,-4.25196
6,G2,GLASGOW,(part),Glasgow City,55.86382,-4.2549
7,G4,GLASGOW,(part) Cowcaddens,Glasgow City,55.86837,-4.25196
8,G42,GLASGOW,(part) Toryglen,Glasgow City,55.83234,-4.25627
9,G4,GLASGOW,(part) Woodside,Glasgow City,55.86837,-4.25196


Dropping more unnecessary rows.

In [28]:
df1.drop(df1.index[range(2,7)], axis=0, inplace=True)
df1 = df1.reset_index(drop=True)
df1

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area,latitude,longitude
0,G68,GLASGOW,(north) Dullatur,North Lanarkshire,55.95345,-4.01019
1,G74,GLASGOW,(north) Thorntonhall,South Lanarkshire,55.76912,-4.17289
2,G4,GLASGOW,(part) Cowcaddens,Glasgow City,55.86837,-4.25196
3,G42,GLASGOW,(part) Toryglen,Glasgow City,55.83234,-4.25627
4,G4,GLASGOW,(part) Woodside,Glasgow City,55.86837,-4.25196
5,G3,GLASGOW,(part) Yorkhill,Glasgow City,55.86619,-4.27262
6,G2,GLASGOW,Anderston,Glasgow City,55.86382,-4.2549
7,G3,GLASGOW,Anderston,Glasgow City,55.86619,-4.27262
8,G13,GLASGOW,Anniesland,Glasgow City,55.89358,-4.3462
9,G46,GLASGOW,Arden,"East Renfrewshire, Glasgow City",55.80446,-4.30573


Renaming to the correct coverage area.

In [29]:
df1.at[0, "Coverage"] = "Dullatur"
df1.at[1, "Coverage"] = "Thorntonhall"
df1.at[2, "Coverage"] = "Cowcaddens"
df1.at[3, "Coverage"] = "Toryglen"
df1.at[4, "Coverage"] = "Woodside"
df1.at[5, "Coverage"] = "Yorkhill"
df1

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area,latitude,longitude
0,G68,GLASGOW,Dullatur,North Lanarkshire,55.95345,-4.01019
1,G74,GLASGOW,Thorntonhall,South Lanarkshire,55.76912,-4.17289
2,G4,GLASGOW,Cowcaddens,Glasgow City,55.86837,-4.25196
3,G42,GLASGOW,Toryglen,Glasgow City,55.83234,-4.25627
4,G4,GLASGOW,Woodside,Glasgow City,55.86837,-4.25196
5,G3,GLASGOW,Yorkhill,Glasgow City,55.86619,-4.27262
6,G2,GLASGOW,Anderston,Glasgow City,55.86382,-4.2549
7,G3,GLASGOW,Anderston,Glasgow City,55.86619,-4.27262
8,G13,GLASGOW,Anniesland,Glasgow City,55.89358,-4.3462
9,G46,GLASGOW,Arden,"East Renfrewshire, Glasgow City",55.80446,-4.30573


Sorting by Postcode district.

In [30]:
df1 = pd.DataFrame(
      sorted(df1.values, key=lambda x: int(x[0].split('G')[1])),
      columns=df1.columns
      )
df1

Unnamed: 0,Postcode district,Post town,Coverage,Local authority area,latitude,longitude
0,G1,GLASGOW,Merchant City,Glasgow City,55.86038,-4.24671
1,G2,GLASGOW,Anderston,Glasgow City,55.86382,-4.2549
2,G2,GLASGOW,Blythswood Hill,Glasgow City,55.86382,-4.2549
3,G3,GLASGOW,Yorkhill,Glasgow City,55.86619,-4.27262
4,G3,GLASGOW,Anderston,Glasgow City,55.86619,-4.27262
5,G3,GLASGOW,Finnieston,Glasgow City,55.86619,-4.27262
6,G3,GLASGOW,Garnethill,Glasgow City,55.86619,-4.27262
7,G3,GLASGOW,Park,Glasgow City,55.86619,-4.27262
8,G3,GLASGOW,Woodlands,Glasgow City,55.86619,-4.27262
9,G4,GLASGOW,Cowcaddens,Glasgow City,55.86837,-4.25196


The data is now ready for use!

Using Nomanatim to obtain the coordinates for Glasgow

In [31]:
address = 'Glasgow, UK'

geolocator = Nominatim(user_agent="glasgow_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Glasgow are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Glasgow are 55.8609825, -4.2488787.


Visualising Glasgow and its Neighborhoods/areas.

In [32]:
# create map of Glasgow using latitude and longitude values
map_glasgow = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df1['latitude'], df1['longitude'], df1['Coverage']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_glasgow)  
    
map_glasgow

Defining Foursquare credentials and version. This cell is hidden for security.

In [33]:
# @hidden_cell
CLIENT_ID = 'YYTNANF4RMMVWNV0QPPHUZ5H21CSMBFFAHDMDZ1JQWMMKYMJ' # your Foursquare ID
CLIENT_SECRET = 'PGFZC0SDTSBGHDS3COSYG4ULB514NUWAX445VWTCVHOIH4X2' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: YYTNANF4RMMVWNV0QPPHUZ5H21CSMBFFAHDMDZ1JQWMMKYMJ
CLIENT_SECRET:PGFZC0SDTSBGHDS3COSYG4ULB514NUWAX445VWTCVHOIH4X2


Defining a function that gets nearby gyms/fitness clubs in Glasgow.

In [34]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000, LIMIT=100, search_query='Gym'):

    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&query={}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng,
            search_query,
            radius, 
            LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']

    return(nearby_venues)

Creating a new dataframe with gyms in Glasgow.

In [35]:
glasgow_gyms = getNearbyVenues(names=df1['Coverage'],
                                   latitudes=df1['latitude'],
                                   longitudes=df1['longitude']
                                  )

glasgow_gyms.head()

 Merchant City
 Anderston
 Blythswood Hill 
Yorkhill
 Anderston 
 Finnieston 
 Garnethill 
 Park 
 Woodlands
Cowcaddens
Woodside
 Calton
 Drygate Kelvinbridge 
 Townhead 
 Woodlands
 Gorbals
 Broomhill 
 Partick 
 Partickhill
 Botanic Gardens 
 Cleveden Dowanhill 
 Hillhead 
 Hyndland 
 Kelvindale 
 Kelvinside 
 University of Glasgow
 West End
 Anniesland 
 Knightswood 
 Yoker
 Scotstoun
 Whiteinch 
 Drumchapel
 Maryhill 
 North Kelvinside 
 Ruchill
 Balornock 
 Barmulloch 
 Cowlairs 
 Royston 
 Sighthill
 Springburn 
 Milton 
 Parkhouse 
 Possilpark
 Lambhill 
 Summerston
 Dennistoun 
 Haghill 
 Parkhead
 Carmyle 
 Lightburn 
 Mount Vernon 
 Sandyhills 
 Shettleston 
 Springboig
 Tollcross 
 Cardowan 
 Carntyne 
 Craigend 
 Cranhill 
 Garthamlock 
 Millerston 
 Provanmill 
 Queenslie 
 Riddrie 
 Robroyston 
 Ruchazie 
 Stepps 
 Wellhouse
 Easterhouse 
 Easthall 
 Provanhall
 Bridgeton 
 Calton 
 Dalmarnock
 Pollokshields 
 Shawlands
Toryglen
 Battlefield 
 Govanhill 
 Mount Florida 
 

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Merchant City,55.86038,-4.24671,PureGym,55.86428,-4.260227,Gym / Fitness Center
1,Merchant City,55.86038,-4.24671,The Gym,55.857813,-4.25651,Gym / Fitness Center
2,Merchant City,55.86038,-4.24671,PureGym,55.859799,-4.25901,Gym / Fitness Center
3,Merchant City,55.86038,-4.24671,Strathclyde Uni Gym,55.862249,-4.24842,College Gym
4,Merchant City,55.86038,-4.24671,The Club Gym,55.858546,-4.248849,Gym


Using one hot encoding to tally the number of gyms in each area.

In [36]:
glasgow_onehot = pd.get_dummies(glasgow_gyms[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
glasgow_onehot['Neighborhood'] = glasgow_gyms['Neighborhood'] 

# define a list of column names
cols = glasgow_onehot.columns.tolist()
cols

# move the column name to the beginning
cols.insert(0, cols.pop(cols.index('Neighborhood')))
cols

#then use .reindex() function to reorder
glasgow_onehot = glasgow_onehot.reindex(columns= cols)

#check result
glasgow_onehot.head()

Unnamed: 0,Neighborhood,Athletics & Sports,Climbing Gym,College Gym,Gym,Gym / Fitness Center,Gym Pool,Martial Arts Dojo,Pool,Rugby Pitch,Soccer Field,Yoga Studio
0,Merchant City,0,0,0,0,1,0,0,0,0,0,0
1,Merchant City,0,0,0,0,1,0,0,0,0,0,0
2,Merchant City,0,0,0,0,1,0,0,0,0,0,0
3,Merchant City,0,0,1,0,0,0,0,0,0,0,0
4,Merchant City,0,0,0,1,0,0,0,0,0,0,0


Group the dataframe by neighborhood and calculate the mean frequencies.

In [37]:
glasgow_grouped = glasgow_onehot.groupby('Neighborhood').mean().reset_index()
glasgow_grouped

Unnamed: 0,Neighborhood,Athletics & Sports,Climbing Gym,College Gym,Gym,Gym / Fitness Center,Gym Pool,Martial Arts Dojo,Pool,Rugby Pitch,Soccer Field,Yoga Studio
0,Anderston,0.0,0.0,0.157895,0.368421,0.473684,0.0,0.0,0.0,0.0,0.0,0.0
1,Anderston,0.0,0.0,0.066667,0.333333,0.533333,0.0,0.0,0.0,0.0,0.0,0.066667
2,Arden,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Balornock,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Barmulloch,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Battlefield,0.0,0.0,0.0,0.5,0.25,0.0,0.0,0.0,0.0,0.25,0.0
6,Blythswood Hill,0.0,0.0,0.157895,0.368421,0.473684,0.0,0.0,0.0,0.0,0.0,0.0
7,Botanic Gardens,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.333333
8,Bridgeton,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Broomhill,0.0,0.0,0.333333,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Defining a function to sort the most common venues in descending order.

In [38]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Creating a new dataframe and displaying top 5 venues in each neighborhood.

In [39]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = glasgow_grouped['Neighborhood']

for ind in np.arange(glasgow_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(glasgow_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Anderston,Gym / Fitness Center,Gym,College Gym,Yoga Studio,Soccer Field
1,Anderston,Gym / Fitness Center,Gym,Yoga Studio,College Gym,Soccer Field
2,Arden,Gym,Yoga Studio,Soccer Field,Rugby Pitch,Pool
3,Balornock,Gym,Yoga Studio,Soccer Field,Rugby Pitch,Pool
4,Barmulloch,Gym,Yoga Studio,Soccer Field,Rugby Pitch,Pool


Run k-means to cluster the neighborhoods into 5 clusters.

In [40]:
# set number of clusters
kclusters = 5

glasgow_grouped_clustering = glasgow_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(glasgow_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 2, 0, 0, 0, 2, 2, 2, 2, 0], dtype=int32)

Creating a new dataframe that includes the cluster as well as the top 5 venues for each neighborhood.

In [41]:
# rename 'Coverage' to 'Neighborhood' so the dataframes can be merged
df1.rename(columns={"Coverage": "Neighborhood"}, inplace=True)

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

glasgow_merged = df1

# merge glasgow_grouped with glasgow_data to add latitude/longitude for each neighborhood
glasgow_merged = glasgow_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood', how='right')

glasgow_merged.head() # check the last columns

Unnamed: 0,Postcode district,Post town,Neighborhood,Local authority area,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,G1,GLASGOW,Merchant City,Glasgow City,55.86038,-4.24671,2,Gym / Fitness Center,Gym,College Gym,Yoga Studio,Soccer Field
1,G2,GLASGOW,Anderston,Glasgow City,55.86382,-4.2549,2,Gym / Fitness Center,Gym,College Gym,Yoga Studio,Soccer Field
2,G2,GLASGOW,Blythswood Hill,Glasgow City,55.86382,-4.2549,2,Gym / Fitness Center,Gym,College Gym,Yoga Studio,Soccer Field
3,G3,GLASGOW,Yorkhill,Glasgow City,55.86619,-4.27262,2,Gym / Fitness Center,Gym,Yoga Studio,College Gym,Soccer Field
4,G3,GLASGOW,Anderston,Glasgow City,55.86619,-4.27262,2,Gym / Fitness Center,Gym,Yoga Studio,College Gym,Soccer Field


Visualise the resulting clusters.

In [42]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster, common in zip(glasgow_merged['latitude'], glasgow_merged['longitude'], glasgow_merged['Neighborhood'], glasgow_merged['Cluster Labels'], glasgow_merged['1st Most Common Venue']):
    label = folium.Popup(str(poi) + ', Cluster ' + str(cluster) + ',  Most Common Venue: ' + str(common), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Creating a dataframe with the total Gyms & Venues for each postcode district.

In [43]:
Sum_column = glasgow_onehot["Athletics & Sports"] + glasgow_onehot["Climbing Gym"] + glasgow_onehot["College Gym"] + glasgow_onehot["Gym"] + glasgow_onehot["Gym / Fitness Center"] + glasgow_onehot["Gym Pool"] + glasgow_onehot["Martial Arts Dojo"] + glasgow_onehot["Pool"] + glasgow_onehot["Rugby Pitch"] + glasgow_onehot["Soccer Field"] + glasgow_onehot["Yoga Studio"]
glasgow_onehot["Gyms & Venues"] = Sum_column
glasgow_totals = pd.merge(glasgow_merged, glasgow_onehot, on='Neighborhood', how='inner')
glasgow_totals = glasgow_totals.drop(['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue', '4th Most Common Venue', '5th Most Common Venue', 'Athletics & Sports', 'Climbing Gym', 'College Gym', 'Gym', 'Gym / Fitness Center', 'Gym Pool', 'Martial Arts Dojo', 'Pool', 'Rugby Pitch', 'Soccer Field', 'Yoga Studio', 'Post town', 'Neighborhood', 'Local authority area'], axis=1)

In [44]:
glasgow_totals = glasgow_totals.groupby(['Postcode district', 'latitude', 'longitude', 'Cluster Labels'], as_index=False)['Gyms & Venues'].sum()
glasgow_totals

Unnamed: 0,Postcode district,latitude,longitude,Cluster Labels,Gyms & Venues
0,G1,55.86038,-4.24671,2,13
1,G11,55.87356,-4.31142,0,9
2,G12,55.88006,-4.30061,2,24
3,G14,55.88095,-4.34864,2,4
4,G2,55.86382,-4.2549,2,38
5,G20,55.8858,-4.28176,0,3
6,G21,55.88063,-4.22069,0,6
7,G22,55.88998,-4.25002,1,1
8,G3,55.86619,-4.27262,2,101
9,G31,55.85748,-4.20819,0,3


In [45]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
from project_lib import Project
project = Project(project_id='a247b18d-81a6-46fa-a07c-d24e1ca9ccb6', project_access_token='p-77553792d2f8c98e0233d823d45a592923e770e9')
pc = project.project_context


Downloading a geojson file containing the boundary coordinates for each postcode district from my github repository.

In [46]:
# download Postcode District geojson file
!wget --quiet https://raw.githubusercontent.com/rossh95/Coursera_Capstone/master/GPostcodeBoundaries.json -O GPostcodeBoundaries.json
    
print('GeoJSON file downloaded!')

GeoJSON file downloaded!


In [47]:
glasgow_geo = r'GPostcodeBoundaries.json' # geojson file

Creating a map of Glasgow again and generating a choropleth map, showing the concentration of Gyms & Sports Venues in each postcode district.

In [48]:
#Creating a plain map of glasgow for the choropleth
choropleth_glasgow = folium.Map(location=[latitude, longitude], zoom_start=11)

In [50]:
#Generating choropleth map
choropleth_glasgow.choropleth(
    geo_data=glasgow_geo,
    data=glasgow_totals,
    columns=['Postcode district', 'Gyms & Venues'],
    key_on='properties.name',
    fill_color='YlOrRd', 
    fill_opacity=0.7, 
    line_opacity=0.2,
    legend_name='Number of Gyms/Fitness Venues in Glasgow'
)

In [51]:
#Display map
choropleth_glasgow