In [1]:
# library for BeautifulSoup, for web scrapping
from bs4 import BeautifulSoup
# library to handle data in a vectorized manner
import numpy as np
# library for data analsysis
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# library to handle JSON files
import json
print('numpy, pandas, ..., imported...')
!pip -q install geopy
print('geopy installed...')
# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim
print('Nominatim imported...')
# library to handle requests
import requests
print('requests imported...')
# tranform JSON file into a pandas dataframe
from pandas.io.json import json_normalize
print('json_normalize imported...')
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
print('matplotlib imported...')
# import k-means from clustering stage
from sklearn.cluster import KMeans
print('Kmeans imported...')
# install the Geocoder
!pip -q install geocoder
import geocoder
# import time
import time
!pip -q install folium
print('folium installed...')
import folium # map rendering library
print('folium imported...')
print('...Done')

numpy, pandas, ..., imported...
geopy installed...
Nominatim imported...
requests imported...
json_normalize imported...
matplotlib imported...
Kmeans imported...
folium installed...
folium imported...
...Done


# Data
This project will rely on public data from Wikipedia and Foursquare.
Dataset 1
In this project, London will be used as synonymous to the “Greater London Area” in this project. Within the Greater London Area, there are areas that are within the London Area Postcode. The focus of this project will be the neighbourhoods are that are within the London Post Code area.
The London Area consists of 32 Boroughs and the “City of London”. Our data will be from the link — Greater London Area <https://en.wikipedia.org/wiki/List_of_areas_of_London >
The web scrapped of the Wikipedia page for the Greater London Area data is provided below:
The BeautifulSoup package is used to scrap the needed data from Wikipedia.

In [2]:
# library for BeautifulSoup
from bs4 import BeautifulSoup
wikipedia_link = 'https://en.wikipedia.org/wiki/List_of_areas_of_London'
wikipedia_page = requests.get(wikipedia_link)

Then the html is cleaned and parsed accordingly.

In [9]:
soup = BeautifulSoup(wikipedia_page.content, 'html.parser')
# This extracts the "tbody" within the table where class is "wikitable sortable"
table = soup.find('table', {'class':'wikitable sortable'}).tbody
# Extracts all "tr" (table rows) within the table above
rows = table.find_all('tr')
# Extracts the column headers, removes and replaces possible '\n' with space for the "th" tag
columns = [i.text.replace('\n', '')
           for i in rows[0].find_all('th')]
# Converts columns to pd dataframe
df = pd.DataFrame(columns = columns)
'''
Extracts every row with corresponding columns then appends the values to the create pd dataframe "df". The first row (row[0]) is skipped because it is already the header
'''
for i in range(1, len(rows)):
    tds = rows[i].find_all('td')    
    if len(tds) == 7:
        values = [tds[0].text, tds[1].text, tds[2].text.replace('\n', ''.replace('\xa0','')), tds[3].text, tds[4].text.replace('\n', ''.replace('\xa0','')), tds[5].text.replace('\n', ''.replace('\xa0','')), tds[6].text.replace('\n', ''.replace('\xa0',''))]
    else:
        values = [td.text.replace('\n', '').replace('\xa0','') for td in tds]
        
        df = df.append(pd.Series(values, index = columns), ignore_index = True)
        
df.head(5)

Unnamed: 0,Location,London borough,Post town,Postcode district,Dial code,OS grid ref
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon[8],CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon[8],CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728


In [22]:
df.columns = ['Location', 'Borough', 'Post town', 'Postcode', 'Dial code', 'OS grid ref']

In [23]:
df.columns

Index(['Location', 'Borough', 'Post town', 'Postcode', 'Dial code',
       'OS grid ref'],
      dtype='object')

In [24]:
# Remove Borough reference numbers with []
df['Borough'] = df['Borough'].map(lambda x: x.rstrip(']').rstrip('0123456789').rstrip('['))
df.head(5)

Unnamed: 0,Location,Borough,Post town,Postcode,Dial code,OS grid ref
0,Abbey Wood,"Bexley, Greenwich",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon,CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon,CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728


In [28]:
df0 = df.drop('Postcode', axis=1).join(df['Postcode'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('Postcode'))

In [31]:
df1 = df0[['Location', 'Borough', 'Postcode','Post town']].reset_index(drop=True)

In [32]:
df1.shape

(637, 4)

In [36]:
df2 = df1 # assigns df1 to df2
df21 = df2[df2['Post town'].str.contains('LONDON')]
df21.head(5)

Unnamed: 0,Location,Borough,Postcode,Post town
0,Abbey Wood,"Bexley, Greenwich",SE2,LONDON
1,Acton,"Ealing, Hammersmith and Fulham",W3,LONDON
2,Acton,"Ealing, Hammersmith and Fulham",W4,LONDON
8,Aldgate,City,EC3,LONDON
9,Aldwych,Westminster,WC2,LONDON


In [37]:
df21.shape

(381, 4)

In [40]:
df3 = df21[['Location', 'Borough', 'Postcode']].reset_index(drop=True)
df3.head(5)

Unnamed: 0,Location,Borough,Postcode
0,Abbey Wood,"Bexley, Greenwich",SE2
1,Acton,"Ealing, Hammersmith and Fulham",W3
2,Acton,"Ealing, Hammersmith and Fulham",W4
3,Aldgate,City,EC3
4,Aldwych,Westminster,WC2


In [42]:
df_london = df3 # re-assigns to df_london
# Strips whitespaces before postcode
df_london.Postcode = df_london.Postcode.str.strip()
# New dataframe for South East London postcodes - df_se
df_se = df_london[df_london['Postcode'].str.startswith(('SE'))].reset_index(drop=True)

In [64]:
df_london.head()

Unnamed: 0,Location,Borough,Postcode
0,Abbey Wood,"Bexley, Greenwich",SE2
1,Acton,"Ealing, Hammersmith and Fulham",W3
2,Acton,"Ealing, Hammersmith and Fulham",W4
3,Aldgate,City,EC3
4,Aldwych,Westminster,WC2


In [51]:
demograph_link = 'https://en.wikipedia.org/wiki/Demography_of_London'
demograph_page = requests.get(demograph_link)
soup1 = BeautifulSoup(demograph_page.content, 'html.parser')
table1 = soup1.find('table', {'class':'wikitable sortable'}).tbody
rows1 = table1.find_all('tr')
columns1 = [i.text.replace('\n', '')
 for i in rows1[0].find_all('th')]
demo_london = pd.DataFrame(columns = columns1)
for j in range(1, len(rows1)):
    tds1 = rows1[j].find_all('td')
    if len(tds1) == 7:
        values1 = [tds1[0].text, tds1[1].text, tds1[2].text.replace('\n', ''.replace('\xa0','')), tds1[3].text, tds1[4].text.replace('\n', ''.replace('\xa0','')), tds1[5].text.replace('\n', ''.replace('\xa0',''))]
    else:
        values1 = [td1.text.replace('\n', '').replace('\xa0','') for td1 in tds1]
        
        demo_london = demo_london.append(pd.Series(values1, index = columns1), ignore_index = True)

In [52]:
demo_london

Unnamed: 0,Local authority,White,Mixed,Asian,Black,Other
0,Barnet,64.1,4.8,18.5,7.7,4.8
1,Barking and Dagenham,58.3,4.2,15.9,20.0,1.6
2,Bexley,81.9,2.3,6.6,8.5,0.8
3,Brent,36.3,5.1,34.1,18.8,5.8
4,Bromley,84.3,3.5,5.2,6.0,0.9
5,Camden,66.3,5.6,16.1,8.2,3.8
6,City of London,78.6,3.9,12.7,2.6,2.1
7,Croydon,55.1,6.6,16.4,20.2,1.8
8,Ealing,49.0,4.5,29.7,10.9,6.0
9,Enfield,61.0,5.5,11.2,17.2,5.1


In [54]:
demo_london['Asian'] = demo_london['Asian'].astype('float')
demo_london_sorted = demo_london.sort_values(by='Asian', ascending = False)
demo_london_sorted.head(5)
#AREAS WITH HIGHEST ASIAN POPULATION

Unnamed: 0,Local authority,White,Mixed,Asian,Black,Other
24,Newham,29.0,4.5,43.5,19.6,3.5
13,Harrow,42.2,4.0,42.6,8.2,2.9
25,Redbridge,42.5,4.1,41.8,8.9,2.7
29,Tower Hamlets,45.2,4.1,41.1,7.3,2.3
17,Hounslow,51.4,4.1,34.4,6.6,3.6


In [66]:
df_top = df_london[df_london['Borough'].isin(['Newham', 'Harrow', 'Redbridge', 'Tower Hamlets', 'Hounslow'])].reset_index(drop=True)
df_top

Unnamed: 0,Location,Borough,Postcode
0,Beckton,Newham,E6
1,Beckton,Newham,E16
2,Beckton,Newham,IG11
3,Bethnal Green,Tower Hamlets,E2
4,Blackwall,Tower Hamlets,E14
5,Bow,Tower Hamlets,E3
6,Bromley (also Bromley-by-Bow),Tower Hamlets,E3
7,Cambridge Heath,Tower Hamlets,E2
8,Canary Wharf,Tower Hamlets,E14
9,Canning Town,Newham,E16


## Dataset 2

In [67]:
# Geocoder starts here
# Defining a function to use --> get_latlng()'''
def get_latlng(arcgis_geocoder):
    
    # Initialize the Location (lat. and long.) to "None"
    lat_lng_coords = None
    
    # While loop helps to create a continous run until all the location coordinates are geocoded
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, London, United Kingdom'.format(arcgis_geocoder))
        lat_lng_coords = g.latlng
    return lat_lng_coords
# Geocoder ends here

In [71]:
start = time.time()
postal_codes = df_top['Postcode']    
coordinates = [get_latlng(postal_code) for postal_code in postal_codes.tolist()]
end = time.time()
print("Time of execution: ", end - start, "seconds")

Time of execution:  24.686478853225708 seconds


In [74]:
df_loc = df_top
# The obtained coordinates (latitude and longitude) are joined with the dataframe as shown
df_coordinates = pd.DataFrame(coordinates, columns = ['Latitude', 'Longitude'])
df_loc['Latitude'] = df_coordinates['Latitude']
df_loc['Longitude'] = df_coordinates['Longitude']
df_loc.head(5)

Unnamed: 0,Location,Borough,Postcode,Latitude,Longitude
0,Beckton,Newham,E6,51.53292,0.05461
1,Beckton,Newham,E16,51.50913,0.01528
2,Beckton,Newham,IG11,51.53312,0.084077
3,Bethnal Green,Tower Hamlets,E2,51.52669,-0.06257
4,Blackwall,Tower Hamlets,E14,51.51122,-0.01264


In [125]:
with open('fs.json', 'r') as f:
    config = json.load(f)

In [126]:
df_loc.head()

Unnamed: 0,Location,Borough,Postcode,Latitude,Longitude
0,Beckton,Newham,E6,51.53292,0.05461
1,Beckton,Newham,E16,51.50913,0.01528
2,Beckton,Newham,IG11,51.53312,0.084077
3,Bethnal Green,Tower Hamlets,E2,51.52669,-0.06257
4,Blackwall,Tower Hamlets,E14,51.51122,-0.01264


# Methodology
Single Neighbourhood — An initial exploration of a single Neighbourhood within the London area was done to examine the Foursquare workability. The Newham Borough postcode  and Location - beckton is used for this.

In [129]:
beckton_lat = tdf.loc[0, 'Latitude']
beckton_long = tdf.loc[0, 'Longitude']
beckton_loc = tdf.loc[0, 'Location']
beckton_postcode = tdf.loc[0, 'Postcode']
print('The latitude and longitude values of {} with postcode {}, are {}, {}.'.format(beckton_loc, beckton_postcode, beckton_lat, beckton_long))

The latitude and longitude values of Beckton with postcode E6, are 51.53292000000005, 0.05461000000002514.


In [None]:
tdf.loc[tdf['Location'] == 'Beckton']

Let’s explore the top 100 venues that are within a 2000 metres radius of Lewisham. And then, let’s create the GET request URL, and then the url is named.
Since there is a limit to Foursquare usage → https://developer.foursquare.com/docs/api/troubleshooting/rate-limits

In [130]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 2000 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    config['ID'], 
    config['SECRET'], 
    config['VERSION'], 
    beckton_lat, 
    beckton_long, 
    radius, 
    LIMIT)
# displays URL
url

'https://api.foursquare.com/v2/venues/explore?&client_id=KGT5ISZPULPVYNBWDHY0TKFLZUXDT3RIYCBAIE2QNUT23FJM&client_secret=YT0TARYRJRKE2SPL4GDWN2HU0WXIKHGOZZ5ZOAOUIXQQ1SFU&v=20120609&ll=51.53292000000005,0.05461000000002514&radius=2000&limit=100'

In [131]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ed26cd39388d7001bd55afa'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'East Ham Central',
  'headerFullLocation': 'East Ham Central, London',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 90,
  'suggestedBounds': {'ne': {'lat': 51.55092001800006,
    'lng': 0.08349189090848615},
   'sw': {'lat': 51.51491998200003, 'lng': 0.02572810909156413}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 1,
       'items': [{'summary': 'Lots of people like this place',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4d06283cc2e537044020c267',
       'name': "McDonald's",
       'contact': {},
       'location': {'address': '32 HIGH STREET NORTH',
        'lat': 51.534031,
        'lng': 0.053797,
        'labeledLatLngs': [{'label': 'dis

From the results, the necessary information needs to be obtained from items key. To do this, the get_category_type function is used from the Foursquare lab.

In [132]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

The result is then cleaned up from json to a structured pandas dataframe as shown below:

In [134]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head(10)

Unnamed: 0,name,categories,lat,lng
0,McDonald's,Fast Food Restaurant,51.534031,0.053797
1,The Miller's Well (Wetherspoon),Pub,51.533406,0.056379
2,Central Park,Park,51.528808,0.052901
3,Taste Of India,Indian Restaurant,51.542572,0.050107
4,The Who Shop & Museum,Toy / Game Store,51.530577,0.039778
5,Saravanaa Bhavan,Indian Restaurant,51.542468,0.050299
6,Costa Coffee,Coffee Shop,51.534517,0.053365
7,Vijay's Chawalla,Indian Restaurant,51.53865,0.032979
8,Barking Abbey,Park,51.535352,0.076054
9,Ananthapuram (Traditional Kerala Restaurant),Indian Restaurant,51.540517,0.050633


In [137]:
nearby_venues_Beckton_unique = nearby_venues['categories'].value_counts().to_frame(name='Count')
nearby_venues_Beckton_unique.head(5)
#Beckton top 5 venues

Unnamed: 0,Count
Indian Restaurant,9
Grocery Store,9
Supermarket,7
Coffee Shop,6
Fast Food Restaurant,5


# Multiple Neighbourhoods 
Now we will explore (Multiple) Neighborhoods in the east London area. 
To do this, the function getNearbyVenues is used and its created to repeat the same process neighborhoods.

## Multiple Neighbourhoods

In [143]:
def getNearbyVenues(names, latitudes, longitudes, radius=2000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            config['ID'], 
            config['SECRET'], 
            config['VERSION'],
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [144]:
ld_venues = getNearbyVenues(names=tdf['Location'],
                                   latitudes=tdf['Latitude'],
                                   longitudes=tdf['Longitude']
                                  )

Beckton
Beckton
Beckton
Bethnal Green
Blackwall
Bow
Bromley (also Bromley-by-Bow)
Cambridge Heath
Canary Wharf
Canning Town
Cubitt Town
Custom House
East Ham
Forest Gate
Grove Park
Gunnersbury
Isle of Dogs
Leamouth
Limehouse
Little Ilford
Manor Park
Maryland
Mile End
Millwall
North Woolwich
Old Ford
Plaistow
Poplar
Ratcliff
Shadwell
Silvertown
South Woodford
Spitalfields
Stepney
Stratford
Tower Hill
Upton Park
Upton Park
Wanstead
Wapping
West Ham
West Ham
Whitechapel
Woodford
Woodford


In [148]:
ld_venues.head(5)

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Beckton,51.53292,0.05461,McDonald's,51.534031,0.053797,Fast Food Restaurant
1,Beckton,51.53292,0.05461,The Miller's Well (Wetherspoon),51.533406,0.056379,Pub
2,Beckton,51.53292,0.05461,Central Park,51.528808,0.052901,Park
3,Beckton,51.53292,0.05461,Taste Of India,51.542572,0.050107,Indian Restaurant
4,Beckton,51.53292,0.05461,The Who Shop & Museum,51.530577,0.039778,Toy / Game Store


The number of venues returned for each neighbourhoods is then explored as follows:

In [149]:
ld_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Beckton,259,259,259,259,259,259
Bethnal Green,100,100,100,100,100,100
Blackwall,100,100,100,100,100,100
Bow,100,100,100,100,100,100
Bromley (also Bromley-by-Bow),100,100,100,100,100,100
Cambridge Heath,100,100,100,100,100,100
Canary Wharf,100,100,100,100,100,100
Canning Town,100,100,100,100,100,100
Cubitt Town,100,100,100,100,100,100
Custom House,100,100,100,100,100,100


In [150]:
print('There are {} uniques categories.'.format(len(ld_venues['Venue Category'].unique())))

There are 209 uniques categories.


In [151]:
ld_venue_unique_count = ld_venues['Venue Category'].value_counts().to_frame(name='Count')

# Clustering
For this section, the neighbourhoods in East London will be clustered based on the processed data obtained above.
# Map Visualization —
Using the geopy library, the latitude and longitude values of London is obtained.

In [152]:
address = 'London, United Kingdom'
geolocator = Nominatim(user_agent="ln_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of London are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of London are 51.5073219, -0.1276474.


The folium library is then used to obtain the coordinates.

In [190]:
map_london = folium.Map(location = [latitude, longitude], zoom_start = 12)
map_london

The East London neighbourhoods are then superimposed on top as shown below, still using the `folium`library. Please note due to the location of the East London, you might need to zoom to see the superimposed areas.

In [154]:
# Adding markers to map
for lat, lng, borough, loc in zip(tdf['Latitude'], 
                                  tdf['Longitude'],
                                  tdf['Borough'],
                                  tdf['Location']):
    label = '{} - {}'.format(loc, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_london)  
    
display(map_london)

# Analysing Each Neighbourhood — 
In this section, the objective is to check and explore the venues in each neighbourhood.
## One Hot Encoding

In [158]:
ld_onehot = pd.get_dummies(ld_venues[['Venue Category']], prefix = "", prefix_sep = "")

In [162]:
# add neighborhood column back to dataframe
ld_onehot['Neighbourhood'] = ld_venues['Neighbourhood']

In [163]:
# move neighborhood column to the first column
fixed_columns = [ld_onehot.columns[-1]] + list(ld_onehot.columns[:-1])
ld_onehot = ld_onehot[fixed_columns]

In [165]:
ld_onehot.head(5)

Unnamed: 0,Neighbourhood,Accessories Store,American Restaurant,Antique Shop,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Bagel Shop,Bakery,Bar,Bed & Breakfast,Beer Bar,Beer Garden,Beer Store,Betting Shop,Bookstore,Boutique,Boxing Gym,Brasserie,Breakfast Spot,Brewery,Bridge,Bubble Tea Shop,Buddhist Temple,Buffet,Burger Joint,Burrito Place,Bus Station,Bus Stop,Business Service,Butcher,Café,Canal,Canal Lock,Castle,Caucasian Restaurant,Cheese Shop,Chinese Restaurant,Chocolate Shop,Circus School,Climbing Gym,Clothing Store,Cocktail Bar,Coffee Shop,Comfort Food Restaurant,Comic Shop,Concert Hall,Convenience Store,Cosmetics Shop,Cycle Studio,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Doner Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant,English Restaurant,Event Space,Fabric Shop,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Fish & Chips Shop,Fish Market,Flea Market,Flower Shop,Food & Drink Shop,Food Court,Food Stand,Food Truck,Forest,Fountain,French Restaurant,Furniture / Home Store,Garden,Garden Center,Gas Station,Gastropub,General Entertainment,Gift Shop,Golf Course,Golf Driving Range,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Harbor / Marina,Hardware Store,Historic Site,History Museum,Hockey Field,Hostel,Hotel,Hotel Bar,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Indie Theater,Indoor Play Area,Irish Pub,Italian Restaurant,Japanese Curry Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Kebab Restaurant,Korean Restaurant,Lake,Light Rail Station,Lighthouse,Lingerie Store,Liquor Store,Lounge,Market,Martial Arts Dojo,Mediterranean Restaurant,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Monument / Landmark,Movie Theater,Moving Target,Multiplex,Museum,Music Venue,Nail Salon,Nature Preserve,Neighborhood,Nightclub,Opera House,Optical Shop,Organic Grocery,Outdoor Sculpture,Outlet Mall,Pakistani Restaurant,Park,Pedestrian Plaza,Performing Arts Venue,Persian Restaurant,Pet Store,Pharmacy,Pier,Pilates Studio,Pizza Place,Platform,Playground,Plaza,Poke Place,Pool,Portuguese Restaurant,Print Shop,Pub,Rafting,Ramen Restaurant,Record Shop,Recording Studio,Rental Car Location,Restaurant,Roof Deck,Salon / Barbershop,Sandwich Place,Scenic Lookout,Science Museum,Seafood Restaurant,Shopping Mall,Shopping Plaza,Skate Park,Snack Place,Soccer Field,Soccer Stadium,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Stadium,Stationery Store,Steakhouse,Street Food Gathering,Supermarket,Sushi Restaurant,Tapas Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,Toy / Game Store,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio
0,Beckton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Beckton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Beckton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Beckton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Beckton,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


To check the Asian Restaurants:

In [None]:
ld_onehot.loc[ld_onehot['Asian Restaurant'] != 0]
#One hot encoded dataframe (showing Asian Restaurants)

Regrouping and Category Statistics

In [167]:
ld_grouped = ld_onehot.groupby('Neighbourhood').mean().reset_index()

Grouping of each Neighbourhoods with 10 common venues:

In [172]:
num_top_venues = 10 # Top common venues needed
for hood in ld_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = ld_grouped[ld_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending = False).reset_index(drop = True).head(num_top_venues))
    print('\n')

----Beckton----
                  venue  freq
0                 Hotel  0.08
1           Coffee Shop  0.07
2         Grocery Store  0.07
3                   Pub  0.05
4           Supermarket  0.05
5     Indian Restaurant  0.04
6                  Park  0.03
7  Fast Food Restaurant  0.03
8        Sandwich Place  0.03
9        Clothing Store  0.02


----Bethnal Green----
          venue  freq
0   Coffee Shop  0.14
1          Café  0.06
2        Bakery  0.05
3         Hotel  0.04
4           Pub  0.04
5     Bookstore  0.04
6   Pizza Place  0.03
7  Cocktail Bar  0.03
8      Wine Bar  0.03
9      Beer Bar  0.03


----Blackwall----
                  venue  freq
0           Coffee Shop  0.07
1                 Hotel  0.06
2                   Pub  0.06
3                  Park  0.05
4          Burger Joint  0.04
5    Italian Restaurant  0.04
6                   Bar  0.03
7  Gym / Fitness Center  0.03
8                 Plaza  0.03
9                 Diner  0.02


----Bow----
                venue  f

## Creating new dataframe

In [173]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending = False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Then we create a new panda dataframe with 10 most common venues as shown below:

In [176]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = ld_grouped['Neighbourhood']
for ind in np.arange(ld_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ld_grouped.iloc[ind, :], num_top_venues)
neighbourhoods_venues_sorted.head(5)

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Beckton,Hotel,Coffee Shop,Grocery Store,Pub,Supermarket,Indian Restaurant,Sandwich Place,Fast Food Restaurant,Park,Café
1,Bethnal Green,Coffee Shop,Café,Bakery,Bookstore,Hotel,Pub,Wine Bar,Restaurant,Cocktail Bar,Beer Bar
2,Blackwall,Coffee Shop,Hotel,Pub,Park,Burger Joint,Italian Restaurant,Bar,Gym / Fitness Center,Plaza,Steakhouse
3,Bow,Pub,Café,Park,Coffee Shop,Canal Lock,Art Gallery,Turkish Restaurant,Bar,Thai Restaurant,Pizza Place
4,Bromley (also Bromley-by-Bow),Pub,Café,Park,Coffee Shop,Canal Lock,Art Gallery,Turkish Restaurant,Bar,Thai Restaurant,Pizza Place


## Clustering of Neighbourhoods
We create the grouped clustering for the neighbourhood as shown below:

In [177]:
ld_grouped_clustering = ld_grouped.drop('Neighbourhood', 1)

And then create clusters of the neighbourhood using the k-means to cluster the neighbourhood into 5 clusters

In [178]:
# set number of clusters
kclusters = 5
# run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state=0).fit(ld_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 1, 4, 3, 3, 1, 4, 2, 4, 2], dtype=int32)

In [179]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
ld_merged = tdf
# match/merge SE London data with latitude/longitude for each neighborhood
ld_merged_latlong = ld_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on = 'Location')
ld_merged_latlong.head(5)

Unnamed: 0,Location,Borough,Postcode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Beckton,Newham,E6,51.53292,0.05461,2,Hotel,Coffee Shop,Grocery Store,Pub,Supermarket,Indian Restaurant,Sandwich Place,Fast Food Restaurant,Park,Café
1,Beckton,Newham,E16,51.50913,0.01528,2,Hotel,Coffee Shop,Grocery Store,Pub,Supermarket,Indian Restaurant,Sandwich Place,Fast Food Restaurant,Park,Café
2,Beckton,Newham,IG11,51.53312,0.084077,2,Hotel,Coffee Shop,Grocery Store,Pub,Supermarket,Indian Restaurant,Sandwich Place,Fast Food Restaurant,Park,Café
3,Bethnal Green,Tower Hamlets,E2,51.52669,-0.06257,1,Coffee Shop,Café,Bakery,Bookstore,Hotel,Pub,Wine Bar,Restaurant,Cocktail Bar,Beer Bar
4,Blackwall,Tower Hamlets,E14,51.51122,-0.01264,4,Coffee Shop,Hotel,Pub,Park,Burger Joint,Italian Restaurant,Bar,Gym / Fitness Center,Plaza,Steakhouse


## Visualizing the Resulting Clusters —
To visualize the clusters, we have the following:

In [184]:
ld_clusters = ld_merged_latlong
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)
# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ld_clusters['Latitude'], ld_clusters['Longitude'], ld_clusters['Location'], ld_clusters['Cluster Labels']):
    label = folium.Popup(str(poi) + 'Cluster '  + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=20,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
display(map_clusters)

In [187]:
# Cluster 1
ld_clusters.loc[ld_clusters['Cluster Labels'] == 0, ld_clusters.columns[[1] + list(range(5, ld_clusters.shape[1]))]]
# Cluster 2
ld_clusters.loc[ld_clusters['Cluster Labels'] == 1, ld_clusters.columns[[1] + list(range(5, ld_clusters.shape[1]))]]
# Cluster 3
ld_clusters.loc[ld_clusters['Cluster Labels'] == 2, ld_clusters.columns[[1] + list(range(5, ld_clusters.shape[1]))]]
# Cluster 4
ld_clusters.loc[ld_clusters['Cluster Labels'] == 3, ld_clusters.columns[[1] + list(range(5, ld_clusters.shape[1]))]]
# Cluster 5
ld_clusters.loc[ld_clusters['Cluster Labels'] == 4, ld_clusters.columns[[1] + list(range(5, ld_clusters.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Tower Hamlets,4,Coffee Shop,Hotel,Pub,Park,Burger Joint,Italian Restaurant,Bar,Gym / Fitness Center,Plaza,Steakhouse
8,Tower Hamlets,4,Coffee Shop,Hotel,Pub,Park,Burger Joint,Italian Restaurant,Bar,Gym / Fitness Center,Plaza,Steakhouse
10,Tower Hamlets,4,Coffee Shop,Hotel,Pub,Park,Burger Joint,Italian Restaurant,Bar,Gym / Fitness Center,Plaza,Steakhouse
16,Tower Hamlets,4,Coffee Shop,Hotel,Pub,Park,Burger Joint,Italian Restaurant,Bar,Gym / Fitness Center,Plaza,Steakhouse
17,Tower Hamlets,4,Coffee Shop,Hotel,Pub,Park,Burger Joint,Italian Restaurant,Bar,Gym / Fitness Center,Plaza,Steakhouse
18,Tower Hamlets,4,Coffee Shop,Hotel,Pub,Park,Burger Joint,Italian Restaurant,Bar,Gym / Fitness Center,Plaza,Steakhouse
23,Tower Hamlets,4,Coffee Shop,Hotel,Pub,Park,Burger Joint,Italian Restaurant,Bar,Gym / Fitness Center,Plaza,Steakhouse
27,Tower Hamlets,4,Coffee Shop,Hotel,Pub,Park,Burger Joint,Italian Restaurant,Bar,Gym / Fitness Center,Plaza,Steakhouse
31,Redbridge,4,Grocery Store,Coffee Shop,Pub,Italian Restaurant,Metro Station,Café,Restaurant,Park,Supermarket,English Restaurant
35,Tower Hamlets,4,Hotel,Coffee Shop,Cocktail Bar,Art Gallery,Gym / Fitness Center,Seafood Restaurant,Grocery Store,Italian Restaurant,Theater,French Restaurant


# Results
The following are the highlights of the 5 clusters above:
1. Pub,Cafe, Coffee Shops are popular in east London.
2. As for restaurants, the Indian,Italian Restaurants and turkish restaurants are very popular in the east London area. 
3. With the above areas being the most condensed area of Asians in the London , it is surprising to see how in the top 10 venues, you can barely see Asian restaurants in the top 5 venues.
4. Although, the Clusters have variations, a very visible presence is the predominance of Hotels and coffee shops.

# Discussion and Conclusion:

Canary Warf, bethnal green and Blackwall are viable options for an asian restaurant. They have a large Asian popullation but very few Asian restaurants.
In conclusion, this project would have had better results if there were more data in terms of crime data within the area, traffic access and allowance of more venues exploration with the Foursquare (limited venues for free calls).
Also, getting the ratings and feedback of the current restaurants within the clusters would have helped in providing more insight into the best location.