# Which borough for a London restaurant

## Table of Contents

1. <a href="#item1">Import necessary Libraries</a>    
2. <a href="#item2">Use BeautifulSoup to pull data out of HTML and XML files</a> 
3. <a href="#item3">Use the geopy library to get the latitude and longitude values</a> 
4. <a href="#item4">Use Foursquare to get the top 100 venues from each Borough</a> 


### 1. Import necessary Libraries

In [1]:
import requests # library to handle requests
import pandas as pd # library for data analysis
import numpy as np # library to handle data in a vectorized manner

from bs4 import BeautifulSoup # library for pulling data out of HTML and XML files
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values


print('Libraries imported.')

Libraries imported.


### 2. Use BeautifulSoup to pull data out of HTML and XML files

In [2]:
# Download the Wikipedia page that lists the London boroughs
url = 'https://en.wikipedia.org/wiki/London_boroughs'
response = requests.get(url)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
data_xml = response.text
soup_postal_codes = BeautifulSoup(data_xml, 'lxml')

# Locate the boroughs table in the page markup
Table_boroughs = soup_postal_codes.find('table', {'class': 'wikitable sortable'})
if Table_boroughs is None:
    raise ValueError("No 'wikitable sortable' table found at {}".format(url))

# Labels for the first seven cells of every table row
column_names = ['Borough', 'Designation', 'Former areas 1', 'Former areas 2',
                'Former areas 3', 'Former areas 4', 'Former areas 5']

# One list of <td> texts per <tr>; the first <tr> (the header row) is dropped
Table_boroughs = Table_boroughs.find_all('tr')
boroughs_list = [[cell.text for cell in balise_tr.find_all('td')]
                 for balise_tr in Table_boroughs][1:]

# Every remaining row must provide at least the seven labelled cells
expected_cells = len(column_names)
for row in boroughs_list:
    if len(row) < expected_cells:
        raise ValueError('Expected at least {} cells per table row, got {}: {!r}'.format(
            expected_cells, len(row), row))

# Build the frame in a single step (DataFrame.append was removed in pandas 2.0
# and copied the whole frame on every call)
Borough_table = pd.DataFrame([row[:expected_cells] for row in boroughs_list],
                             columns=column_names)

# clean data from :\n and [notes]
Borough_table['Borough'] = Borough_table['Borough'].str.strip('\n')
Borough_table['Designation'] = Borough_table['Designation'].str.strip('\n')

# Keep only the text in front of the first '[' (drops footnote markers such as "[note 1]")
result_split = Borough_table['Borough'].str.split('[')
Borough_list = [parts[0] for parts in result_split]
Borough_table['Borough'] = Borough_list

# Independent copy of the two columns used by the rest of the notebook
Borough_table_IO = Borough_table[['Borough', 'Designation']].copy()



In [3]:
# Preview the first rows of the Borough / Designation table
Borough_table_IO.head()

Unnamed: 0,Borough,Designation
0,Greenwich,Inner
1,Hackney,Inner
2,Hammersmith,Inner
3,Islington,Inner
4,Kensington and Chelsea,Inner


### 3. Use geopy library to get the latitude and longitude values

In [4]:
# Geocoding helper used to look up each borough
def get_coordinate(address):
    """Geocode *address* with Nominatim and return the geocoder's result.

    A new ``Nominatim`` client (user agent ``"explorer"``) is created on
    every call. The value returned by its ``geocode`` method is handed back
    unchanged; callers in this notebook read ``.latitude`` and
    ``.longitude`` from it.
    """
    return Nominatim(user_agent="explorer").geocode(address)

In [5]:
# Geocode every borough; each entry is [borough, latitude, longitude]
Borough_list_IO = []

for Borough in Borough_table_IO.Borough:
    LD_loc = get_coordinate(Borough + ', London, UK')
    if LD_loc is None:
        # The geocoder found no match: report it and move on rather than
        # failing with AttributeError on `.latitude`
        print('No coordinates found for {}; skipped.'.format(Borough))
        continue
    Borough_list_IO.append([Borough, LD_loc.latitude, LD_loc.longitude])


In [6]:
# Empty table with the three coordinate columns.
# Note: this rebinds `column_names`, which earlier held the labels of the
# scraped Wikipedia table.

column_names = ['Borough', 'latitude', 'longitude'] 
Borough_lat_long = pd.DataFrame(columns=column_names)


In [7]:
# Build the coordinates table in one step from the [borough, latitude, longitude]
# rows. DataFrame.append was removed in pandas 2.0; constructing the frame
# directly also makes the cell safe to re-run (no duplicated rows) and gives
# numeric latitude/longitude columns.
Borough_lat_long = pd.DataFrame(Borough_list_IO,
                                columns=['Borough', 'latitude', 'longitude'])


In [8]:
# Preview the first rows of the borough coordinates table
Borough_lat_long.head()

Unnamed: 0,Borough,latitude,longitude
0,Greenwich,51.482084,-0.004542
1,Hackney,51.54324,-0.049362
2,Hammersmith,51.492038,-0.22364
3,Islington,51.538429,-0.099905
4,Kensington and Chelsea,51.498995,-0.199123


### 4. Use Foursquare to get the top 100 venues from each Borough

In [9]:
# The code was removed by Watson Studio for sharing.

In [10]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100, VERSION='20120618'):
    """Collect Foursquare "explore" venues around each named location.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Location labels and their coordinates, consumed in lockstep.
    radius : int, optional
        Forwarded as the ``radius`` query parameter.
    LIMIT : int, optional
        Forwarded as the ``limit`` query parameter.
    VERSION : str, optional
        Forwarded as the ``v`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Borough',
        'Borough Latitude', 'Borough Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame has these columns
        even when no venue was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads the module-level ``CLIENT_ID`` and ``CLIENT_SECRET`` and prints
    each location name before querying it.
    """
    venue_columns = ['Borough',
                     'Borough Latitude',
                     'Borough Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    explore_url = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout keeps a stalled connection from
        # hanging the notebook forever
        response = requests.get(explore_url, params=query, timeout=30)
        response.raise_for_status()

        # A response without groups simply contributes no venues
        groups = response.json()['response'].get('groups') or []
        results = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # Venues without any category get None instead of raising IndexError
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the labels to the constructor keeps the columns on an empty result
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return nearby_venues

In [11]:
# Query the venues around every geocoded borough
# (getNearbyVenues prints each borough name as it is processed)
london_venues = getNearbyVenues(
    Borough_lat_long['Borough'],
    Borough_lat_long['latitude'],
    Borough_lat_long['longitude'],
)

Greenwich
Hackney
Hammersmith
Islington
Kensington and Chelsea
Lambeth
Lewisham
Southwark
Tower Hamlets
Wandsworth
Westminster
Barking
Barnet
Bexley
Brent
Bromley
Croydon
Ealing
Enfield
Haringey
Harrow
Havering
Hillingdon
Hounslow
Kingston upon Thames
Merton
Newham
Redbridge
Richmond upon Thames
Sutton
Camden
Waltham Forest


In [16]:
# Preview the first rows of the venues table
london_venues.head()

Unnamed: 0,Borough,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Greenwich,51.482084,-0.004542,Old Royal Naval College,51.483234,-0.005579,Historic Site
1,Greenwich,51.482084,-0.004542,Painted Hall,51.482889,-0.00642,Museum
2,Greenwich,51.482084,-0.004542,National Maritime Museum,51.481329,-0.005581,History Museum
3,Greenwich,51.482084,-0.004542,Greenwich Naval College Gardens,51.483007,-0.008362,Garden
4,Greenwich,51.482084,-0.004542,The Plume of Feathers,51.481945,-0.001126,Pub


### Let's find out how many unique categories can be curated from all the returned venues

In [12]:
# Number of distinct venue categories; dropna=False counts a missing
# category as a value, exactly as len(Series.unique()) does
category_count = london_venues['Venue Category'].nunique(dropna=False)
print('There are {} uniques categories.'.format(category_count))


There are 222 uniques categories.


### Analyze Each Neighborhood


In [19]:
# One indicator column per venue category, one row per venue
london_onehot = pd.get_dummies(london_venues[['Venue Category']], prefix="", prefix_sep="")

# Tag every row with the borough its venue belongs to
london_onehot['Borough'] = london_venues['Borough']

# Put the Borough column first and leave the category columns in their existing order
fixed_columns = ['Borough'] + [col for col in london_onehot.columns if col != 'Borough']

london_onehot = london_onehot[fixed_columns]
london_onehot.head()

Unnamed: 0,Borough,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Aquarium,Arcade,Arepa Restaurant,Art Gallery,Arts & Crafts Store,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Greenwich,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Greenwich,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Greenwich,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Greenwich,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Greenwich,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Let's print each borough along with the top 5 most common venues


In [21]:
# Average the one-hot rows per borough, so every cell holds the mean of that
# category's indicator over the borough's venues
london_grouped = london_onehot.groupby('Borough').mean().reset_index()

num_top_venues = 5

# Walk the grouped table row by row: the row label is the borough and the
# row values are its per-category frequencies
for hood, shares in london_grouped.set_index('Borough').iterrows():
    print("----"+hood+"----")
    temp = pd.DataFrame({'venue': shares.index,
                         'freq': shares.to_numpy(dtype=float)})
    temp = temp.round({'freq': 2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Barking----
                venue  freq
0       Grocery Store  0.15
1         Supermarket  0.10
2            Pharmacy  0.05
3                 Gym  0.05
4  Chinese Restaurant  0.05


----Barnet----
                  venue  freq
0                   Pub  0.29
1  Fast Food Restaurant  0.14
2     French Restaurant  0.14
3           Supermarket  0.14
4                   Gym  0.14


----Bexley----
                  venue  freq
0                   Pub  0.18
1  Fast Food Restaurant  0.18
2    Italian Restaurant  0.09
3         Train Station  0.09
4        Breakfast Spot  0.09


----Brent----
                  venue  freq
0          Cupcake Shop   0.2
1     Indian Restaurant   0.2
2                  Park   0.2
3  Gym / Fitness Center   0.2
4  Fast Food Restaurant   0.2


----Bromley----
                  venue  freq
0           Coffee Shop  0.13
1        Clothing Store  0.11
2                   Pub  0.07
3  Gym / Fitness Center  0.04
4           Pizza Place  0.04


----Camden----
          v

### Let's put that into a pandas dataframe

In [22]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of *row*.

    The first element of *row* is skipped (the callers pass rows whose first
    entry is the borough name). The remaining entries are sorted by value in
    descending order and the index labels of the first *num_top_venues* of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [23]:
num_top_venues = 10

# Ordinal suffixes for numbers ending in 1, 2 and 3; everything else takes 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# 'Borough', '1st Most Common Venue', '2nd Most Common Venue', ...
columns = ['Borough']
for ind in range(num_top_venues):
    position = ind + 1
    # An explicit test replaces the former bare `except:`; 11, 12 and 13
    # (and 111, 112, ...) are the exceptions that still take 'th'
    if position % 10 in (1, 2, 3) and position % 100 not in (11, 12, 13):
        suffix = indicators[position % 10 - 1]
    else:
        suffix = 'th'
    columns.append('{}{} Most Common Venue'.format(position, suffix))

# create a new dataframe with one row per borough
borough_venues_sorted = pd.DataFrame(columns=columns)
borough_venues_sorted['Borough'] = london_grouped['Borough']

# fill every row with that borough's most frequent venue categories
for ind in range(london_grouped.shape[0]):
    borough_venues_sorted.iloc[ind, 1:] = return_most_common_venues(london_grouped.iloc[ind, :], num_top_venues)

borough_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Barking,Grocery Store,Supermarket,Pub,Café,Steakhouse,Theater,Chinese Restaurant,Spa,Sandwich Place,Gym
1,Barnet,Pub,Gym,Supermarket,Fast Food Restaurant,Train Station,French Restaurant,Event Space,Food & Drink Shop,Food,Flea Market
2,Bexley,Fast Food Restaurant,Pub,Indian Restaurant,Train Station,Greek Restaurant,Bar,Toy / Game Store,Breakfast Spot,Italian Restaurant,Dim Sum Restaurant
3,Brent,Indian Restaurant,Gym / Fitness Center,Park,Fast Food Restaurant,Cupcake Shop,Hostel,Event Space,Food,Flea Market,Fish Market
4,Bromley,Coffee Shop,Clothing Store,Pub,Pizza Place,Gym / Fitness Center,Burger Joint,Mexican Restaurant,Department Store,Bookstore,Furniture / Home Store
