## Imports and such things

In [65]:
import os
import re
import urllib.request
import warnings

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Scrape list of largest cities from Wikipedia

The Wikipedia page [List of largest cities](https://en.wikipedia.org/wiki/List_of_largest_cities) has a list of the largest cities in the world.  
We are interested in the table that lists the cities, and want each city's name, its nation, and its city-proper population.  
In addition, we grab the link to each city's own page so that we can scrape more information from it later.

We will first use urllib to fetch the page, then BeautifulSoup to parse it and find the first (and only) sortable table.

In [6]:
# location of the Wikipedia article
url = "https://en.wikipedia.org/wiki/List_of_largest_cities"

# fetch the article; the with-block closes the HTTP response as soon as the
# body has been read instead of leaving the connection open
with urllib.request.urlopen(url) as req:
    article = req.read().decode()

# parse with BeautifulSoup and find the first sortable table
soup = BeautifulSoup(article, 'html.parser')
table = soup.find('table', class_='sortable')

# soup.find returns None when nothing matches; fail here with a clear message
# rather than later with an opaque "NoneType has no attribute 'find_all'"
if table is None:
    raise RuntimeError("No sortable table found at " + url)

Every row in the table also has a link to that city's Wikipedia page. We will use the following function to scrape specific information from that page.

In [140]:
# Scrape an individual city's page for its coordinates
def scrape_wiki_city(url):
    """Fetch a page and return the first coordinate pair found in its source.

    The page text is searched for the first occurrence of
    ``"lat":<number>,"lon":<number>`` and both numbers are converted to float.

    Parameters
    ----------
    url : str
        Address of the page to download (anything ``urllib.request.urlopen``
        accepts).

    Returns
    -------
    tuple of (float, float)
        ``(latitude, longitude)`` as found in the page source.

    Raises
    ------
    ValueError
        If the page contains no numeric ``"lat"``/``"lon"`` pair.
    """
    # the with-block guarantees the HTTP response is closed after reading
    with urllib.request.urlopen(url) as req:
        article = req.read().decode()

    # only accept real numbers (optional sign, fraction and exponent) so that
    # values such as null or nested objects are never handed to float()
    number = r'-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?'
    reg = re.search(
        r'"lat":\s*(' + number + r')\s*,\s*"lon":\s*(' + number + r')',
        article,
    )
    if reg is None:
        raise ValueError("No coordinates found on page: {}".format(url))

    lat = float(reg.group(1))
    lon = float(reg.group(2))
    return lat, lon

In [159]:
# column layout of the resulting city table
cols = ["City", "Nation", "Population", "Latitude", "Longitude", "URL"]

# number of table rows to read, counting the heading rows that are skipped
# below; raise this to scrape more cities (each city costs one extra page fetch)
MAX_TABLE_ROWS = 7

# removes bracketed footnote markers such as "[12]" from the population cell;
# compiled once instead of once per row
footnote_pattern = re.compile(r'\[.*\]')

# Collect one record per city and build the DataFrame in a single step:
# DataFrame.append copied the whole frame on every call and was removed in
# pandas 2.0.
city_records = []

# iterate through the rows in the table:
for tr in table.find_all('tr', limit=MAX_TABLE_ROWS):
    tds = tr.find_all('td')
    if not tds:
        continue                            # skips heading rows, which have no td cells
    nation = tds[0].find('a').string        # first td column contains nation
    try:
        # rough but working way to parse the population count:
        # drop footnote markers and thousands separators, then convert
        pop = int(footnote_pattern.sub("", tds[2].text).replace(',', ''))
    except ValueError:
        pop = 0                             # best effort: unparseable counts are stored as 0
    city_a = tr.find('th').find('a')        # the th cell holds the <a> link to the city's page
    city = city_a.string
    url = "https://en.wikipedia.org" + city_a['href']
    lat, lon = scrape_wiki_city(url)
    city_records.append({
        'City': city,
        'Nation': nation,
        'Population': pop,
        'Latitude': lat,
        'Longitude': lon,
        'URL': url
    })

# the original `df_cities['Population'].astype(int)` discarded its result;
# here the integer dtype is actually applied to the finished frame
df_cities = pd.DataFrame(city_records, columns=cols).astype({'Population': 'int64'})

In [160]:
# preview the first rows of the scraped city table
df_cities.head()

Unnamed: 0,City,Nation,Population,Latitude,Longitude,URL
0,Chongqing,China,30751600,29.558333,106.566667,https://en.wikipedia.org/wiki/Chongqing
1,Shanghai,China,24256800,31.228611,121.474722,https://en.wikipedia.org/wiki/Shanghai
2,Delhi,India,11034555,28.61,77.23,https://en.wikipedia.org/wiki/Delhi
3,Beijing,China,21516000,39.916667,116.383333,https://en.wikipedia.org/wiki/Beijing
4,Dhaka,Bangladesh,14399000,23.716111,90.396111,https://en.wikipedia.org/wiki/Dhaka


## Putting these cities on the map

In [158]:
# create a map of the world centred on latitude/longitude (0, 0)
map_world = folium.Map(location=[0, 0], zoom_start=2)

# add one circle marker per city to the map
for lat, lng, city, nation, pop in zip(df_cities['Latitude'], df_cities['Longitude'], df_cities['City'], df_cities['Nation'], df_cities['Population']):
    label = '{}, {}: {}'.format(city, nation, pop)
    # parse_html=True escapes the scraped text, so a name containing quotes or
    # markup characters is shown literally instead of being injected into the
    # generated map HTML
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
    ).add_to(map_world)

# last expression of the cell: renders the map inline
map_world

## Getting more information from Foursquare

These credentials and settings are needed to connect to the Foursquare API.

In [162]:
# Foursquare credentials are read from the environment so that secrets never
# live in the notebook. Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel.
# NOTE: keys that were previously hardcoded in this cell should be treated as
# leaked and rotated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # value sent as the `limit` query parameter of each venue request

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        "FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; "
        "Foursquare requests will fail until both are exported."
    )

In [164]:
def _fetch_top_picks(lat, lon, timeout=30):
    """Request the Foursquare "topPicks" recommendations around one coordinate.

    Returns the list of items of the first result group, or an empty list when
    the response contains no groups. Raises ``requests.HTTPError`` when the API
    answers with an error status (for example, rejected credentials).
    """
    # let requests build and encode the query string instead of formatting it by hand
    params = {
        'section': 'topPicks',
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, lon),
        'limit': LIMIT,
    }
    # a timeout keeps one stalled request from hanging the whole notebook
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        params=params,
        timeout=timeout,
    )
    response.raise_for_status()
    groups = response.json().get('response', {}).get('groups', [])
    return groups[0]['items'] if groups else []


def getRecommendedVenues(cities, latitudes, longitudes):
    """Collect the recommended Foursquare venues for each city.

    Parameters
    ----------
    cities, latitudes, longitudes : iterable
        Parallel sequences holding, per city, its name and coordinates.
        They are consumed together with ``zip``.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns ``City``, ``Latitude``,
        ``Longitude``, ``Venue``, ``Venue Latitude``, ``Venue Longitude`` and
        ``Venue Category``. The frame is empty (but still has these columns)
        when no city or no venue is found.

    Raises
    ------
    requests.HTTPError
        If the Foursquare API returns an error status for any city.
    """
    columns = ['City',
               'Latitude',
               'Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for city, lat, lon in zip(cities, latitudes, longitudes):

        # progress output, one line per city
        print("city:{}".format(city))

        # keep only the relevant information for each recommended venue
        for item in _fetch_top_picks(lat, lon):
            venue = item['venue']
            # a venue may come without any category; store None instead of crashing
            categories = venue.get('categories') or []
            venue_rows.append((
                city,
                lat,
                lon,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor also works for zero rows, where
    # assigning `.columns` afterwards raised a length-mismatch ValueError
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

In [165]:
# query Foursquare once per scraped city and gather all venues in one frame
world_venues = getRecommendedVenues(
    cities=df_cities['City'],
    latitudes=df_cities['Latitude'],
    longitudes=df_cities['Longitude'],
)

# show the first rows of the result
world_venues.head()

city:Chongqing
city:Shanghai
city:Delhi
city:Beijing
city:Dhaka


Unnamed: 0,City,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Chongqing,29.558333,106.566667,Hongyadong (洪崖洞),29.564942,106.574742,Shopping Mall
1,Chongqing,29.558333,106.566667,Hyatt Regency Chongqing (重庆富力凯悦酒店),29.583032,106.532228,Hotel
2,Chongqing,29.558333,106.566667,一棵树,29.54833,106.59814,Scenic Lookout
3,Chongqing,29.558333,106.566667,重庆国泰艺术中心 Cathay Pacific Art Centre,29.563591,106.574421,Public Art
4,Chongqing,29.558333,106.566667,Paulaner Brauhaus,29.538571,106.557791,German Restaurant


In [166]:
# display the collected venues
# NOTE(review): pd.set_option('display.max_rows', None) is set in the imports
# cell, so this renders every row of the frame — consider .head()/.sample()
world_venues

Unnamed: 0,City,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Chongqing,29.558333,106.566667,Hongyadong (洪崖洞),29.564942,106.574742,Shopping Mall
1,Chongqing,29.558333,106.566667,Hyatt Regency Chongqing (重庆富力凯悦酒店),29.583032,106.532228,Hotel
2,Chongqing,29.558333,106.566667,一棵树,29.54833,106.59814,Scenic Lookout
3,Chongqing,29.558333,106.566667,重庆国泰艺术中心 Cathay Pacific Art Centre,29.563591,106.574421,Public Art
4,Chongqing,29.558333,106.566667,Paulaner Brauhaus,29.538571,106.557791,German Restaurant
5,Chongqing,29.558333,106.566667,TT酒吧,29.557011,106.571404,Nightclub
6,Chongqing,29.558333,106.566667,The Harp Irish Pub,29.565116,106.57531,Bar
7,Chongqing,29.558333,106.566667,重庆八一路好吃街,29.558669,106.573631,Chinese Restaurant
8,Chongqing,29.558333,106.566667,重庆Muse酒吧,29.556987,106.571565,Nightclub
9,Chongqing,29.558333,106.566667,Blue Frog (蓝蛙),29.580209,106.528782,American Restaurant
