## Creating Heatmap for the Busiest Airports in The world

### Scrape data from wikipedia using scrapy

In [1]:
import scrapy
import csv
from scrapy.crawler import CrawlerProcess
from scrapy import Request





class BusyAirportSpider(scrapy.Spider):
    name = 'busy_airport'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/List_of_busiest_airports_by_passenger_traffic']

    
    def parse(self, response):
        country = response.xpath('//*[@id="mw-content-text"]/div/table[1]/tbody/tr/td/span/a/@title').extract()
        airport =  response.xpath('//*[@id="mw-content-text"]/div/table[1]/tbody/tr/td[2]/a/@title').extract()
        location = response.xpath('//*[@id="mw-content-text"]/div/table[2]/tbody/tr/td[3]/a[1]/text()').extract()
        no_of_passengers = response.xpath('//*[@id="mw-content-text"]/div/table[2]/tbody/tr/td[6]/text()').extract()
        for i in range(len(response.xpath('//*[@id="mw-content-text"]/div/table[1]/tbody/tr/td[2]/a/@title').extract())):
            yield {'Country':country[i],'Airport':airport[i],'location':location[i],'no_of_passengers':no_of_passengers[i]}



In [2]:
process = CrawlerProcess({'FEED_URI': 'Busy_airport.csv','FEED_FORMAT': 'csv'})
process.crawl(BusyAirportSpider)
process.start()

2020-06-15 17:26:06 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2020-06-15 17:26:06 [scrapy.utils.log] INFO: Overridden settings: {'FEED_FORMAT': 'csv', 'FEED_URI': 'Busy_airport.csv'}
2020-06-15 17:26:06 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2020-06-15 17:26:06 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.

2020-06-15 17:26:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/List_of_busiest_airports_by_passenger_traffic>
{'Country': 'United States', 'Airport': 'San Francisco International Airport', 'location': 'Barajas', 'no_of_passengers': '57,891,340'}
2020-06-15 17:26:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/List_of_busiest_airports_by_passenger_traffic>
{'Country': 'China', 'Airport': 'Chengdu Shuangliu International Airport', 'location': 'San Mateo County', 'no_of_passengers': '57,793,313'}
2020-06-15 17:26:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/List_of_busiest_airports_by_passenger_traffic>
{'Country': 'China', 'Airport': "Shenzhen Bao'an International Airport", 'location': 'Shuangliu', 'no_of_passengers': '52,950,000'}
2020-06-15 17:26:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikipedia.org/wiki/List_of_busiest_airports_by_passenger_traffic>
{'Country': 'Sp

2020-06-15 17:26:07 [scrapy.core.engine] INFO: Spider closed (finished)


In [3]:
#load the data using pandas
import pandas as pd
df3 = pd.read_csv('Busy_airport.csv')

In [4]:
df3.head()

Unnamed: 0,Country,Airport,location,no_of_passengers
0,United States,Hartsfield–Jackson Atlanta International Airport,Atlanta,107394029
1,China,Beijing Capital International Airport,Chaoyang,100983290
2,United States,Los Angeles International Airport,Garhoud,89149387
3,Japan,Haneda Airport,Los Angeles,87534384
4,United Arab Emirates,Dubai International Airport,Ōta,87131973


In [5]:
df3.shape

(50, 4)

In [6]:
df3.info

<bound method DataFrame.info of                  Country                                            Airport  \
0          United States   Hartsfield–Jackson Atlanta International Airport   
1                  China              Beijing Capital International Airport   
2          United States                  Los Angeles International Airport   
3                  Japan                                     Haneda Airport   
4   United Arab Emirates                        Dubai International Airport   
5          United States                       O'Hare International Airport   
6         United Kingdom                            London Heathrow Airport   
7                  China              Shanghai Pudong International Airport   
8                 France                          Charles de Gaulle Airport   
9          United States            Dallas/Fort Worth International Airport   
10                 China             Guangzhou Baiyun International Airport   
11           Netherl

In [7]:
import numpy as np
# Create a empty Column to insert lattitude and longitude values of the aiport locations
df3['lat'] = np.nan
df3['lon'] = np.nan

In [8]:
#Using Geocoder we get the lattitude and longitude values of the airport locations
#! pip install geocoder
import geocoder
#to show the progress of the process
from tqdm import tqdm

for i in tqdm(range(df3.shape[0])):
    co_ord = geocoder.osm(('{s}').format(s = df3['Airport'][i]))
    if co_ord.lat and co_ord.lng  != np.nan:
        df3['lat'][i] = co_ord.lat
        df3['lon'][i] = co_ord.lng
    else: 
        co_ord = geocoder.osm(('{l}').format(l = df3['Country'][i]))
        df3['lat'][i] = co_ord.lat
        df3['lon'][i] = co_ord.lng

  0%|          | 0/50 [00:00<?, ?it/s]2020-06-15 17:26:17 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): nominatim.openstreetmap.org:443
2020-06-15 17:26:18 [urllib3.connectionpool] DEBUG: https://nominatim.openstreetmap.org:443 "GET /search?q=Hartsfield%E2%80%93Jackson+Atlanta+International+Airport&format=jsonv2&addressdetails=1&limit=1 HTTP/1.1" 200 None
2020-06-15 17:26:18 [geocoder.base] INFO: Requested https://nominatim.openstreetmap.org/search?q=Hartsfield%E2%80%93Jackson+Atlanta+International+Airport&format=jsonv2&addressdetails=1&limit=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexi

 28%|██▊       | 14/50 [00:06<00:18,  1.98it/s]2020-06-15 17:26:24 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): nominatim.openstreetmap.org:443
2020-06-15 17:26:25 [urllib3.connectionpool] DEBUG: https://nominatim.openstreetmap.org:443 "GET /search?q=Denver+International+Airport&format=jsonv2&addressdetails=1&limit=1 HTTP/1.1" 200 None
2020-06-15 17:26:25 [geocoder.base] INFO: Requested https://nominatim.openstreetmap.org/search?q=Denver+International+Airport&format=jsonv2&addressdetails=1&limit=1
 30%|███       | 15/50 [00:07<00:17,  1.98it/s]2020-06-15 17:26:25 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): nominatim.openstreetmap.org:443
2020-06-15 17:26:25 [urllib3.connectionpool] DEBUG: https://nominatim.openstreetmap.org:443 "GET /search?q=Seoul+Incheon+International+Airport&format=jsonv2&addressdetails=1&limit=1 HTTP/1.1" 200 None
2020-06-15 17:26:25 [geocoder.base] INFO: Requested https://nominatim.openstreetmap.org/search?q=Seoul+Inche

2020-06-15 17:26:33 [geocoder.base] INFO: Requested https://nominatim.openstreetmap.org/search?q=McCarran+International+Airport&format=jsonv2&addressdetails=1&limit=1
 60%|██████    | 30/50 [00:15<00:10,  1.91it/s]2020-06-15 17:26:33 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): nominatim.openstreetmap.org:443
2020-06-15 17:26:33 [urllib3.connectionpool] DEBUG: https://nominatim.openstreetmap.org:443 "GET /search?q=Orlando+International+Airport&format=jsonv2&addressdetails=1&limit=1 HTTP/1.1" 200 None
2020-06-15 17:26:33 [geocoder.base] INFO: Requested https://nominatim.openstreetmap.org/search?q=Orlando+International+Airport&format=jsonv2&addressdetails=1&limit=1
 62%|██████▏   | 31/50 [00:15<00:09,  1.93it/s]2020-06-15 17:26:33 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): nominatim.openstreetmap.org:443
2020-06-15 17:26:34 [urllib3.connectionpool] DEBUG: https://nominatim.openstreetmap.org:443 "GET /search?q=Toronto+Pearson+International+Air

 90%|█████████ | 45/50 [00:23<00:02,  1.96it/s]2020-06-15 17:26:40 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): nominatim.openstreetmap.org:443
2020-06-15 17:26:41 [urllib3.connectionpool] DEBUG: https://nominatim.openstreetmap.org:443 "GET /search?q=Shanghai+Hongqiao+International+Airport&format=jsonv2&addressdetails=1&limit=1 HTTP/1.1" 200 None
2020-06-15 17:26:41 [geocoder.base] INFO: Requested https://nominatim.openstreetmap.org/search?q=Shanghai+Hongqiao+International+Airport&format=jsonv2&addressdetails=1&limit=1
 92%|█████████▏| 46/50 [00:23<00:01,  2.11it/s]2020-06-15 17:26:41 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): nominatim.openstreetmap.org:443
2020-06-15 17:26:41 [urllib3.connectionpool] DEBUG: https://nominatim.openstreetmap.org:443 "GET /search?q=George+Bush+Intercontinental+Airport&format=jsonv2&addressdetails=1&limit=1 HTTP/1.1" 200 None
2020-06-15 17:26:41 [geocoder.base] INFO: Requested https://nominatim.openstreetmap.o

In [9]:
df3.head()

Unnamed: 0,Country,Airport,location,no_of_passengers,lat,lon
0,United States,Hartsfield–Jackson Atlanta International Airport,Atlanta,107394029,33.637799,-84.429271
1,China,Beijing Capital International Airport,Chaoyang,100983290,40.079285,116.594561
2,United States,Los Angeles International Airport,Garhoud,89149387,33.942168,-118.421376
3,Japan,Haneda Airport,Los Angeles,87534384,35.545721,139.780587
4,United Arab Emirates,Dubai International Airport,Ōta,87131973,25.251417,55.368541


In [10]:
#check for null values
df3.isna().sum()

Country             0
Airport             0
location            0
no_of_passengers    0
lat                 0
lon                 0
dtype: int64

In [11]:
df3.dtypes

Country              object
Airport              object
location             object
no_of_passengers     object
lat                 float64
lon                 float64
dtype: object

In [12]:
#change the datatype to integere
df3['no_of_passengers'] = df3['no_of_passengers'].str.replace(',', '').astype(int)

In [13]:
df3.dtypes

Country              object
Airport              object
location             object
no_of_passengers      int64
lat                 float64
lon                 float64
dtype: object

In [14]:
passengers = df3['no_of_passengers']/1000

In [15]:
passengers[0:5]

0    107394.029
1    100983.290
2     89149.387
3     87534.384
4     87131.973
Name: no_of_passengers, dtype: float64

In [16]:
#getting the lattitude longitude values with number of passengers
lat_lon_no= []
for i in range(df3.shape[0]):
    lat_lon_no.append([df3['lat'][i],df3['lon'][i],passengers[i]])

In [17]:
import folium
map_heatmap = folium.Map([40, -99], tiles='CartoDB Positron', zoom_start=4)

from folium import plugins
plugins.HeatMap(lat_lon_no).add_to(map_heatmap)
map_heatmap.save('map_busiest_heatmap.html')


map_heatmap