# Web scraping to get the cities data

In [1]:


import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import requests
import re


# Cities to scrape. Each name is interpolated as-is into the Wikipedia URL,
# so it must match the article title. Every city is listed exactly once
# (a repeated entry would be scraped twice and yield a duplicate row).
city_names = ['Berlin', 'Vienna', 'Rome', 'Paris', 'Barcelona', 'Milan', 'Amsterdam', 'Stockholm', 'Oslo', 'Helsinki', 'Prague',
              'Madrid', 'Budapest', 'Copenhagen', 'Athens', 'London', 'Cologne', 'Hamburg', 'Munich', 'Bucharest',
              'Dublin', 'Edinburgh', 'Sofia']

# Per-city results collected by the scraping loop below.
country = []
coordinates = []
population = []

def extract_population(text):
    """Return the largest comma-grouped integer found in ``text``.

    Only numbers written with thousands separators (e.g. ``3,677,472``) are
    considered; plain digit runs such as ``891`` are ignored.

    Parameters
    ----------
    text : str
        Free text to search (the caller passes the text of an infobox table).

    Returns
    -------
    int or None
        The largest matching number, or ``None`` when ``text`` contains no
        comma-grouped number (instead of raising ``ValueError`` from ``max``).

    NOTE(review): "largest number" is a heuristic. It may pick a metro-area
    figure or an unrelated large number rather than the city population.
    """
    numbers = re.findall(r'\b\d{1,3}(?:,\d{3})+\b', text)
    return max((int(num.replace(',', '')) for num in numbers), default=None)

# Scrape the Wikipedia infobox of every city for its country, coordinates and
# population. Exactly one value (or None) is appended to each result list per
# city, so the lists always stay aligned with city_names and the DataFrame
# below can be built even when a page or an infobox field is missing.
for city in city_names:
    url = f"https://en.wikipedia.org/wiki/{city}"

    city_country = None
    city_coordinates = None
    city_population = None

    # A timeout keeps one unresponsive request from hanging the whole cell.
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        # selecting country: prefer the first link text, else the raw cell text
        for header in soup.select("table.infobox tbody tr th"):
            if header.text == "Country":
                value_cell = header.find_next_sibling("td")
                if value_cell is not None:
                    links = value_cell.select("a")
                    city_country = links[0].get_text() if links else value_cell.get_text()
                break

        # selecting coordinates: keep only the first matching cell; the part
        # after the last "/" is split on ";" into two strings
        for cell in soup.select("table.infobox tbody tr td"):
            if cell.text.startswith("Coordinates: "):
                city_coordinates = cell.text.split("/")[-1].split(";")
                break

        # selecting population
        infobox = soup.find('table', class_='infobox')
        if infobox is not None:
            try:
                city_population = extract_population(infobox.text)
            except ValueError:
                # extract_population found no comma-grouped number
                city_population = None
    else:
        print(f"Could not fetch {url} (status {response.status_code})")

    country.append(city_country)
    coordinates.append(city_coordinates)
    population.append(city_population)

cities_df = pd.DataFrame(
    {"City" : city_names,
    "Country" : country,
    "Coordinates" : coordinates,
    "Population" : population
    }
)

In [2]:
# Display the cities table built from the scraped lists.
cities_df

Unnamed: 0,City,Country,Coordinates,Population
0,Berlin,Germany,"[ 52.52000, 13.40500]",6144600
1,Vienna,Austria,"[ 48.20833, 16.37250]",951354
2,Rome,Italy,"[ 41.89333, 12.48278]",860009
3,Paris,France,"[ 48.85667, 2.35222]",13024518
4,Barcelona,Spain,"[ 41.38278, 2.17694]",840000
5,Milan,Italy,"[ 45.46694, 9.19000]",371498
6,Amsterdam,Netherlands,"[ 52.37278, 4.89361]",480394
7,Stockholm,Sweden,"[ 59.32944, 18.06861]",2121000
8,Oslo,Norway,"[ 59.91333, 10.73889]",64235
9,Helsinki,Finland,"[ 60.17083, 24.93750]",559558


# Getting the weather data using an API

In [5]:
import pandas as pd
import requests
from datetime import datetime
import pytz

In [6]:
def get_weather_loop(cities):
  """Fetch OpenWeatherMap forecast entries for each city into one DataFrame.

  Parameters
  ----------
  cities : iterable of str
      City names passed as the ``q`` query parameter of the
      ``data/2.5/forecast`` endpoint (metric units).

  Returns
  -------
  pandas.DataFrame
      One row per forecast entry per city. ``rain`` and ``snow`` hold the
      ``3h`` value when the API reports one and the number ``0.0`` otherwise,
      so both columns stay numeric. ``information_retrieved_at`` is the time
      this function was called, expressed in Europe/Berlin local time.

  Cities whose request does not return HTTP 200 are reported and skipped
  instead of failing with ``KeyError: 'list'``.
  """
  import os  # function-local so the block does not depend on a new top-level import

  # NOTE(review): the literal fallback keeps the notebook runnable, but a key
  # committed to source should be rotated and supplied via OPENWEATHER_API_KEY.
  API_key = os.environ.get("OPENWEATHER_API_KEY", "155ac89acb627b07f1c68cb640aef942")

  tz = pytz.timezone('Europe/Berlin')
  retrieved_at = datetime.now().astimezone(tz).strftime("%d/%m/%Y %H:%M:%S")

  weather_dict = {'city': [],
                'country': [],
                'forecast_time': [],
                'outlook': [],
                'detailed_outlook': [],
                'temperature': [],
                'temperature_feels_like': [],
                'clouds': [],
                'rain': [],
                'snow': [],
                'wind_speed': [],
                'wind_deg': [],
                'humidity': [],
                'pressure': [],
                'information_retrieved_at': []}

  for city in cities:
    # params= lets requests URL-encode the city name; https keeps the key
    # out of clear-text traffic.
    response = requests.get(
      "https://api.openweathermap.org/data/2.5/forecast",
      params={"q": city, "appid": API_key, "units": "metric"},
      timeout=10,
    )
    if response.status_code != 200:
      print(f"Skipping {city}: weather API returned status {response.status_code}")
      continue

    forecast = response.json()

    for entry in forecast['list']:
      weather_dict['city'].append(forecast['city']['name'])
      weather_dict['country'].append(forecast['city']['country'])
      weather_dict['forecast_time'].append(entry['dt_txt'])
      weather_dict['outlook'].append(entry['weather'][0]['main'])
      weather_dict['detailed_outlook'].append(entry['weather'][0]['description'])
      weather_dict['temperature'].append(entry['main']['temp'])
      weather_dict['temperature_feels_like'].append(entry['main']['feels_like'])
      weather_dict['clouds'].append(entry['clouds']['all'])
      # 'rain' / 'snow' are absent from entries without precipitation.
      weather_dict['rain'].append((entry.get('rain') or {}).get('3h', 0.0))
      weather_dict['snow'].append((entry.get('snow') or {}).get('3h', 0.0))
      weather_dict['wind_speed'].append(entry['wind']['speed'])
      weather_dict['wind_deg'].append(entry['wind']['deg'])
      weather_dict['humidity'].append(entry['main']['humidity'])
      weather_dict['pressure'].append(entry['main']['pressure'])
      weather_dict['information_retrieved_at'].append(retrieved_at)

  return pd.DataFrame(weather_dict)

In [8]:
# Fetch the forecast for three sample cities and display the resulting frame.
cities_weather = get_weather_loop(['Berlin', 'London','Ottawa'])
cities_weather

Unnamed: 0,city,country,forecast_time,outlook,detailed_outlook,temperature,temperature_feels_like,clouds,rain,snow,wind_speed,wind_deg,humidity,pressure,information_retrieved_at
0,Berlin,DE,2023-08-13 15:00:00,Clear,clear sky,26.84,27.05,0,0,0,2.71,261,46,1005,13/08/2023 15:59:32
1,Berlin,DE,2023-08-13 18:00:00,Clouds,scattered clouds,26.04,26.04,29,0,0,1.08,178,47,1009,13/08/2023 15:59:32
2,Berlin,DE,2023-08-13 21:00:00,Clouds,broken clouds,23.34,23.24,59,0,0,0.78,174,58,1012,13/08/2023 15:59:32
3,Berlin,DE,2023-08-14 00:00:00,Clouds,scattered clouds,19.94,19.92,45,0,0,1.77,150,74,1016,13/08/2023 15:59:32
4,Berlin,DE,2023-08-14 03:00:00,Clear,clear sky,18.91,18.92,8,0,0,1.63,207,79,1016,13/08/2023 15:59:32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,Ottawa,CA,2023-08-18 00:00:00,Rain,light rain,20.47,20.77,99,0.22,0,3.47,187,84,1005,13/08/2023 15:59:32
116,Ottawa,CA,2023-08-18 03:00:00,Rain,light rain,18.41,18.66,84,0.13,0,2.45,245,90,1004,13/08/2023 15:59:32
117,Ottawa,CA,2023-08-18 06:00:00,Rain,moderate rain,17.21,17.55,92,6.17,0,4.61,205,98,1003,13/08/2023 15:59:32
118,Ottawa,CA,2023-08-18 09:00:00,Rain,light rain,15.45,15.48,100,1.33,0,5.67,274,93,1004,13/08/2023 15:59:32


# Getting the airports data using an API

In [11]:
def icao_airport_codes(latitudes, longitudes):
  """Search airports around each (latitude, longitude) pair.

  Each pair is sent to the ``airports/search`` endpoint of the
  aviation-reference-data RapidAPI service with ``radius`` set to ``"100"``
  (unit not visible here; presumably km, TODO confirm).

  Parameters
  ----------
  latitudes, longitudes : sequences of float
      Coordinates, paired by position. Both must have the same length.

  Returns
  -------
  pandas.DataFrame
      The JSON results of all successful searches, normalised and
      concatenated with a fresh index. Empty when there was no input or no
      search succeeded.

  Raises
  ------
  ValueError
      If ``latitudes`` and ``longitudes`` differ in length.
  """
  import os  # function-local so the block does not depend on a new top-level import

  if len(latitudes) != len(longitudes):
    raise ValueError("latitudes and longitudes must have the same length")

  url = "https://aviation-reference-data.p.rapidapi.com/airports/search"

  # NOTE(review): the literal fallback keeps the notebook runnable, but a key
  # committed to source should be rotated and supplied via RAPIDAPI_KEY.
  headers = {
    "X-RapidAPI-Key": os.environ.get("RAPIDAPI_KEY", "94f5115229mshc91b32bba10ca7ap184966jsne87554a73ea6"),
    "X-RapidAPI-Host": "aviation-reference-data.p.rapidapi.com"
  }

  list_for_df = []

  for lat, lon in zip(latitudes, longitudes):
    querystring = {"lat": lat, "lon": lon, "radius": "100"}

    response = requests.get(url, headers=headers, params=querystring, timeout=10)

    # Error payloads must not be normalised into the result as airport rows.
    if response.status_code != 200:
      print(f"Skipping ({lat}, {lon}): airport API returned status {response.status_code}")
      continue

    list_for_df.append(pd.json_normalize(response.json()))

  # pd.concat raises on an empty list, so return an empty frame explicitly.
  if not list_for_df:
    return pd.DataFrame()

  return pd.concat(list_for_df, ignore_index=True)

In [12]:
# coordinates for Berlin, Paris, London (paired by position)
latitudes = [52.5200, 48.8567, 51.5072]
longitudes = [13.4050, 2.3522, -0.1275]

# Search airports around each coordinate pair and display the combined frame.
icao_airport_codes(latitudes, longitudes)

Unnamed: 0,iataCode,icaoCode,name,alpha2countryCode,latitude,longitude
0,BER,,BRANDENBURG,DE,52.3621,13.5017
1,SXF,EDDB,SCHOENEFELD,DE,52.38,13.5225
2,THF,,Berlin Tempelhof Apt,DE,52.4736,13.4017
3,TXL,EDDT,TEGEL,DE,52.5597,13.2877
4,GWW,,Berlin Royal Air Force Gatow,DE,52.4833,13.1333
5,QPK,EDAY,STRAUSBERG,DE,52.5803,13.9172
6,REB,EDAX,LAERZ,DE,53.3048,12.746
7,BVA,LFOB,Paris Beauvais-Tille Airport,FR,49.4544,2.1128
8,CDG,LFPG,Paris/ Ch.de Gaulle,FR,49.0097,2.5478
9,TNF,LFPN,Toussus-Le-Noble,FR,48.7497,2.1111


# Getting the flights data using an API

In [19]:
from datetime import datetime, timedelta
def get_arrival_date(response, i):
    """Return the date part of arrival ``i``'s local arrival timestamp.

    The timestamp is ``movement['actualTimeLocal']`` when that key exists and
    ``movement['scheduledTimeLocal']`` otherwise; the date is everything
    before the first space. A missing ``arrivals``/``movement`` key or an
    out-of-range ``i`` raises ``KeyError``/``IndexError``.
    """
    movement = response['arrivals'][i]['movement']
    if 'actualTimeLocal' in movement:
        timestamp = movement['actualTimeLocal']
    else:
        timestamp = movement['scheduledTimeLocal']
    date_part, _, _ = timestamp.partition(' ')
    return date_part

def get_actual_arr_local_time(response, i):
    """Return the local clock time of arrival ``i`` without its UTC offset.

    The timestamp is ``movement['actualTimeLocal']`` when that key exists and
    ``movement['scheduledTimeLocal']`` otherwise. Timestamps look like
    ``'2023-08-14 11:05+02:00'``; the part after the space is returned with
    the trailing offset removed for both positive and negative offsets
    (previously a negative offset such as ``-04:00`` was left attached).

    A missing ``arrivals``/``movement`` key or an out-of-range ``i`` raises
    ``KeyError``/``IndexError``.
    """
    movement = response['arrivals'][i]['movement']
    if 'actualTimeLocal' in movement:
        timestamp = movement['actualTimeLocal']
    else:
        timestamp = movement['scheduledTimeLocal']

    clock = timestamp.split(' ')[1]
    # The date has already been split off, so any '+' or '-' left in the
    # string starts the UTC offset.
    for sign in ('+', '-'):
        clock = clock.split(sign)[0]
    return clock

def get_delay_time(response, i):
    """Return the UTC-offset digits of arrival ``i``'s local timestamp.

    The timestamp is ``movement['actualTimeLocal']`` when that key exists and
    ``movement['scheduledTimeLocal']`` otherwise. For
    ``'2023-08-14 11:05+02:00'`` the result is ``'02:00'``; for a negative
    offset such as ``-04:00`` it is ``'04:00'`` (previously this raised
    ``IndexError``). ``'00:00'`` is returned when the timestamp carries no
    offset at all.

    NOTE(review): despite the name, this value is the timestamp's UTC offset,
    not a delay. An actual delay would be the difference between the actual
    and the scheduled arrival time. The return format is kept unchanged
    because the existing caller parses it with ``"%M:%S"``.
    """
    movement = response['arrivals'][i]['movement']
    if 'actualTimeLocal' in movement:
        timestamp = movement['actualTimeLocal']
    else:
        timestamp = movement['scheduledTimeLocal']

    clock = timestamp.split(' ')[1]
    # The date has already been split off, so the first '+' or '-' found
    # starts the UTC offset.
    for sign in ('+', '-'):
        if sign in clock:
            return clock.split(sign, 1)[1]
    return '00:00'
    
def _format_delay(delta):
    """Render a timedelta as an ``HH:MM:SS`` string, prefixed with ``-`` when negative."""
    total_seconds = int(delta.total_seconds())
    sign = '-' if total_seconds < 0 else ''
    hours, remainder = divmod(abs(total_seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{sign}{hours:02d}:{minutes:02d}:{seconds:02d}"


def flights_information(df):
    """Collect tomorrow's 11:00-23:00 arrivals for every airport in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per airport with at least the columns ``icaoCode`` and
        ``airport_id``. Rows whose ``icaoCode`` is missing or blank are
        skipped without calling the API.

    Returns
    -------
    pandas.DataFrame
        One row per arrival with the columns ``airport_id``,
        ``arrival_date`` (``YYYY-MM-DD`` string), ``flight_number``,
        ``airline`` (``None`` when the API omits it), ``flight_status``,
        ``scheduled_arr_local_time`` and ``actual_arr_local_time``
        (``datetime.time``), ``scheduled_arr_UTC_time`` (string) and
        ``delay_time`` (``HH:MM:SS`` string). The frame is empty, but still
        has these columns, when no arrival could be collected.

    Notes
    -----
    * "Tomorrow" is derived from this machine's clock (``datetime.now()``).
    * When an arrival has no ``actualTimeLocal`` the scheduled time is used
      as the actual time, so its delay is ``00:00:00``.
    * ``delay_time`` is actual minus scheduled local arrival time. The
      previous implementation reported the timestamp's UTC offset here.
    * Airports whose request does not return HTTP 200 are reported and
      skipped.
    """
    import os  # function-local so the block does not depend on a new top-level import

    columns = [
        'airport_id',
        'arrival_date',
        'flight_number',
        'airline',
        'flight_status',
        'scheduled_arr_local_time',
        'actual_arr_local_time',
        'scheduled_arr_UTC_time',
        'delay_time',
    ]
    flights_data = []

    # Loop-invariant request settings.
    tomorrow_date = (datetime.now() + timedelta(days=1)).strftime('%Y-%m-%d')
    querystring = {"withLeg":"false","direction":"Arrival","withCancelled":"true","withCodeshared":"true","withCargo":"false","withPrivate":"true","withLocation":"false"}
    # NOTE(review): the literal fallback keeps the notebook runnable, but a key
    # committed to source should be rotated and supplied via RAPIDAPI_KEY.
    headers = {
        "X-RapidAPI-Key": os.environ.get("RAPIDAPI_KEY", '0b7cffd425mshbf932b1b5f7e633p187a96jsna5aedb4ce276'),
        "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
    }

    for row_index, row in df.iterrows():

        icao = row['icaoCode']
        airport_id = row['airport_id']

        # Without an ICAO code the URL below cannot address an airport.
        if pd.isna(icao) or not str(icao).strip():
            continue

        url = f"https://aerodatabox.p.rapidapi.com/flights/airports/icao/{icao}/{tomorrow_date}T11:00/{tomorrow_date}T23:00"

        responses = requests.get(url, headers=headers, params=querystring, timeout=30)

        if responses.status_code != 200:
            print(f"Error - Status Code: {responses.status_code} at line{row_index}")
            print(f"Response Content: {responses.text}")
            print('Problem with status code')
            continue

        response = responses.json()

        for arrival in response.get('arrivals', []):
            movement = arrival['movement']

            # Timestamps look like 'YYYY-MM-DD HH:MM+HH:MM'; parsing them keeps
            # the offset, so both signs and differing offsets are handled.
            scheduled_local = pd.to_datetime(movement['scheduledTimeLocal'])
            actual_local = pd.to_datetime(movement.get('actualTimeLocal', movement['scheduledTimeLocal']))
            scheduled_utc = pd.to_datetime(movement['scheduledTimeUtc'])

            flights_data.append({
                'airport_id': airport_id,
                'arrival_date': actual_local.strftime('%Y-%m-%d'),
                'flight_number': arrival['number'],
                'airline': (arrival.get('airline') or {}).get('name'),
                'flight_status': arrival['status'],
                'scheduled_arr_local_time': scheduled_local.time(),
                'actual_arr_local_time': actual_local.time(),
                'scheduled_arr_UTC_time': str(scheduled_utc).split(' ')[1].split('+')[0],
                'delay_time': _format_delay(actual_local - scheduled_local),
            })

    # columns= keeps the schema stable even when no arrival was collected.
    return pd.DataFrame(flights_data, columns=columns)