# Flight delays and weather condition â€“ Data Exploration

**Purpose**
- Explore data for flight delays and weather condition use cases
- Validate assumptions before adding API endpoints
- Prototype logic for FastAPI services

**Author:** Rashed  
**Date:** 2025-24-12



The `autoreload` feature of Jupyter ensures that if an imported module (.py file) changes, the notebook will automatically import the new version.  
(However, objects that were defined with the old version of the class won't change.)

In [23]:
# Executing this cell does some magic
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import requests
import pandas as pd

from dotenv import load_dotenv
import os

load_dotenv()


API_URL_Delay = "https://airlabs.co/api/v9/delays"
API_KEY = os.getenv("API_KEY")

airports = ["KJFK","KLAX","KORD","KDEN","KSFO","KSEA","KATL","KDFW","KMSP"]

params = {
    "api_key": API_KEY,
    "delay": 60,
    "type": "departures",
    "dep_icao": ",".join(airports),
    "max": 100,
}

API_URL = f"{API_URL_Delay}"

response = requests.get(API_URL, params=params)

data = response.json()
data

{'request': {'lang': 'en',
  'currency': 'USD',
  'time': 13,
  'id': 'd839qysxweo',
  'server': 'a',
  'host': 'airlabs.co',
  'pid': 2929279,
  'key': {'id': 37729,
   'api_key': '2f1914c4-fa35-4b93-9894-de62a5d49aa7',
   'type': 'free',
   'expired': '2026-01-25T00:00:00.000Z',
   'registered': '2025-12-25T02:41:57.000Z',
   'upgraded': None,
   'limits_by_hour': 2500,
   'limits_by_minute': 250,
   'limits_by_month': 1000,
   'limits_total': 998},
  'params': {'delay': '60',
   'type': 'departures',
   'dep_icao': 'KJFK,KLAX,KORD,KDEN,KSFO,KSEA,KATL,KDFW,KMSP',
   'max': '100',
   'lang': 'en'},
  'version': 9,
  'method': 'delays',
  'client': {'ip': '2607:fb91:4b0c:a7e3:352c:2adb:7af2:fa5d',
   'geo': {'country_code': 'US',
    'country': 'United States',
    'continent': 'North America',
    'lat': 37.751,
    'lng': -97.822,
    'timezone': 'America/Chicago'},
   'connection': {},
   'device': {},
   'agent': {},
   'karma': {'is_blocked': False,
    'is_crawler': False,
    'i

In [5]:
df = pd.DataFrame(data['response'])

Now I will Collect Data from the flight delay API with different delays parameter and eggragate the data for analysis.

In [11]:
# delays = [0, 30, 45, 90, 120]
params = {
    "api_key": API_KEY,
    "delay": 120,
    "type": "departures",
    "dep_icao": "EGLL,LFPG,EDDF,EHAM,LEMD,LIRF",
    "max": 100,}

response2 = requests.get(API_URL, params=params)

data2 = response2.json()
data2

{'request': {'lang': 'en',
  'currency': 'USD',
  'time': 11,
  'id': 'lzv4xk20v0g',
  'server': 'b',
  'host': 'airlabs.co',
  'pid': 860155,
  'key': {'id': 37729,
   'api_key': '2f1914c4-fa35-4b93-9894-de62a5d49aa7',
   'type': 'free',
   'expired': '2026-01-25T00:00:00.000Z',
   'registered': '2025-12-25T02:41:57.000Z',
   'upgraded': None,
   'limits_by_hour': 2500,
   'limits_by_minute': 250,
   'limits_by_month': 1000,
   'limits_total': 1000},
  'params': {'delay': '120',
   'type': 'departures',
   'dep_icao': 'EGLL,LFPG,EDDF,EHAM,LEMD,LIRF',
   'max': '100',
   'lang': 'en'},
  'version': 9,
  'method': 'delays',
  'client': {'ip': '2607:fb91:4b0c:a7e3:cdbb:ed9c:7976:e9be',
   'geo': {'country_code': 'US',
    'country': 'United States',
    'continent': 'North America',
    'lat': 37.751,
    'lng': -97.822,
    'timezone': 'America/Chicago'},
   'connection': {},
   'device': {},
   'agent': {},
   'karma': {'is_blocked': False,
    'is_crawler': False,
    'is_bot': False,

In [15]:
df2 = pd.DataFrame(data2['response'])
df = pd.concat([df, df2], ignore_index=True)
df.count()

airline_iata         126
airline_icao         126
flight_iata          126
flight_icao          126
flight_number        126
dep_iata             126
dep_icao             126
dep_terminal          86
dep_gate              74
dep_time             126
dep_time_utc         126
dep_estimated        126
dep_estimated_utc    126
dep_actual            97
dep_actual_utc        97
arr_iata             126
arr_icao             126
arr_terminal          83
arr_gate              87
arr_baggage           74
arr_time             126
arr_time_utc         126
arr_estimated        117
arr_estimated_utc    117
arr_actual            30
arr_actual_utc        30
cs_airline_iata       78
cs_flight_number      78
cs_flight_iata        78
status               126
duration             126
delayed              126
dep_delayed          123
arr_delayed          117
aircraft_icao         22
arr_time_ts          126
dep_time_ts          126
arr_estimated_ts     117
dep_estimated_ts     126
arr_actual_ts         30


In [None]:
AIRPORTS = [
    # North America
    ["KJFK","KLAX","KORD","KDEN","KSFO","KSEA","KATL","KDFW","KMSP"],
    # Canada
    ["CYYZ","CYVR","CYUL"],
    # Europe
    ["EGLL","LFPG","EDDF","EHAM","EDDM","LSZH","LEMD","ENGM","LTFM"],
    # Middle East
    ["OMDB","OTHH","OERK","OEJN","OMAA","OKBK","OBBI"],
    # East & SE Asia
    ["RJTT","RJAA","RKSI","ZSPD","ZBAA","ZGGG","VHHH","WSSS","VTBS"],
    # Tropics
    ["KMIA","MMUN","RPLL","WIII"]
]


DELAYS = [30, 45, 60, 90, 120]
OUTPUT_CSV = "airlabs_delays_raw.csv"

API_URL_Delay = "https://airlabs.co/api/v9/delays"
API_URL_Flights = "https://airlabs.co/api/v9/flights"

for delay in DELAYS:
    for airport in AIRPORTS:
      params_delay = {
          "api_key": API_KEY,
          "delay": delay, # maximum delay in minutes
          "type": "departures",
          "dep_icao": ",".join(airport),
          "max": 100,
      }
      params_flights = {
          "api_key": API_KEY,
          "dep_icao": ",".join(airport),
      }


      # step 1: get delays  
      response = requests.get(API_URL_Delay, params=params_delay)
      data = response.json()
      df_delay = pd.DataFrame(data['response'])

      # step 2: get flights
      response_flights = requests.get(API_URL_Flights, params=params_flights)
      data_flights = response_flights.json()
      df_flights = pd.DataFrame(data_flights['response'])

      # step 3: build fligh id to join on


      df = pd.concat([df, df_delay], ignore_index=True)
      print(f"Collected {df_delay.shape[0]} records for delay {delay} at airports {', '.join(airport)}.")

Collected 100 records for delay 45 at KJFK
Collected 100 records for delay 45 at KLAX
Collected 100 records for delay 45 at KORD
Collected 100 records for delay 45 at EGLL
Collected 100 records for delay 45 at LFPG
Collected 100 records for delay 45 at EDDF
Collected 100 records for delay 45 at YSSY
Collected 100 records for delay 45 at YMML
Collected 100 records for delay 45 at YBBN
Collected 100 records for delay 45 at OMDB
Collected 100 records for delay 45 at OTHH
Collected 100 records for delay 45 at OERK
Collected 100 records for delay 45 at RJTT
Collected 100 records for delay 45 at RKSI
Collected 100 records for delay 45 at ZSPD
Collected 100 records for delay 60 at KJFK
Collected 100 records for delay 60 at KLAX
Collected 100 records for delay 60 at KORD
Collected 100 records for delay 60 at EGLL
Collected 100 records for delay 60 at LFPG
Collected 100 records for delay 60 at EDDF
Collected 100 records for delay 60 at YSSY
Collected 100 records for delay 60 at YMML
Collected 1

In [17]:
df.to_csv(OUTPUT_CSV, index=False)

Now let's explore the aggregated data.

In [2]:
import pandas as pd
OUTPUT_CSV = "airlabs_delays_raw.csv"
df = pd.read_csv(OUTPUT_CSV)
df.head()

Unnamed: 0,airline_iata,airline_icao,flight_iata,flight_icao,flight_number,dep_iata,dep_icao,dep_terminal,dep_gate,dep_time,...,delayed,dep_delayed,arr_delayed,aircraft_icao,arr_time_ts,dep_time_ts,arr_estimated_ts,dep_estimated_ts,arr_actual_ts,dep_actual_ts
0,JQ,JST,JQ8993,JST8993,8993,BNE,YBBN,,,2025-12-25 10:00,...,68,63.0,68.0,,1766626500,1766620800,1766631000.0,1766624580,1766631000.0,1766625000.0
1,AA,AAL,AA1208,AAL1208,1208,ONT,KONT,4.0,408,2025-12-24 11:54,...,310,310.0,296.0,,1766609100,1766606040,1766627000.0,1766624640,1766627000.0,1766625000.0
2,F8,FLE,F8759,FLE759,759,YEG,CYEG,,8,2025-12-24 17:50,...,65,14.0,65.0,B38M,1766628900,1766623800,1766633000.0,1766624640,,1766625000.0
3,KL,KLM,KL9604,KLM9604,9604,YYC,CYYC,,A1,2025-12-24 16:00,...,157,124.0,157.0,,1766623500,1766617200,1766633000.0,1766624640,,1766625000.0
4,U6,SVR,U62448,SVR2448,2448,OSS,UCFO,,,2025-12-25 03:30,...,204,214.0,204.0,A321,1766628600,1766611800,1766641000.0,1766624640,,1766625000.0


In [14]:
hours = df.describe()['delayed'] / 60
df_exp = pd.concat([df.describe(), pd.DataFrame(hours)], axis=1)
df_exp.head()

Unnamed: 0,flight_number,cs_flight_number,duration,delayed,dep_delayed,arr_delayed,arr_time_ts,dep_time_ts,arr_estimated_ts,dep_estimated_ts,arr_actual_ts,dep_actual_ts,delayed.1
count,6111.0,4158.0,6066.0,6111.0,6108.0,5517.0,6111.0,6111.0,5532.0,6111.0,750.0,3757.0,101.85
mean,4383.61872,1862.713083,265.957798,240.368352,243.131631,234.995106,1766640000.0,1766623000.0,1766653000.0,1766638000.0,1766634000.0,1766631000.0,4.006139
std,3001.914933,2125.51932,202.40067,378.59134,379.52138,390.767028,15577.19,11915.08,26209.49,17765.39,1094.45,2137.851,6.309856
min,13.0,13.0,47.0,48.0,14.0,48.0,1766536000.0,1766492000.0,1766627000.0,1766625000.0,1766621000.0,1766625000.0,0.8
25%,2030.0,247.0,101.0,99.0,92.0,94.0,1766629000.0,1766620000.0,1766638000.0,1766630000.0,1766634000.0,1766629000.0,1.65


We can See that the average delay time is around 4 hours, so we need to reduce the bias by adding data with no delay.

In [1]:
import requests
import pandas as pd
from dotenv import load_dotenv
import os

load_dotenv()


API_URL = os.getenv("API_URL")
API_KEY = os.getenv("API_KEY")

params = {
    "api_key": API_KEY,
}

API_URL = f"{API_URL}"

response = requests.get(API_URL, params=params)

data = response.json()
data

{'request': {'lang': 'en',
  'currency': 'USD',
  'time': 23,
  'id': 'l2dxr51xyk0',
  'server': 'a',
  'host': 'airlabs.co',
  'pid': 186134,
  'key': {'id': 37729,
   'api_key': '2f1914c4-fa35-4b93-9894-de62a5d49aa7',
   'type': 'free',
   'expired': '2026-01-25T00:00:00.000Z',
   'registered': '2025-12-25T02:41:57.000Z',
   'upgraded': None,
   'limits_by_hour': 2500,
   'limits_by_minute': 250,
   'limits_by_month': 1000,
   'limits_total': 1000},
  'params': {'lang': 'en'},
  'version': 9,
  'method': 'flights',
  'client': {'ip': '2607:fb91:4b0c:a7e3:352c:2adb:7af2:fa5d',
   'geo': {'country_code': 'US',
    'country': 'United States',
    'continent': 'North America',
    'lat': 37.751,
    'lng': -97.822,
    'timezone': 'America/Chicago'},
   'connection': {},
   'device': {},
   'agent': {},
   'karma': {'is_blocked': False,
    'is_crawler': False,
    'is_bot': False,
    'is_friend': False,
    'is_regular': True}}},
 'response': [{'hex': '801623',
   'reg_number': 'VT-BXH