<a href="https://colab.research.google.com/github/shloak17107/airline-delay-prediction/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# On-time Performance data for each month of 2024
dataset_links = [
    "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_1.zip",
    "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_2.zip",
    "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_3.zip",
    "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_4.zip",
    "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_5.zip",
    "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_6.zip",
    "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_7.zip",
    "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_8.zip",
    "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_9.zip",
    "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_10.zip",
    "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_11.zip",
    "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_12.zip"
]

In [9]:
import aiohttp
import asyncio
import zipfile
import os
from tqdm.asyncio import tqdm
import aiofiles
import nest_asyncio

# Enable nested event loops
nest_asyncio.apply()

async def download_file(session, url, filename):
    async with session.get(url) as response:
        if response.status == 200:
            async with aiofiles.open(filename, mode='wb') as f:
                await f.write(await response.read())
            return True
    return False

async def process_file(session, link):
    filename = os.path.join('data', os.path.basename(link))
    csv_name = filename.replace('.zip', '.csv')
    
    if os.path.exists(csv_name):
        return f'Skipping {filename} - CSV already exists'
    
    try:
        # Download file
        success = await download_file(session, link, filename)
        if not success:
            return f'Failed to download {filename}'
        
        tqdm.write(f'Downloaded {filename}')
            
        # Unzip file
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall('data')
        
        # Remove zip file
        os.remove(filename)
        return f'Extracted {filename}'
        
    except Exception as e:
        return f'Error processing {filename}: {str(e)}'

async def main(links):
    os.makedirs('data', exist_ok=True)
    
    async with aiohttp.ClientSession() as session:
        tasks = [process_file(session, link) for link in links]
        results = []
        for result in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
            results.append(await result)
            tqdm.write(results[-1])

# Run the async code using asyncio.run()
asyncio.run(main(dataset_links))

  0%|          | 0/12 [00:20<?, ?it/s]

Downloaded data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_10.zip


  8%|▊         | 1/12 [00:21<03:51, 21.02s/it]

Extracted data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_10.zip
Downloaded data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_8.zip


 17%|█▋        | 2/12 [00:21<01:32,  9.22s/it]

Extracted data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_8.zip


 17%|█▋        | 2/12 [00:46<01:32,  9.22s/it]

Downloaded data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_6.zip


 25%|██▌       | 3/12 [00:46<02:27, 16.40s/it]

Extracted data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_6.zip


 25%|██▌       | 3/12 [00:58<02:27, 16.40s/it]

Downloaded data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_2.zip


 33%|███▎      | 4/12 [00:59<01:58, 14.85s/it]

Extracted data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_2.zip


 33%|███▎      | 4/12 [01:02<01:58, 14.85s/it]

Downloaded data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_11.zip


 42%|████▏     | 5/12 [01:03<01:16, 10.93s/it]

Extracted data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_11.zip


 42%|████▏     | 5/12 [01:08<01:16, 10.93s/it]

Downloaded data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_12.zip


 50%|█████     | 6/12 [01:09<00:55,  9.32s/it]

Extracted data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_12.zip
Downloaded data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_4.zip


 58%|█████▊    | 7/12 [01:10<00:33,  6.64s/it]

Extracted data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_4.zip


 58%|█████▊    | 7/12 [01:12<00:33,  6.64s/it]

Downloaded data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_5.zip


 67%|██████▋   | 8/12 [01:13<00:22,  5.53s/it]

Extracted data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_5.zip


 67%|██████▋   | 8/12 [01:19<00:22,  5.53s/it]

Downloaded data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_1.zip


 75%|███████▌  | 9/12 [01:21<00:18,  6.07s/it]

Extracted data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_1.zip


 75%|███████▌  | 9/12 [01:26<00:18,  6.07s/it]

Downloaded data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_9.zip


 83%|████████▎ | 10/12 [01:28<00:13,  6.51s/it]

Extracted data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_9.zip
Downloaded data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_3.zip


 92%|█████████▏| 11/12 [01:31<00:05,  5.54s/it]

Extracted data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_3.zip


 92%|█████████▏| 11/12 [01:39<00:05,  5.54s/it]

Downloaded data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_7.zip


100%|██████████| 12/12 [01:44<00:00,  8.69s/it]

Extracted data/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2024_7.zip





In [1]:
import pandas as pd
import glob

# Get all CSV files in the data directory
csv_files = glob.glob('data/*.csv')

# Read and combine all CSV files
df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# Print the size and first few rows
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows of the dataset:")
print(df.head())

  df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
  df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
  df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
  df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
  df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
  df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.