In [None]:
from pathlib import Path
import pandas as pd

def getFile(fileName, data_dir="../data/processed"):
    """Read one CSV file into a pandas DataFrame.

    Parameters
    ----------
    fileName : str
        Name of the CSV file inside ``data_dir``,
        e.g. ``"extract_airports.csv"``.
    data_dir : str or os.PathLike, optional
        Directory that holds the file. Defaults to ``"../data/processed"``,
        which is resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when the file does not exist.
    """
    # Join with pathlib rather than manual "/" concatenation so the separator
    # is handled consistently and callers can point at another directory.
    data_path = Path(data_dir) / fileName
    return pd.read_csv(data_path)

# Load both extracted CSVs; the files are read in the order listed.
airports_df, delay_df = (
    getFile(csv_name)
    for csv_name in ("extract_airports.csv", "extract_delay.csv")
)

In [None]:
# Remove fully duplicated rows from each table.
# keep="first" is the pandas default, spelled out here: the first occurrence
# of each repeated row is retained and later copies are discarded.
airports_df = airports_df.drop_duplicates(keep="first")
delay_df = delay_df.drop_duplicates(keep="first")

In [None]:
# Handle missing/null values
# Results are reassigned instead of using inplace=True, so the frames are not
# mutated in place and each statement reads as "input -> new output".
# NOTE(review): the keys below are lowercase, but column names are only
# normalized in the following cell. Confirm the CSV headers are already
# lowercase, otherwise these per-column fills match nothing.
airport_fill_values = {
    'name': 'Unknown',
    'city': 'Unknown',
    'iata': 'UNK',
    'lat': 0,
    'lon': 0,
    'alt': 0,
}
airports_df = airports_df.fillna(airport_fill_values)

# Every remaining null in the delay table, in any column, becomes 0.
delay_df = delay_df.fillna(0)

In [None]:
# Normalize the header of both tables: lower-case every column name and
# replace spaces with underscores.
for frame in (airports_df, delay_df):
    frame.columns = frame.columns.str.lower().str.replace(' ', '_')

In [None]:
# Cast columns to explicit dtypes.

# Airports: integer id and altitude, text fields as pandas "string",
# coordinates as floats.
airport_dtypes = {
    'id': 'int64',
    'name': 'string',
    'city': 'string',
    'iata': 'string',
    'lat': 'float64',
    'lon': 'float64',
    'alt': 'int64',
}
airports_df = airports_df.astype(airport_dtypes)

# Delay table: the period and identifier columns.
delay_key_dtypes = {
    'year': 'int64',
    'month': 'int64',
    'carrier': 'string',
    'carrier_name': 'string',
    'airport': 'string',
    'airport_name': 'string',
}
delay_df = delay_df.astype(delay_key_dtypes)

# Delay table: every measurement column is stored as float64.
numeric_cols = [
    'arr_flights',
    'arr_del15',
    'carrier_ct',
    'weather_ct',
    'nas_ct',
    'security_ct',
    'late_aircraft_ct',
    'arr_cancelled',
    'arr_diverted',
    'arr_delay',
    'carrier_delay',
    'weather_delay',
    'nas_delay',
    'security_delay',
    'late_aircraft_delay',
]
delay_df[numeric_cols] = delay_df[numeric_cols].astype('float64')

In [None]:
# Save transformed data
# `__file__` is not defined inside a notebook kernel, so the output folder is
# given as the same relative directory the input CSVs are read from.
output_path = Path("../data/processed")
# parents=True creates any missing intermediate folders as well;
# exist_ok=True makes re-running the cell safe.
output_path.mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame index out of the written files.
airports_df.to_csv(output_path / "transformed_airports.csv", index=False)
delay_df.to_csv(output_path / "transformed_delay.csv", index=False)

In [None]:
# Read the delay extract directly from disk.
data = pd.read_csv("../data/processed/extract_delay.csv")

# Boolean mask marking rows that repeat an earlier row; keep the unmarked ones.
duplicated_rows = data.duplicated()
clean_data = data.loc[~duplicated_rows]

# Lower-case the column names and replace spaces with underscores.
clean_data.columns = (
    clean_data.columns
    .str.lower()
    .str.replace(" ", "_")
)


# Columns that must be present for a row to be usable; rows missing any of
# them are dropped rather than filled.
CRITICAL_COLUMNS = ['year',
                    'month',
                    'carrier',
                    'carrier_name',
                    'airport',
                    'airport_name']
rows_before_drop = len(clean_data)
clean_data = clean_data.dropna(subset=CRITICAL_COLUMNS)
rows_dropped = rows_before_drop - len(clean_data)

rows_before_fill = len(clean_data)
# fillna never removes rows, so a before/after length difference is always 0.
# Count the rows that contain at least one null *before* filling instead;
# those are the rows the fill actually changes.
rows_modified = int(clean_data.isna().any(axis=1).sum())
clean_data = clean_data.fillna(0)

# The three column lists below were AI generated.
str_cols = ['carrier', 'carrier_name', 'airport', 'airport_name']
int_cols = [
    'year', 'month',
    'arr_flights', 'arr_del15', 'arr_cancelled', 'arr_diverted',
    'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
    'security_delay', 'late_aircraft_delay',
]
float_cols = ['carrier_ct', 'weather_ct', 'nas_ct', 'security_ct', 'late_aircraft_ct']

# Cast each group of columns to its target dtype: strings, then ints, then floats.
# NOTE(review): casting float data to int64 discards any fractional part.
# Confirm the columns in int_cols only ever hold whole numbers.
for target_dtype, cols in (
    ("string", str_cols),
    ("int64", int_cols),
    ("float64", float_cols),
):
    clean_data[cols] = clean_data[cols].astype(target_dtype)

# Display the cleaned frame as the cell output.
clean_data