In [1]:
import pandas as pd
import glob

# Combine all monthly flight CSV files under data/ into one cleaned CSV.
#
# Steps: load each file, parse FlightDate, concatenate, fill missing delay
# values with 0, label missing cancellation codes, write processed_data.csv.

# Path to the folder containing all monthly CSV files (update as needed).
# glob.glob returns matches in a filesystem-dependent order, so the paths are
# sorted (lexicographically) to make the row order of the output reproducible.
file_paths = sorted(glob.glob("data/*.csv"))

# Fail early with an actionable message. Without this check pd.concat raises
# the opaque ValueError("No objects to concatenate"); the exception type is
# kept the same on purpose.
if not file_paths:
    raise ValueError("No CSV files matched 'data/*.csv'; nothing to process")

# One dataframe per monthly file, in sorted path order
dataframes = []

for file_path in file_paths:
    df = pd.read_csv(file_path)

    # Keep a handle on the raw column so values lost to coercion can be counted
    raw_dates = df['FlightDate']
    df['FlightDate'] = pd.to_datetime(raw_dates, errors='coerce', format='%m/%d/%y')

    # errors='coerce' silently turns values that do not match the format into
    # NaT. Report how many were lost (beyond those already missing) so a
    # format mismatch does not go unnoticed; the data itself is unchanged.
    unparsed = int(df['FlightDate'].isna().sum() - raw_dates.isna().sum())
    if unparsed:
        print(
            f"Warning: {unparsed} FlightDate value(s) in '{file_path}' "
            "did not match '%m/%d/%y' and were set to NaT"
        )

    dataframes.append(df)

# Concatenate all dataframes into a single dataframe with a fresh 0..n-1 index
combined_df = pd.concat(dataframes, ignore_index=True)

# Fill missing values in delay columns with zeros
delay_cols = ['DepDelayMinutes', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']
combined_df[delay_cols] = combined_df[delay_cols].fillna(0)

# Fill missing values in the CancellationCode column with a placeholder
combined_df['CancellationCode'] = combined_df['CancellationCode'].fillna("NotCancelled")

# Save the processed data to a CSV file (row index omitted)
combined_df.to_csv("processed_data.csv", index=False)

print("Processed data has been saved to 'processed_data.csv'")


Processed data has been saved to 'processed_data.csv'
