In [1]:
import pandas as pd
import numpy as np
import glob
import os

# Define the feature engineering function
def feature_engineering_weather_delay(df):
    """
    Perform feature engineering focusing solely on Weather Delay.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: The DataFrame with additional features related to Weather Delay.
    """
    # Handle date-related features
    if 'FlightDate' in df.columns:
        df['FlightDate'] = pd.to_datetime(df['FlightDate'], errors='coerce')
        df['DayOfWeek'] = df['FlightDate'].dt.dayofweek  # Monday=0, Sunday=6
        df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)  # Weekend indicator

    # Add weather-related features
    if 'wind_speed_10m_max' in df.columns and 'direction_10m_dominant' in df.columns:
        df['WindComponent'] = df['wind_speed_10m_max'] * np.cos(np.radians(df['direction_10m_dominant']))

    if 'temperature_2m_max' in df.columns and 'temperature_2m_min' in df.columns:
        df['TempDiff'] = df['temperature_2m_max'] - df['temperature_2m_min']

    # Map weather codes to categories
    if 'weather_code' in df.columns:
        weather_mapping = {53: 'Rain', 3: 'Clear', 99: 'Snow'}  # Example mapping
        df['WeatherCategory'] = df['weather_code'].map(weather_mapping)
        df = pd.get_dummies(df, columns=['WeatherCategory'], drop_first=True)

    # Focus on Weather Delay
    if 'WeatherDelay' in df.columns:
        df['HasWeatherDelay'] = (df['WeatherDelay'] > 0).astype(int)  # Binary indicator for weather delays

    return df


# Process all files in the processed_data folder
data_folder = "processed_data/"
all_files = glob.glob(os.path.join(data_folder, "*.csv"))

# Initialize a list to store processed DataFrames
processed_dataframes = []

for file in all_files:
    print(f"Processing file: {file}")
    try:
        # Load the CSV file with low_memory=False to avoid mixed-type warnings
        df = pd.read_csv(file, low_memory=False)
        
        # Check and fix mixed data types in relevant columns
        for col in df.columns:
            if df[col].dtype == "object":
                try:
                    df[col] = pd.to_numeric(df[col], errors="coerce")  # Coerce to numeric if possible
                except Exception as e:
                    print(f"Could not convert column '{col}' in file '{file}' to numeric: {e}")

        # Apply weather-focused feature engineering
        df = feature_engineering_weather_delay(df)
        
        # Append the processed DataFrame
        processed_dataframes.append(df)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Concatenate all processed DataFrames
if processed_dataframes:
    # Check for common columns
    common_columns = set.intersection(*(set(df.columns) for df in processed_dataframes))
    common_columns = list(common_columns)  # Convert the set to a list for indexing
    processed_dataframes = [df[common_columns] for df in processed_dataframes]

    # Ensure there are no duplicate columns
    for idx, df in enumerate(processed_dataframes):
        if df.columns.duplicated().any():
            print(f"Duplicate columns found in DataFrame {idx}: {df.columns[df.columns.duplicated()].tolist()}")
            processed_dataframes[idx] = df.loc[:, ~df.columns.duplicated()]

    # Concatenate the DataFrames
    final_df = pd.concat(processed_dataframes, ignore_index=True)
    print("Final dataset shape:", final_df.shape)

    # Ensure the directory exists
    output_dir = "CS506-Final-Project"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Save the final processed dataset
    try:
        final_df.to_csv(f"{output_dir}/weather_engineered_data.csv", index=False)
        print(f"All files processed and saved to '{output_dir}/weather_engineered_data.csv'")
    except Exception as e:
        print(f"Error saving final dataset: {e}")
else:
    print("No files were successfully processed.")

Processing file: processed_data/weather_engineered_data.csv


  df = pd.read_csv(file)


Processing file: processed_data/February.csv
Processing file: processed_data/January.csv
Processing file: processed_data/June.csv
Processing file: processed_data/April.csv
Processing file: processed_data/July.csv
Processing file: processed_data/March.csv
Processing file: processed_data/November.csv
Processing file: processed_data/August.csv
Processing file: processed_data/May.csv
Processing file: processed_data/December.csv
Processing file: processed_data/September.csv
Processing file: processed_data/October.csv
Final dataset shape: (3254193, 25)
All files processed and saved to 'CS506-Final-Project/weather_engineered_data.csv'
