In [None]:
import os
# JAVA_TOOL_OPTIONS is read by the JVM at startup, so this must be set before any
# JVM is launched from this process.
# NOTE(review): presumably required so Spark/Hadoop can run on a recent JDK where
# the security manager is disallowed by default -- confirm against the JDK in use.
os.environ['JAVA_TOOL_OPTIONS'] = '-Djava.security.manager=allow'

In [None]:
# =================== 1. Setup Spark and Import Libraries ===================
from pyspark.sql import SparkSession
from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor, LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor, LinearRegression, GBTRegressor, GeneralizedLinearRegression, IsotonicRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
import argparse
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

import traceback
import pickle
import shutil

from pyspark import keyword_only
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml import Pipeline, Transformer, Estimator, PipelineModel
from pyspark.ml.feature import *
from pyspark.sql import functions as F
import json
from pyspark.sql.types import IntegerType, DoubleType, FloatType, StructField, Row
from pyspark.sql.functions import col, sum
from math import pi, cos, sin
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.context import SparkContext as sc
import os

# Initialize (or reuse) the Spark session used by the cells below.
# getOrCreate() returns the already-active session when one exists, so this cell
# is safe to re-run and also works in kernels that pre-create a session.
spark = SparkSession.builder.appName("MachineLearningProject").getOrCreate()


# Name of the label column used throughout the notebook.
TARGET_COLUMN = "ArrDelay"
# Path to Parquet file
FLIGHT_PARQUET_PATH = './data/flights.parquet'
PLANES_PARQUET_PATH = './data/planes.parquet'
PROCESSING_DIR = "data/processing/"
# Path to schema file
PLANE_SCHEMA_PATH = './data/plane-schema.json'
FLIGHT_SCHEMA_PATH = './data/flight-schema.json'
# Load paths
FLIGHT_RAW_PATH = './data/*.csv.bz2'
PLANE_RAW_PATH = './data/plane-data.csv'
# Result paths
PROCESSED_DIR = './data/processed/'
PROCESSED_TRAIN_PARQUET = os.path.join(PROCESSED_DIR, "train.parquet")
PROCESSED_TEST_PARQUET = os.path.join(PROCESSED_DIR, "test.parquet")
PROCESSED_SCHEMA = os.path.join(PROCESSED_DIR, "schema.json")

# **SECTION 1 (EDA)**

## **Load Data**

In [None]:
# Inputs for the EDA section: a single flights CSV and the plane metadata CSV.
csv_path = './data/2008.csv'
plane_data_path = './data/plane-data.csv'

In [None]:
def load_csv(spark, df_path, planes_data_path) -> DataFrame:
    """
    Read the flights and plane CSV files and return their inner join on TailNum.

    Parameters:
    spark: SparkSession object
    df_path: str - Path to the flights CSV file
    planes_data_path: str - Path to the plane metadata CSV file

    Returns:
    DataFrame: flights without the forbidden columns, joined with the plane
               metadata (plane columns renamed to CamelCase names)
    """
    # Columns removed from the flights data before any analysis.
    forbidden_cols = (
        "ArrTime", "ActualElapsedTime", "AirTime", "TaxiIn", "Diverted",
        "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay",
        "LateAircraftDelay",
    )
    # Plane metadata column -> name used in the rest of the notebook.
    plane_column_names = {
        "tailnum": "TailNum",
        "year": "PlaneIssueYear",
        "engine_type": "EngineType",
        "aircraft_type": "AircraftType",
        "model": "Model",
        "manufacturer": "Manufacturer",
    }

    flights = spark.read.csv(df_path, header=True, inferSchema=True).drop(*forbidden_cols)

    planes = spark.read.csv(planes_data_path, header=True, inferSchema=True)
    for old_name, new_name in plane_column_names.items():
        planes = planes.withColumnRenamed(old_name, new_name)

    # Inner join: flights whose TailNum has no plane record are discarded.
    return flights.join(planes, on="TailNum", how="inner")

In [None]:
def organize_data(df):
    """
    Cast the numeric columns to integers, drop rows without a target and
    convert the hhmm clock columns to minutes since midnight.

    Parameters:
    df: DataFrame - joined flights/planes data as returned by load_csv

    Returns:
    DataFrame: input data with integer-typed quantitative columns, no null
               ArrDelay rows, and DepTime/CRSDepTime/CRSArrTime replaced by
               '<name>_minutes' integer columns
    """
    hhmm_columns = ['DepTime', 'CRSDepTime', 'CRSArrTime']
    integer_columns = ['CRSElapsedTime', 'DepDelay', 'Distance', 'TaxiOut', 'PlaneIssueYear']
    target_column = "ArrDelay"

    for column in integer_columns + [target_column]:
        df = df.withColumn(column, F.col(column).cast(IntegerType()))
    df = df.dropna(subset=[target_column])

    for column in hhmm_columns:
        # The values are not zero-padded (e.g. "754" means 07:54), so slicing
        # fixed character positions mis-parses anything shorter than 4 digits.
        # Integer arithmetic handles every width; non-numeric values become null.
        hhmm = F.col(column).cast("int")
        df = df.withColumn(
            column + "_minutes",
            (F.floor(hhmm / 100) * 60 + hhmm % 100).cast("int")
        )
    return df.drop(*hhmm_columns)

In [None]:
# Build the EDA DataFrame: join flights with plane metadata, then cast/derive columns.
# NOTE: `df` is rebound by the second line; re-run the whole cell, not just that line,
# because organize_data drops the hhmm columns it reads.
df = load_csv(spark, csv_path, plane_data_path)
df = organize_data(df)
df.printSchema()

In [None]:
# Columns treated as numeric in the EDA (null counts, summary statistics,
# proportions and the correlation matrix). The '*_minutes' columns are the ones
# created by organize_data.
numeric_features = [
    'Month',
    'DayofMonth',
    'DayOfWeek',
    'Year',
    'PlaneIssueYear',
    'DepTime_minutes',
    'CRSDepTime_minutes',
    'CRSArrTime_minutes',
    'CRSElapsedTime',
    'DepDelay',
    'Distance',
    'TaxiOut'
]

In [None]:
# Columns treated as categorical in the EDA (null counts and proportions).
# NOTE: the target "ArrDelay" is listed here as well, although organize_data
# casts it to an integer.
categorical_features = [
    'UniqueCarrier',
    'FlightNum',
    'TailNum',
    'Origin',
    'Dest',
    'Cancelled',
    'CancellationCode',
    'EngineType',
    'AircraftType',
    'Manufacturer',
    'Model',
    "issue_date", "status",
    "type",
    "ArrDelay"
]

In [None]:
# Dataset dimensions; df.count() triggers a Spark job over the joined data.
print(f"Number of rows: {df.count()}")
print(f"Number of columns: {len(df.columns)}")

## **Missing Values**

In [None]:
def null_values(data, features_list):
    """
    Count missing values (null or NaN) per column, print them and return them.

    Parameters:
    data: DataFrame - data to inspect
    features_list: list - column names to check

    Returns:
    DataFrame: single-row DataFrame with one count column per requested feature
    """
    missing_counts = []
    for name in features_list:
        # A value is missing when it is null or NaN.
        is_missing = F.col(name).isNull() | F.isnan(F.col(name))
        missing_counts.append(F.count(F.when(is_missing, name)).alias(name))

    null_data = data.select(missing_counts)
    null_data.show()
    return null_data

In [None]:
def plot_null_percentages(df, null_counts, numeric):  # Convert to Pandas and compute percentage
    """
    Save a horizontal bar chart with the percentage of missing values per feature.

    Parameters:
    df: DataFrame - data the counts were computed on (used for the row total)
    null_counts: DataFrame - single-row DataFrame of per-column missing counts,
                 as returned by null_values
    numeric: bool - True for the numerical feature group, False for categorical;
             selects the bar color, labels and output folder

    The figure is written to
    output/<group>/img/null_values_percentage_<group>.png; nothing is returned.
    """
    if numeric:
        group, color = 'Numerical', 'skyblue'
    else:
        group, color = 'Categorical', 'lightcoral'

    total_rows = df.count()
    null_pct = null_counts.toPandas().T  # Transpose: one row per feature
    null_pct.columns = ["NullCount"]
    null_pct["Percentage"] = (null_pct["NullCount"] / total_rows) * 100
    null_pct = null_pct.sort_values("Percentage", ascending=False)

    out_dir = f"output/{group.lower()}/img"
    os.makedirs(out_dir, exist_ok=True)

    # Draw on a dedicated figure so leftover pyplot state cannot leak into the chart.
    fig, ax = plt.subplots()
    null_pct["Percentage"].plot(kind="barh", color=color, ax=ax)
    ax.set_xlabel("Percentage of Null Values (%)")
    ax.set_ylabel(f"{group} Features")
    ax.set_title(f"Percentage of Null Values by {group} Features")
    fig.savefig(f"{out_dir}/null_values_percentage_{group.lower()}.png", dpi=300, bbox_inches="tight")
    plt.close(fig)

Numerical features:

In [None]:
# Missing-value counts for the numeric columns (also printed by null_values).
null_counts_numeric = null_values(df, numeric_features)

In [None]:
# Save the missing-value percentage chart for the numeric group.
plot_null_percentages(df, null_counts_numeric, True)

Categorical features:

In [None]:
# Missing-value counts for the categorical columns (also printed by null_values).
null_counts_categorical = null_values(df, categorical_features)

In [None]:
# Save the missing-value percentage chart for the categorical group.
plot_null_percentages(df, null_counts_categorical, False)

## **Statistics Summary**

In [None]:
def statistics_summary(data):
    """
    Build a pandas table of Spark summary statistics for the numeric features.

    Reads the column list from the notebook-level `numeric_features`.

    Parameters:
    data: DataFrame - data to summarize

    Returns:
    pandas.DataFrame: one row per feature and one column per statistic, with
                      values converted to numbers (unparseable values become NaN)
    """
    spark_summary = data.select(numeric_features).summary().toPandas()
    by_statistic = spark_summary.set_index("summary")
    # Spark returns the statistics as strings; coerce them, then put features on rows.
    return by_statistic.apply(pd.to_numeric, errors='coerce').T

In [None]:
# Summary statistics per numeric feature; the bare name renders the table.
summary = statistics_summary(df)
summary

## **Features Distribution**

In [None]:
def _plot_numeric_distribution(ax, values, feature):
    """Draw a 30-bin histogram of one numeric feature on `ax`."""
    sns.histplot(values, bins=30, ax=ax)
    ax.set_title(f"Distribution of {feature}")
    ax.set_xlabel(feature)
    ax.set_ylabel("Frequency")

    # These two features are shown on a fixed x-range.
    if feature == "DepDelay":
        ax.set_xlim(0, 500)
    elif feature == "TaxiOut":
        ax.set_xlim(0, 150)


def _plot_categorical_distribution(ax, data, feature, top_n):
    """Draw a bar chart of value counts for one feature on `ax`."""
    counts = data.groupBy(feature).count().orderBy("count", ascending=False)
    # "ArrDelay" keeps every value; all other features keep the top_n most frequent.
    if feature != "ArrDelay":
        counts = counts.limit(top_n)
    category_df = counts.toPandas()

    sns.barplot(data=category_df, x=feature, y="count", ax=ax)
    ax.set_title(f"Distribution of {feature}")
    ax.set_xlabel(feature)
    ax.set_ylabel("Count")
    ax.tick_params(axis="x", rotation=90)  # Rotate x-axis labels

    if feature == "ArrDelay":
        ax.set_xlim(50, 120)
        ax.tick_params(axis="x", labelsize=8)


def features_distributions(data, features_list, is_numeric_features=True, top_n=15):
    """
    Save a two-column grid of distribution plots, one subplot per feature.

    Parameters:
    data: DataFrame - data to plot
    features_list: list - column names to plot
    is_numeric_features: bool - True draws histograms (the selected columns are
                         collected to pandas first); False draws value-count bars
    top_n: int - number of most frequent categories kept per categorical feature

    The figure is written to
    output/<group>/img/features_distribution_<group>.png, where <group> is
    'numerical' or 'categorical'; the folder is created when missing.
    Nothing is returned.
    """
    if not features_list:
        print("No features to plot.")
        return

    numeric_mode = bool(is_numeric_features)
    group = "numerical" if numeric_mode else "categorical"
    out_dir = f"output/{group}/img"
    os.makedirs(out_dir, exist_ok=True)

    n_cols = 2
    n_rows = (len(features_list) + n_cols - 1) // n_cols  # ceil division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 5))
    axes = axes.flatten()

    # Histograms need the raw values locally; bar charts aggregate in Spark.
    numerical_df = data.select(features_list).toPandas() if numeric_mode else None

    for ax, feature in zip(axes, features_list):
        if numeric_mode:
            _plot_numeric_distribution(ax, numerical_df[feature], feature)
        else:
            _plot_categorical_distribution(ax, data, feature, top_n)

    # Remove any unused subplots (odd number of features).
    for ax in axes[len(features_list):]:
        fig.delaxes(ax)

    fig.tight_layout()
    fig.savefig(f"{out_dir}/features_distribution_{group}.png", dpi=300, bbox_inches="tight")
    plt.close(fig)

Numerical features:

In [None]:
# Histograms for a subset of the numeric columns.
features_distributions(df, ['DayofMonth','DayOfWeek','CRSDepTime_minutes','CRSArrTime_minutes','CRSElapsedTime','DepDelay','Distance','TaxiOut'], is_numeric_features=True)

Categorical features:

In [None]:
# Value-count bar charts for a subset of the categorical columns plus the target.
features_distributions(df, ['UniqueCarrier','Origin','Dest','EngineType','AircraftType','Manufacturer','Model','ArrDelay'], is_numeric_features=False)

## **Features Proportions**

In [None]:
def proportions(data, features_list, is_numeric_features = True):
    """
    Write, for every feature, the share of rows taken by each distinct value.

    Parameters:
    data: DataFrame - data to analyze
    features_list: list - column names to process
    is_numeric_features: bool - selects the output folder
                         ('numerical' when True, 'categorical' otherwise)

    For each feature a CSV (header included, existing output overwritten) is
    written to output/<group>/<feature>_proportions.csv with the value, its
    count and a 'Proportion' column holding the percentage rounded to two
    decimals. Nothing is returned.
    """
    group = 'numerical' if is_numeric_features else 'categorical'

    total_count = data.count()
    for feature in features_list:
        # Group the DataFrame that was passed in, not the notebook-level `df`.
        feature_proportions = data.groupBy(feature).count().withColumn(
            "Proportion", F.round((F.col("count") / total_count) * 100, 2)
        )
        feature_proportions.write.csv(f"output/{group}/{feature}_proportions.csv", header=True, mode="overwrite")

Numerical features:

In [None]:
# Per-value proportions for every numeric column, written as CSV under output/numerical/.
proportions(df, numeric_features, is_numeric_features=True)

Categorical features:

In [None]:
# Per-value proportions for every categorical column, written as CSV under output/categorical/.
proportions(df, categorical_features, is_numeric_features=False)

## **Factors Influencing Average Arrival Delays**

In [None]:
def avg_ArrDelay(data, features_list, top_n=20, output_kind=None):
    """
    Save a two-column grid of bar charts with the mean ArrDelay per feature value.

    Parameters:
    data: DataFrame - data to plot; must contain 'ArrDelay'
    features_list: list - column names to group by, one subplot each
    top_n: int - for 'Origin', 'Dest' and 'Model', only the top_n most frequent
           values are plotted
    output_kind: str or None - 'numerical' or 'categorical'; selects the output
                 folder and file suffix. When None it is derived from the last
                 feature: 'categorical' if that feature is one of
                 'Origin'/'Dest'/'Model', otherwise 'numerical'.

    The figure is written to output/<kind>/img/avg_ArrDelay_<kind>.png; the
    folder is created when missing. Nothing is returned.
    """
    if not features_list:
        print("No features to plot.")
        return

    # Features restricted to their most frequent values before averaging.
    top_n_features = ("Origin", "Dest", "Model")

    if output_kind is None:
        output_kind = 'categorical' if features_list[-1] in top_n_features else 'numerical'
    out_dir = f"output/{output_kind.lower()}/img"
    os.makedirs(out_dir, exist_ok=True)

    n_cols = 2
    n_rows = (len(features_list) + n_cols - 1) // n_cols  # ceil division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 6))
    axes = axes.flatten()

    for ax, col_name in zip(axes, features_list):
        if col_name in top_n_features:
            top_rows = (
                data.groupBy(col_name).count()
                .orderBy("count", ascending=False)
                .limit(top_n)
                .collect()
            )
            top_categories = [row[col_name] for row in top_rows]
            data_filtered = data.filter(F.col(col_name).isin(top_categories))
        else:
            data_filtered = data

        # Mean arrival delay per value; the aggregate column is named "avg(ArrDelay)".
        grouped_pandas = data_filtered.groupBy(col_name).agg({"ArrDelay": "mean"}).toPandas()

        sns.barplot(data=grouped_pandas, x=col_name, y="avg(ArrDelay)", ax=ax)
        ax.set_title(f"Average Arrival Delay by {col_name}", fontsize=12)
        ax.set_xlabel(col_name)
        ax.set_ylabel("Avg. Arrival Delay")
        ax.tick_params(axis="x", rotation=90)

        if col_name in ["DepTime_minutes", "DepDelay"]:
            # Too many bars to label: keep only the first and last tick.
            # seaborn orders numeric categories ascending, so those ticks are the
            # smallest and largest value -- not the first/last row of the
            # (unordered) groupBy result.
            xticks = ax.get_xticks()
            if len(xticks) > 0:
                ax.set_xticks([xticks[0], xticks[-1]])
                ax.set_xticklabels([grouped_pandas[col_name].min(), grouped_pandas[col_name].max()])

    # Remove any unused subplots (odd number of features).
    for ax in axes[len(features_list):]:
        fig.delaxes(ax)

    fig.tight_layout()
    fig.savefig(f"{out_dir}/avg_ArrDelay_{output_kind.lower()}.png", dpi=300, bbox_inches="tight")
    plt.close(fig)

Numerical features:

In [None]:
# Mean arrival delay per value of the time-related and numeric columns.
avg_ArrDelay(df, ['Month','DayofMonth','DayOfWeek','PlaneIssueYear','DepTime_minutes','DepDelay'])

Categorical features:

In [None]:
# Mean arrival delay per value of the categorical columns.
avg_ArrDelay(df, ['Origin','Dest','EngineType','AircraftType','Manufacturer','Model'])

## **Correlation Matrix**

In [None]:
def corr_matrix(data, features_list):
    """
    Compute the correlation matrix of the given columns and save it as a heatmap.

    Parameters:
    data: DataFrame - data to analyze
    features_list: list - numeric column names to correlate

    The heatmap is written to output/numerical/img/correlation_matrix.png; the
    folder is created when missing. Nothing is returned.
    """
    # Nulls in the selected columns are replaced by 0 before assembling.
    # NOTE(review): this treats a missing value as a real 0 and therefore
    # influences the coefficients -- confirm this is intended.
    data = data.fillna(0, subset=features_list)
    vector_col = "features_corr"

    vector_assembler = VectorAssembler(inputCols=features_list, outputCol=vector_col)
    df_vector = vector_assembler.transform(data)

    # Correlation.corr returns a one-row DataFrame holding the matrix.
    correlation_matrix = Correlation.corr(df_vector, vector_col).head()[0]
    correlation_df = pd.DataFrame(
        correlation_matrix.toArray(), index=features_list, columns=features_list
    )

    out_dir = "output/numerical/img"
    os.makedirs(out_dir, exist_ok=True)

    # Draw on a dedicated figure so leftover pyplot state cannot leak into the chart.
    fig, ax = plt.subplots()
    sns.heatmap(
        correlation_df,
        annot=True,              # Show the correlation values
        fmt=".1f",               # One decimal place
        cmap="coolwarm",         # Color map
        annot_kws={"size": 8},   # Reduce annotation font size
        ax=ax
    )
    ax.set_title("Correlation Matrix Heatmap")
    fig.savefig(f"{out_dir}/correlation_matrix.png", dpi=300, bbox_inches="tight")
    plt.close(fig)

In [None]:
# Correlation heatmap over all numeric columns.
corr_matrix(df, numeric_features)

# End of the EDA section

In [None]:
# =================== 1. Data Reading ===================
def load_data(spark, input_path, mode):
    """
    Load the input dataset, drop forbidden columns, and validate its structure.

    Args:
        spark (SparkSession): The active Spark session.
        input_path (str): Path to the input CSV file.
        mode (str): Mode of operation ("train" or "predict").

    Returns:
        DataFrame: Processed Spark DataFrame.

    Raises:
        ValueError: If the dataset has no rows, or if mode is "train" and the
            'ArrDelay' column is missing.
        Exception: Any error raised while reading is re-raised unchanged.

    Note:
        On any failure the error is printed and the Spark session is stopped
        before the exception propagates.
    """
    forbidden_columns = [
        "ArrTime", "ActualElapsedTime", "AirTime", "TaxiIn", "Diverted",
        "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay"
    ]

    try:
        # Read the dataset
        data = spark.read.csv(input_path, header=True, inferSchema=True)

        # Drop forbidden columns
        data = data.drop(*forbidden_columns)

        # Emptiness check: fetching a single row is enough, no need to count them all.
        if len(data.head(1)) == 0:
            raise ValueError("The dataset is empty.")

        # Validate the presence of the target variable for training
        if mode == "train" and "ArrDelay" not in data.columns:
            raise ValueError("The target variable 'ArrDelay' is missing.")

    except Exception as e:
        print(f"Error reading the dataset: {e}")
        spark.stop()
        raise

    return data


In [None]:
# =================== 2. Exploratory Data Analysis (EDA) ===================
def eda(data):
    """
    Print a statistical summary and the correlation matrix of the numeric columns.

    Args:
        data (DataFrame): Spark DataFrame to analyze.

    Returns:
        DataFrame: The input with a "features_vector" column assembled from its
        'int' and 'double' columns, or the input unchanged when it has none.
    """
    # Univariate analysis
    print("Statistical Summary:")
    data.describe().show()

    # Only columns whose Spark dtype is exactly 'int' or 'double' are correlated.
    numeric_cols = [name for name, dtype in data.dtypes if dtype in ('int', 'double')]

    if not numeric_cols:
        print("No numeric columns available for correlation analysis.")
        return data

    # Multivariate analysis: pack the numeric columns into one vector column.
    vector_col = "features_vector"
    data = VectorAssembler(inputCols=numeric_cols, outputCol=vector_col).transform(data)

    from pyspark.ml.stat import Correlation
    try:
        correlation_matrix = Correlation.corr(data, vector_col).head()[0]
        print(f"Correlation matrix:\n{correlation_matrix}")
    except Exception as e:
        # Best effort: a failed correlation is reported, not raised.
        print(f"Error calculating correlations: {e}")

    return data


In [None]:



def load_csv_save_parquet(spark, raw_path, parquet_path, schema_path) -> DataFrame:
    """
    Reads a CSV file using Spark, saves its schema to a file, and converts the data to Parquet format.

    Parameters:
    spark: SparkSession object
    raw_path: str - Path to the raw CSV file
    parquet_path: str - Path where the Parquet file will be saved
    schema_path: str - Path where the schema JSON will be saved

    Returns:
    DataFrame: Spark DataFrame loaded from the saved Parquet file
    """
    # Read csv
    df = spark.read.csv(
        raw_path,
        header=True,
        inferSchema=True
    )

    schema = df.schema

    # Write the schema JSON to a file
    with open(schema_path, 'w') as f:
        f.write(schema.json())

    # Save DataFrame as Parquet for future use.
    # repartition() returns a new DataFrame, so it has to be chained into the
    # write for the single-partition layout to take effect.
    df.repartition(1).write.parquet(parquet_path)

    # An explicit schema is supplied through DataFrameReader.schema().
    return spark.read.schema(schema).parquet(parquet_path)


def load_parquet(spark, parquet_path, schema_file) -> DataFrame:
    """
    Loads a Parquet file using a predefined schema from a JSON file.

    Parameters:
    spark: SparkSession object
    parquet_path: str - Path to the Parquet file
    schema_file: str - Path to the schema JSON file

    Returns:
    DataFrame: Spark DataFrame with the specified schema
    """
    # Imported locally so the function does not depend on a wildcard import
    # happening to expose StructType.
    from pyspark.sql.types import StructType

    # Deserialize the JSON document back into a StructType object
    with open(schema_file, 'r') as f:
        schema_from_file = StructType.fromJson(json.load(f))

    # An explicit schema is supplied through DataFrameReader.schema().
    return spark.read.schema(schema_from_file).parquet(parquet_path)


def load(spark, parquet_path, schema_file_path, wildcard_path) -> (DataFrame, DataFrame):
    """
    Loads data either from existing Parquet files or from raw CSV files if Parquet doesn't exist.

    The flights and planes datasets are checked independently, so a cache that
    contains only one of the two Parquet outputs is completed instead of failing.

    Parameters:
    spark: SparkSession object
    parquet_path: str - Path to the Parquet file
    schema_file_path: str - Path to the schema file
    wildcard_path: str - Path pattern for raw CSV files

    Returns:
    tuple(DataFrame, DataFrame): Tuple containing main DataFrame and planes DataFrame
    """
    def _cached_or_build(raw, parquet, schema):
        # Reuse the Parquet copy when present, otherwise build it from the CSV.
        if os.path.exists(parquet):
            return load_parquet(spark, parquet, schema)
        return load_csv_save_parquet(spark, raw, parquet, schema)

    df = _cached_or_build(wildcard_path, parquet_path, schema_file_path)
    df_planes = _cached_or_build(PLANE_RAW_PATH, PLANES_PARQUET_PATH, PLANE_SCHEMA_PATH)
    return df, df_planes


def custom_polar_time_encode(df):
    """
    Adds polar (cos/sin) encodings of the Month, DayofMonth and DayOfWeek columns.

    Each value is mapped to the angle 2*pi*value/period, with period 12 for
    Month, 7 for DayOfWeek and the number of days of the row's month for
    DayofMonth. February uses 29 days in leap years when a 'Year' column is
    present, and 28 otherwise.

    Parameters:
    df: DataFrame - Input Spark DataFrame with Month, DayofMonth and DayOfWeek

    Returns:
    DataFrame: input plus Month_cos, Month_sin, DayofMonth_cos, DayofMonth_sin,
               DayOfWeek_cos and DayOfWeek_sin; the original columns are kept.
               Null inputs yield null encodings.
    """
    print("Transforming Month, DayofMonth and DayofWeek to polar coordinates.")

    def polar_columns(value, period):
        # Native column expressions: no Python UDF round-trip, and null-safe.
        angle = (value / period) * F.lit(2 * pi)
        return F.cos(angle), F.sin(angle)

    month = F.col("Month")

    if "Year" in df.columns:
        year = F.col("Year")
        is_leap_year = ((year % 4 == 0) & (year % 100 != 0)) | (year % 400 == 0)
        february_days = F.when(is_leap_year, F.lit(29)).otherwise(F.lit(28))
    else:
        february_days = F.lit(28)

    days_in_month = (
        F.when(month == 2, february_days)
        .when(month.isin([4, 6, 9, 11]), F.lit(30))  # Months with 30 days
        .otherwise(F.lit(31))
    )

    encodings = [
        ("Month", polar_columns(month, F.lit(12))),
        ("DayofMonth", polar_columns(F.col("DayofMonth"), days_in_month)),
        ("DayOfWeek", polar_columns(F.col("DayOfWeek"), F.lit(7))),
    ]
    for name, (cos_col, sin_col) in encodings:
        df = df.withColumn(f"{name}_cos", cos_col).withColumn(f"{name}_sin", sin_col)

    return df


def static_preprocess(df, df_planes):
    """
    Performs static preprocessing on flight and plane data, including joining datasets,
    dropping forbidden columns, handling cancelled flights, and various feature transformations.

    Parameters:
    df: DataFrame - Main flight data
    df_planes: DataFrame - Plane data

    Returns:
    tuple: (processed DataFrame, quantitative features list, ordinal features list, nominal features list)
           The nominal list also contains 'Year' and 'PlaneIssueYear'.
    """
    # RENAME PLANE VARIABLES BEFORE THE JOIN.
    # Spark resolves column names case-insensitively by default, so renaming
    # "year" after the join would also match the flights' "Year" column.
    plane_renames = {
        "tailnum": "TailNum",
        "year": "PlaneIssueYear",
        "engine_type": "EngineType",
        "aircraft_type": "AircraftType",
        "model": "Model",
        "manufacturer": "Manufacturer",
    }
    for old_name, new_name in plane_renames.items():
        df_planes = df_planes.withColumnRenamed(old_name, new_name)
    df = df.join(df_planes, on="TailNum", how="inner")

    print("Schema before static preprocessing")
    df.printSchema()

    forbidden_cols = [
        "ArrTime",
        "ActualElapsedTime",
        "AirTime",
        "TaxiIn",
        "Diverted",
        "CarrierDelay",
        "WeatherDelay",
        "NASDelay",
        "SecurityDelay",
        "LateAircraftDelay"
    ]
    df = df.drop(*forbidden_cols)

    target_column = "ArrDelay"

    # List of Ordinal features
    cyclic_ordinal_time = [
        'Month',
        'DayofMonth',
        'DayOfWeek'
    ]
    non_cyclic_ordinal_time = ['Year', 'PlaneIssueYear']

    # List of Time features
    quant_time_features = [
        'DepTime',
        'CRSDepTime',
        'CRSArrTime'
    ]

    # List of Quantitative features
    quantitative_features = [
        'CRSElapsedTime',
        'DepDelay',
        'Distance',
        'TaxiOut'
    ]

    # List of Nominal features
    nominal_features = [
        'UniqueCarrier',
        'FlightNum',
        'TailNum',
        'Origin',
        'Dest',
        'Cancelled',
        'CancellationCode',
        'EngineType',
        'AircraftType',
        'Manufacturer',
        'Model',
        "issue_date", "status",
        "type"
    ]

    # WE ARE PREDICTING DELAY. REMOVE CANCELLED FLIGHTS
    df = df.filter("Cancelled != 1")

    # DROP NOMINALS WITH TOO MANY GROUPS OR THAT ARE USELESS
    useless_fea = ["TailNum", "FlightNum", "UniqueCarrier", "CancellationCode", "Cancelled", "issue_date", "status",
                   "type"]
    for fea in useless_fea:
        print(f"Discarding {fea}.")
        nominal_features.remove(fea)
    df = df.drop(*useless_fea)

    # CAST QUANTITATIVE COLUMNS TO NUMERIC, SOME ARE STRINGS
    for column in quantitative_features + [target_column]:
        print(f"Forcing {column} to be read as integer.")
        df = df.withColumn(column, F.col(column).cast(IntegerType()))
    df = df.dropna(subset=[target_column])
    null_count = df.filter(F.col(target_column).isNull()).count()
    print(f"Number of nulls in {target_column}: {null_count}")

    # CAST HHMM COLUMNS TO MINUTE QUANTITIES
    for column in quant_time_features:
        print(f"Casting {column} from hhmm to minutes (integer).")
        # The values are not zero-padded (e.g. "754" means 07:54), so slicing
        # fixed character positions mis-parses anything shorter than 4 digits.
        # Integer arithmetic handles every width; non-numeric values become null.
        hhmm = F.col(column).cast("int")
        df = df.withColumn(
            column + "_minutes",
            (F.floor(hhmm / 100) * 60 + hhmm % 100).cast("int")
        )
        quantitative_features.append(column + "_minutes")
    df = df.drop(*quant_time_features)

    df = custom_polar_time_encode(df)
    ordinal_features = []
    ordinal_features += [fea + "_sin" for fea in cyclic_ordinal_time]
    ordinal_features += [fea + "_cos" for fea in cyclic_ordinal_time]

    return df, quantitative_features, ordinal_features, nominal_features + non_cyclic_ordinal_time


def train_preprocess(df, nominal_features, ordinal_features, quantitative_features, dir_save_params,
                     cardinality_threshold, frequency_threshold, high_cardinality_strategy):
    """
    Performs training preprocessing including imputation, nominal encoding, and vectorization.
    Saves all preprocessing parameters to files for later use.

    Files written under dir_save_params (the directory is created when missing):
    - imputer_maps.json: per feature, the extra null markers and the fill value
      (approximate median for quantitative features, most frequent value otherwise)
    - <feature>_aggregated_encoder: fitted StringIndexer + OneHotEncoder pipeline
      (features whose kept values plus "Other" fit within cardinality_threshold)
    - <feature>_aggregated_encoder.csv: mean-target mapping (strategy "mean")
    - encode_types.json, non_aggregated.json: encoder type and kept values per feature
    - vectorizer: pipeline of VectorAssemblers producing the 'features' column

    Parameters:
    df: DataFrame - Input data
    nominal_features: list - Nominal feature columns
    ordinal_features: list - Ordinal feature columns
    quantitative_features: list - Quantitative feature columns
    dir_save_params: str - Directory to save preprocessing parameters
    cardinality_threshold: int - Threshold for high cardinality features
    frequency_threshold: float - Minimum frequency threshold for feature values
    high_cardinality_strategy: str - Strategy for handling high cardinality features
                               ("ignore" or "mean")

    Returns:
    None - the function only fits and saves parameters.

    Raises:
    NotImplementedError: for a high-cardinality feature with an unknown strategy.
    """
    spark = SparkSession.builder.getOrCreate()
    os.makedirs(dir_save_params, exist_ok=True)

    # -------------------------------- IMPUTER --------------------------------
    # This should be the column, the values considered nulls, and the value to be used to fill

    print("Analyzing medians")
    imputer_maps = {
        fea: {'extra_nulls': [],
              'fill_value': df.approxQuantile(col=fea, probabilities=[0.5], relativeError=0.025)[0]} for fea in
        quantitative_features
    }
    print("Current imputing dictionary: ")
    print(imputer_maps)
    print("Analyzing modes")
    imputer_maps.update({
        fea: {'extra_nulls': ['None'],
              'fill_value': df.groupby(fea).count().orderBy("count", ascending=False).first()[0]} for fea in
        ordinal_features + nominal_features
    })
    print("Filling dictionary: ")
    print(imputer_maps)

    with open(os.path.join(dir_save_params, 'imputer_maps.json'), 'w') as f:
        f.write(json.dumps(imputer_maps, indent=4))

    # ----------------------------- NOMINAL ENCODER ----------------------------

    # The row total does not depend on the feature: count once, not once per feature.
    total_count = df.count()

    def get_sufficiently_frequent(df, fea, frequency_threshold=frequency_threshold):
        # Values of `fea` whose relative frequency exceeds the threshold.
        frequencies = df.groupBy(fea).agg(
            (F.count("*") / total_count).alias(f"{fea}_frequency")
        )
        rows = frequencies.filter(F.col(f"{fea}_frequency") > frequency_threshold).select(fea).collect()
        return [row[fea] for row in rows]

    feature_to_sufficiently_frequent = {
        fea: get_sufficiently_frequent(df, fea) for fea in nominal_features
    }
    print("Sufficiently frequent values per feature: ")
    print(feature_to_sufficiently_frequent)

    # Map between feature and the encoder and new column name
    nominal_encode_type = {}
    new_nominal = []
    for fea in nominal_features:
        elems_to_preserve = feature_to_sufficiently_frequent[fea]
        aggregated_col = f"{fea}_aggregated"
        # Infrequent values are collapsed into the single level "Other".
        df = df.withColumn(
            aggregated_col,
            (F.when(~F.col(fea).isin(elems_to_preserve), F.lit("Other")).otherwise(F.col(fea)))
        )

        if len(elems_to_preserve) + 1 <= cardinality_threshold:
            print(f"Performing One-Hot-Encoding to feature {fea}")
            indexer = StringIndexer(inputCol=aggregated_col, outputCol=f"{fea}_index", handleInvalid='keep')
            encoder = OneHotEncoder(inputCol=f"{fea}_index", outputCol=f"{fea}_binary", handleInvalid='keep',
                                    dropLast=True)
            pipeline_model = Pipeline(stages=[indexer, encoder]).fit(df)
            nominal_encode_type[aggregated_col] = "binary"
            new_nominal.append(f"{fea}_binary")
            pipeline_model.save(os.path.join(dir_save_params, f'{fea}_aggregated_encoder'))
        elif high_cardinality_strategy == "ignore":
            print(f"Ignoring feature {fea}")
        elif high_cardinality_strategy == "mean":
            print(f"Performing Mean-Target-Encoding to feature {fea}")
            mapping_df = df.groupBy(aggregated_col).agg(F.avg(TARGET_COLUMN).alias(f"{fea}_mean_enc"))
            # collect() yields Row objects; compare against the plain values.
            known_levels = [row[0] for row in mapping_df.select(aggregated_col).collect()]
            if "Other" not in known_levels:
                # Levels unseen during training fall back to the global target mean.
                global_mean = float(df.agg(F.avg(TARGET_COLUMN)).first()[0])
                print(f"Adding 'Other' level for {fea} with global mean {global_mean}")
                new_row = Row(aggregated_col, f"{fea}_mean_enc")("Other", global_mean)
                # Convert the new row to a DataFrame with the same schema as mapping_df
                new_row_df = spark.createDataFrame([new_row], mapping_df.schema)
                mapping_df = mapping_df.union(new_row_df)
                mapping_df.show()
            mapping_df.write.csv(os.path.join(dir_save_params, f'{fea}_aggregated_encoder.csv'), header=True)
            new_nominal.append(f"{fea}_mean_enc")
            nominal_encode_type[aggregated_col] = "mean"
        else:
            raise NotImplementedError(f"Not implemented strategy {high_cardinality_strategy}")

    print("Feature to encoder types:")
    print(nominal_encode_type)
    print("Final nominal variables:")
    print(new_nominal)

    with open(os.path.join(dir_save_params, 'encode_types.json'), 'w') as f:
        f.write(json.dumps(nominal_encode_type, indent=4))

    with open(os.path.join(dir_save_params, 'non_aggregated.json'), 'w') as f:
        f.write(json.dumps(feature_to_sufficiently_frequent, indent=4))

    # -------------------------------- VECTORIZER --------------------------------
    # Quantitative feature assembly
    quant_assembler = VectorAssembler(
        inputCols=quantitative_features,
        outputCol="quant_features_vector"
    )

    # Assemble encoded nominal features
    nominal_assembler = VectorAssembler(
        inputCols=new_nominal,
        outputCol="nominal_features_vector"
    )

    ordinal_assembler = VectorAssembler(
        inputCols=ordinal_features,
        outputCol="ordinal_features_vector"
    )

    # Final feature vector
    final_assembler = VectorAssembler(
        inputCols=["quant_features_vector", "nominal_features_vector", "ordinal_features_vector"],
        outputCol="features"
    )

    # The encoded nominal columns are not added to `df` here; the assemblers are
    # only configured and saved for later use.
    pipeline = Pipeline(stages=[ordinal_assembler,
                                quant_assembler,
                                nominal_assembler,
                                final_assembler
                                ])
    vectorizer = pipeline.fit(df)
    vectorizer.save(os.path.join(dir_save_params, 'vectorizer'))
    # -------------------------------- VECTORIZER --------------------------------


def dynamic_preprocess(df, nominal_features, ordinal_features, quantitative_features, dir_load_params):
    """
    Applies the preprocessing transformations stored in ``dir_load_params`` to a DataFrame.

    The function performs three steps, each driven by files read from ``dir_load_params``:
    1. Imputation: values listed as 'extra_nulls' in imputer_maps.json are turned into nulls, then
       nulls are replaced with the stored 'fill_value' of each feature.
    2. Nominal Encoding: values not listed in non_aggregated.json are collapsed into "Other"
       (column ``<feature>_aggregated``); each entry of encode_types.json is then encoded with
       either a saved PipelineModel ('binary') or a saved CSV lookup table ('mean').
    3. Vectorization: the saved 'vectorizer' PipelineModel is applied.

    Parameters:
    df: DataFrame - Input Spark DataFrame to preprocess
    nominal_features: list - List of categorical feature column names
    ordinal_features: list - List of ordinal feature column names
    quantitative_features: list - List of numerical feature column names
    dir_load_params: str - Directory containing imputer_maps.json, encode_types.json,
                          non_aggregated.json, the per-feature encoders and the vectorizer

    Returns:
    DataFrame: The input DataFrame with imputed values, the added encoding columns and the
               columns produced by the saved vectorizer.

    Raises:
    NotImplementedError: if encode_types.json contains a type other than 'binary' or 'mean'.
    """

    # -------------------------------- IMPUTER --------------------------------
    with open(os.path.join(dir_load_params, 'imputer_maps.json'), 'r') as f:
        imputer_maps = json.load(f)

    for fea in quantitative_features + ordinal_features + nominal_features:
        # Placeholder values recorded at training time are treated as missing
        if len(imputer_maps[fea]['extra_nulls']) > 0:
            df = df.withColumn(fea, when(df[fea].isin(imputer_maps[fea]['extra_nulls']), lit(None)).otherwise(df[fea]))

        # NOTE: this count() runs one Spark job per feature
        if df.filter(col(fea).isNull()).count() > 0:
            value = imputer_maps[fea]['fill_value']
            print(f"Imputing {fea} with {value}")
            df = df.fillna(value, subset=fea)

    # ----------------------------- NOMINAL ENCODER ----------------------------
    with open(os.path.join(dir_load_params, 'encode_types.json'), 'r') as f:
        encode_types = json.load(f)
    with open(os.path.join(dir_load_params, 'non_aggregated.json'), 'r') as f:
        fea_2_non_aggregated = json.load(f)

    # Any value outside the stored list is collapsed into the "Other" bucket
    for fea, non_aggregated in fea_2_non_aggregated.items():
        df = df.withColumn(
            f"{fea}_aggregated",
            (F.when(~F.col(fea).isin(non_aggregated), lit("Other")).otherwise(F.col(fea)))
        )

    for fea, encode_type in encode_types.items():
        if encode_type == 'binary':
            encoder = PipelineModel.load(os.path.join(dir_load_params, f'{fea}_encoder'))
            df = encoder.transform(df)
        elif encode_type == 'mean':
            encoder = SparkSession.builder.getOrCreate().read.csv(os.path.join(dir_load_params, f'{fea}_encoder.csv'),
                                                                  header=True, inferSchema=True)
            df = df.join(encoder, on=fea, how='left')
            new_var = f"{fea}_mean_enc".replace("_aggregated", "")
            print(f"Using the following encoder for {fea}")
            # show() prints the table itself and returns None, so it must not be wrapped in print()
            encoder.show(10)

            # Categories missing from the lookup table get the value learned for "Other".
            # The table may have no "Other" row; in that case there is nothing to impute with.
            other_rows = encoder.filter(encoder[fea] == "Other").select(new_var).collect()
            if other_rows:
                imput_value = other_rows[0][0]
                print(f"Imputing unrecognized values in {fea} with 'Other'->{imput_value}")
                df = df.fillna(imput_value, subset=new_var)
            else:
                print(f"No 'Other' entry in the encoder for {fea}; unrecognized values in {new_var} are left as null")
        else:
            raise NotImplementedError(f"Not implemented encode type {encode_type}")

    # ------------------------------ VECTORIZER --------------------------------
    vectorizer = PipelineModel.load(os.path.join(dir_load_params, 'vectorizer'))
    df = vectorizer.transform(df)
    return df


def assure_existence_directory(directory_path):
    """
    Makes sure a path exists on disk, creating the directory (and any missing parents) when absent.

    Parameters:
    directory_path: str - Path to the directory to create

    Returns:
    None. Nothing is done when the path already exists.
    """
    already_present = os.path.exists(directory_path)
    if already_present:
        return
    os.makedirs(directory_path)


def preprocess_fit_and_transform(df, df_planes, dir_save_params="./data/"):
    """
    Runs static preprocessing, learns the dynamic preprocessing parameters if none are stored yet,
    and then applies the dynamic preprocessing.

    Parameters are learned only when ``dir_save_params`` is empty (or does not exist yet, in which
    case it is created); otherwise whatever is already stored there is reused.

    Parameters:
    df: DataFrame - Input flight data
    df_planes: DataFrame - Input plane data
    dir_save_params: str - Directory for saving/loading preprocessing parameters

    Returns:
    DataFrame: Fully preprocessed DataFrame
    """
    df, quantitative_features, ordinal_features, nominal_features = static_preprocess(df, df_planes)

    # A missing directory means nothing has been learned yet; create it so the emptiness check
    # below (and the subsequent saves) do not fail with FileNotFoundError.
    os.makedirs(dir_save_params, exist_ok=True)

    if len(os.listdir(dir_save_params)) == 0:
        print("TRAINING DYNAMIC PREPROCESSING PARAMETERS")
        train_preprocess(df, nominal_features, ordinal_features, quantitative_features, dir_save_params,
                         cardinality_threshold=10, frequency_threshold=0.02, high_cardinality_strategy="mean")
    else:
        print("DYNAMIC PREPROCESSING PARAMETERS FOUND. SKIPPING LEARNING.")
    df = dynamic_preprocess(df, nominal_features, ordinal_features, quantitative_features, dir_save_params)
    return df


def split_and_preprocess(df, df_planes, train_frac=0.8, dir_save_params="./data/"):
    """
    Randomly splits the flight data into train/test parts (fixed seed 42) and preprocesses both.

    The train part goes through preprocess_fit_and_transform (which may learn the dynamic
    parameters); the test part only goes through static_preprocess followed by
    dynamic_preprocess using the parameters stored in ``dir_save_params``.

    Parameters:
    df: DataFrame - Input flight data
    df_planes: DataFrame - Input plane data
    train_frac: float - Fraction of data to use for training
    dir_save_params: str - Directory for saving preprocessing parameters

    Returns:
    tuple(DataFrame, DataFrame): Preprocessed training and test DataFrames
    """
    split_weights = [train_frac, 1 - train_frac]
    train_part, test_part = df.randomSplit(split_weights, seed=42)

    # Train side: fit (if needed) and transform
    train_part = preprocess_fit_and_transform(train_part, df_planes, dir_save_params=dir_save_params)

    # Test side: transform only, with the stored parameters
    print("TESTING DATA PROCESSING")
    test_part, quant_cols, ord_cols, nom_cols = static_preprocess(test_part, df_planes)
    test_part = dynamic_preprocess(test_part, nom_cols, ord_cols, quant_cols, dir_save_params)

    return train_part, test_part


# THIS IS THE FUNCTION TO USE TO PREPROCESS VALIDATION DATA PASSED THROUGH CONSOLE <-------------------------------------
def validation_preprocess(df, dir_save_params="./data/"):
    """
    Preprocesses validation data with the parameters already stored in ``dir_save_params``.

    The plane data is loaded from PLANES_PARQUET_PATH / PLANE_SCHEMA_PATH, then the static and
    dynamic preprocessing steps are applied in that order. Nothing is learned here.

    Parameters:
    df: DataFrame - Input validation data
    dir_save_params: str - Directory containing preprocessing parameters

    Returns:
    DataFrame: Preprocessed validation DataFrame
    """
    session = SparkSession.builder.appName("MachineLearningProject").getOrCreate()
    planes = load_parquet(session, PLANES_PARQUET_PATH, PLANE_SCHEMA_PATH)

    prepared, quant_cols, ord_cols, nom_cols = static_preprocess(df, planes)
    return dynamic_preprocess(prepared, nom_cols, ord_cols, quant_cols, dir_save_params)


def load_split_and_preprocess(n_partitions=10, debug=False):
    """
    Complete pipeline for loading, splitting, and preprocessing data.

    If PROCESSED_TRAIN_PARQUET does not exist, the raw data is loaded, split, preprocessed and the
    resulting train/test DataFrames plus their schema are written to disk. Otherwise the previously
    written Parquet files are read back using the stored schema. In both cases the first row of
    each DataFrame is printed and the Spark session is stopped before returning.

    Parameters:
    n_partitions: int - Number of partitions for Spark DataFrame
    debug: bool - If True, uses a small fraction (1%) of the data on a single partition

    Returns:
    None
    """
    # Imported here so the function does not depend on a later cell having imported StructType
    from pyspark.sql.types import StructType

    spark = (SparkSession.builder.appName("MachineLearningProject") # Change this as needed
             .config("spark.executor.memory", "4g")
             .config("spark.driver.memory", "48g")
             .config("spark.memory.fraction", "0.8")
             .config("spark.memory.storageFraction", "0.3")
             .config("spark.driver.maxResultSize", "4g")
             .config("spark.sql.caseSensitive", "true")
             .config("spark.sql.debug.maxToStringFields", "200")
             # .config("spark.local.dir", "./temp/")
             .getOrCreate())

    try:
        if not os.path.exists(PROCESSED_TRAIN_PARQUET):
            df, df_planes = load(spark, FLIGHT_PARQUET_PATH, PLANE_SCHEMA_PATH, FLIGHT_RAW_PATH)
            df = df.repartition(n_partitions)

            if debug:
                fraction = 0.01  # Keep roughly 1% of the rows
                # NOTE(review): sampling WITH replacement can duplicate rows, and duplicates may end
                # up on both sides of the later train/test split - confirm this is acceptable.
                df = df.sample(withReplacement=True, fraction=fraction)
                df = df.repartition(1)

            # train_df, test_df = complete_preprocess(df, df_planes, train_frac=0.8)
            assure_existence_directory(PROCESSING_DIR)
            train_df, test_df = split_and_preprocess(df, df_planes, train_frac=0.8, dir_save_params=PROCESSING_DIR)
            print("Finished preprocessing")
            print(train_df.head())
            print(test_df.head())

            print(f"Saving schema to {PROCESSED_SCHEMA}")
            assure_existence_directory(PROCESSED_DIR)
            schema_json = train_df.schema.json()
            with open(PROCESSED_SCHEMA, 'w') as f:
                f.write(schema_json)
            # Train is written last: the cache check above looks at the train path, so a run that
            # fails before this point is redone on the next call (presumably intentional).
            test_df.write.mode('overwrite').parquet(PROCESSED_TEST_PARQUET)
            train_df.write.mode('overwrite').parquet(PROCESSED_TRAIN_PARQUET)
        else:
            with open(PROCESSED_SCHEMA, 'r') as f:
                schema_json = f.read()

            schema = StructType.fromJson(json.loads(schema_json))

            # The schema has to be set on the reader; passing schema= to parquet() does not apply it
            test_df = spark.read.schema(schema).parquet(PROCESSED_TEST_PARQUET)
            train_df = spark.read.schema(schema).parquet(PROCESSED_TRAIN_PARQUET)

            print(test_df.head())
            print(train_df.head())
    finally:
        # Release the session even when loading/preprocessing fails
        spark.stop()

# Runs the load/split/preprocess pipeline as soon as this cell executes; debug=False skips the
# small debug sample and uses the default of 10 partitions.
load_split_and_preprocess(debug=False)

In [None]:
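# NOTE: leftover notebook-merge marker ("REMOTE CELL DELETED") - this cell was flagged as deleted on the remote side; confirm whether it should be kept.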
# =================== 3. Data Processing ===================
def process_data(data, mode):
    """
    Process the dataset: handle missing values and perform feature engineering.

    Each engineered column is only added when its source column is present:
    - DepHour: integer part of DepTime / 100
    - DayOfWeek: day number derived from FlightDate, 1 = Monday ... 7 = Sunday
    - DistanceCategory: "Short" (< 500), "Medium" (500 to < 1500), otherwise "Long"

    Args:
        data (DataFrame): Spark DataFrame to process.
        mode (str): Mode of operation ("train" or "predict"). Only "train" changes behavior:
            it requires the 'ArrDelay' column and drops rows where it is null.

    Returns:
        DataFrame: Processed Spark DataFrame with new features added.

    Raises:
        ValueError: if mode is "train" and the 'ArrDelay' column is missing.
    """
    # Validate the target variable for training mode
    if mode == "train" and "ArrDelay" not in data.columns:
        raise ValueError("The target variable 'ArrDelay' is missing.")

    # Handle missing values
    if mode == "train":
        # Drop rows where the target variable is null
        data = data.dropna(subset=["ArrDelay"])

    # Example: Fill null values in specific columns with a default value
    # Replace 'column_name' with actual column names as needed
    # Uncomment this if specific columns require filling
    # data = data.fillna({"column_name": 0})

    # Feature engineering: Create time-based features
    if "DepTime" in data.columns:
        data = data.withColumn("DepHour", (col("DepTime") / 100).cast("int"))  # Extract hour from departure time

    if "FlightDate" in data.columns:
        # The week-based "u" pattern of date_format() is rejected by Spark 3's datetime formatter.
        # dayofweek() numbers days 1 = Sunday ... 7 = Saturday; the arithmetic below shifts that to
        # the 1 = Monday ... 7 = Sunday numbering the "u" pattern used to produce.
        data = data.withColumn(
            "DayOfWeek",
            (((F.dayofweek(col("FlightDate")) + 5) % 7) + 1).cast("int")
        )

    # Feature engineering: Create flight distance categories
    if "Distance" in data.columns:
        # NOTE: a null Distance matches neither comparison and therefore falls into "Long"
        data = data.withColumn(
            "DistanceCategory",
            when(col("Distance") < 500, "Short")  # Short flights
            .when((col("Distance") >= 500) & (col("Distance") < 1500), "Medium")  # Medium flights
            .otherwise("Long")  # Long flights
        )

    return data


In [None]:
# import json
# from pyspark import StorageLevel
# from pyspark.sql import SparkSession
# from pyspark.sql.types import StructType
# from pyspark.sql.functions import col, isnan
# from pyspark.storagelevel import StorageLevel
# 
# def test_train_and_predict(processed_schema_path, processed_train_path_parquet, processed_test_path_parquet, model_path):
#     """
#     Function to test the build_and_train_model and predict functions using train and test datasets.
# 
#     Args:
#         processed_schema_path (str): Path to the schema JSON file.
#         processed_train_path_parquet (str): Path to the train.parquet file.
#         processed_test_path_parquet (str): Path to the test.parquet file.
#         model_path (str): Path to save or load the trained model.
# 
#     Returns:
#         None
#     """
#     
# 
#     # Configuración de Spark
#     spark = SparkSession.builder \
#         .appName("Optimización con recursos limitados") \
#         .master("local[*]") \
#         .config("spark.executor.memory", "2g") \
#         .config("spark.driver.memory", "2g") \
#         .config("spark.sql.shuffle.partitions", "100") \
#         .config("spark.python.worker.timeout", "600") \
#         .config("spark.network.timeout", "600s") \
#         .getOrCreate()
# 
#         
#     pipeline = Pipeline(stages=[])
#         
#     try:
#         # Carga el esquema desde el archivo JSON
#         print("Loading schema from JSON...")
#         with open(processed_schema_path, 'r') as f:
#             schema_json = f.read()
# 
#         schema = StructType.fromJson(json.loads(schema_json))
# 
#         # Carga los DataFrames de Parquet usando el esquema
#         print("Loading train and test datasets from Parquet...")
#         train_df = spark.read.parquet(processed_train_path_parquet, schema=schema)
#         test_df = spark.read.parquet(processed_test_path_parquet, schema=schema)
# 
# 
#         train_df = train_df.sample(fraction=0.1)  # Usa una muestra del 10%
#         train_df = train_df.repartition(10)  # Reparticionar para evitar carga de memoria
# 
# 
#         # Persistir los DataFrames en disco
#         train_df.persist(StorageLevel.DISK_ONLY)
#         test_df.persist(StorageLevel.DISK_ONLY)
# 
#         # Confirmación de datos cargados
#         print("Train and test datasets loaded successfully!")
#         print(f"Train dataset count: {train_df.count()}")
#         print(f"Test dataset count: {test_df.count()}")
# 
#         # train_df = train_df.repartition(200)  # Ajusta el número de particiones según el tamaño de los datos
# 
#         # Train the model
#         print("Training the model...")
#         metrics = build_and_train_model(train_df, pipeline, model_path)
#         print(f"Training completed. Metrics: {metrics}")
# 
#         # Predict using the trained model
#         print("Making predictions on the test dataset...")
#         output_path = model_path + "_predictions"
#         predict(test_df, model_path, output_path)
#         print(f"Predictions saved to: {output_path}")
# 
#     except Exception as e:
#         print(f"An error occurred: {e}")
#         print("An error occurred:", e)
#         traceback.print_exc()
# 
# # # Rutas de ejemplo
# # processed_schema_path = "data/processed/schema.json"  # Ruta al archivo JSON del esquema
# # processed_train_path_parquet = "data/processed/train.parquet"  # Ruta al archivo Parquet de train
# # processed_test_path_parquet = "data/processed/test.parquet"  # Ruta al archivo Parquet de test
# # model_path = "data/models/trained_model"
# 
# # # Llamada a la función
# # test_train_and_predict(processed_schema_path, processed_train_path_parquet, processed_test_path_parquet, model_path)


In [None]:
def validate(data, model_path, output_path):
    """
    Loads every trained model found under ``model_path``, generates predictions on ``data``,
    prints evaluation metrics when the label is available, and saves the predictions as CSV.

    Parameters:
    -----------
    data (pyspark.sql.DataFrame):
        Input Spark DataFrame containing:
        - All feature columns used during training
        - 'ArrDelay' column if evaluation metrics are needed
        DataFrame should be preprocessed using the same steps as training data

    model_path (str):
        Directory path containing the saved models.
        Each subdirectory is loaded as a PipelineModel. Plain files and the ``output_path``
        directory itself (when it lives inside ``model_path``) are skipped.

    output_path (str):
        Directory path where prediction CSV output will be saved.
        One CSV output is created for each model with format: {model_name}_pred.csv

    Evaluation Metrics:
    ------------------
    For each model, when 'ArrDelay' is present, prints:
    - RMSE (Root Mean Square Error)
    - MAE (Mean Absolute Error)
    - R² (R-squared score)

    Output Files:
    ------------
    CSV output contains every column of the prediction DataFrame whose name has no underscore,
    except 'features' and 'scaledFeatures'.

    Returns:
        None
    """
    # Metrics need the ground-truth label; without it only predictions are written
    has_label = "ArrDelay" in data.columns
    output_dir = os.path.abspath(output_path)

    metric_labels = (
        ("rmse", "Root Mean Square Error (RMSE)"),
        ("mae", "Mean Absolute Error (MAE)"),
        ("r2", "R-Squared (R²)"),
    )

    for model_name in sorted(os.listdir(model_path)):
        model_folder = os.path.join(model_path, model_name)

        # Saved models are directories. The predictions folder written by a previous run can sit
        # next to them and must not be loaded as a model; the same goes for stray files.
        if not os.path.isdir(model_folder) or os.path.abspath(model_folder) == output_dir:
            continue

        print(f"Loading model: {model_folder}")
        model = PipelineModel.load(model_folder)
        name = model_name

        # Make predictions on the input data
        print("Validating model...")
        predictions = model.transform(data)

        if has_label:
            metrics = {}
            for metric_name, label in metric_labels:
                evaluator = RegressionEvaluator(labelCol="ArrDelay", predictionCol="prediction",
                                                metricName=metric_name)
                metrics[metric_name] = evaluator.evaluate(predictions)
                print(f"{name} - {label} on test data: {metrics[metric_name]}")
        else:
            print(f"{name} - 'ArrDelay' column not found, skipping evaluation metrics")

        # Save predictions to the specified output path
        out_csv_path = os.path.join(output_path, name + "_pred.csv")
        print(f"Saving predictions to {out_csv_path}")
        # Keep only columns without an underscore in their name, minus the vector columns
        old_columns = [column for column in predictions.columns if "_" not in column]
        predictions = predictions.select(old_columns).drop(*["features", "scaledFeatures"])
        os.makedirs(output_path, exist_ok=True)  # Ensure output directory exists
        predictions.write.mode("overwrite").csv(out_csv_path, header=True)
        print(f"Predictions saved to: {output_path}")


In [None]:
def build_and_train_model(train_df, model_save_path=None):
    """
    Builds, trains, and evaluates multiple regression models using cross-validation, with optional model saving.

    Parameters:
    -----------
    train_df (pyspark.sql.DataFrame):
        Input Spark DataFrame containing:
        - A 'features' column with the feature vector
        - An 'ArrDelay' column as the target variable
        A pre-existing 'prediction' column is dropped before anything else happens.

    model_save_path (str, optional):
        Directory path to save the retrained best models. If provided, each model is refit on the
        whole of ``train_df`` with the hyperparameters selected by cross-validation and saved in a
        subdirectory named 'retrained_{model_name}'.

    Models Trained:
    --------------
    1. Linear Regression
       - Hyperparameter tuned: regParam [0.1, 0.3, 0.5]

    2. Decision Tree Regressor
       - Hyperparameter tuned: maxDepth [5, 10, 20]

    3. Random Forest Regressor
       - Hyperparameter tuned: numTrees [10, 50, 100]

    Pipeline Steps:
    --------------
    1. 80/20 split of ``train_df`` (seed 42) into a tuning set and a hold-out set
    2. StandardScaler (withMean=True, withStd=True) followed by the model
    3. 5-fold cross-validation over the model-specific grid, selected by RMSE
    4. Evaluation on the hold-out set: RMSE, MAE, and R²

    Returns:
    --------
    dict: Nested dictionary containing evaluation metrics for each model
        {
            'Model_Name': {
                'rmse': Root Mean Square Error value,
                'mae': Mean Absolute Error value,
                'r2': R-squared value
            },
            ...
        }
    """
    # A stale 'prediction' column would collide with the column the fitted models add in
    # transform(); remove it once, up front, so both splits and the final refit are clean.
    if "prediction" in train_df.columns:
        train_df = train_df.drop("prediction")

    # Split data into a cross-validation set and a hold-out evaluation set
    train_data, test_data = train_df.randomSplit([0.8, 0.2], seed=42)

    # Initialize metrics dictionary
    all_metrics = {}

    # Define the models: RandomForestRegressor, DecisionTreeRegressor, LinearRegression
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withMean=True, withStd=True)
    rf = RandomForestRegressor(featuresCol="scaledFeatures", labelCol="ArrDelay")
    dt = DecisionTreeRegressor(featuresCol="scaledFeatures", labelCol="ArrDelay")
    lr = LinearRegression(featuresCol="scaledFeatures", labelCol="ArrDelay")

    models = {
        'Linear_Regression': lr, 'Decision_Tree': dt, 'Random_Forest': rf
    }

    metric_labels = (
        ("rmse", "Root Mean Square Error (RMSE)"),
        ("mae", "Mean Absolute Error (MAE)"),
        ("r2", "R-Squared (R²)"),
    )

    for name, model in models.items():
        print(f"Training {name} model...")

        # Scaler + current model
        pipeline = Pipeline(stages=[scaler, model])

        # Hyperparameter grid for the current model
        param_grid_builder = ParamGridBuilder()

        if isinstance(model, RandomForestRegressor):
            param_grid_builder.addGrid(model.numTrees, [10, 50, 100])
        elif isinstance(model, DecisionTreeRegressor):
            param_grid_builder.addGrid(model.maxDepth, [5, 10, 20])
        elif isinstance(model, LinearRegression):
            param_grid_builder.addGrid(model.regParam, [0.1, 0.3, 0.5])

        param_grid = param_grid_builder.build()

        evaluator = RegressionEvaluator(labelCol="ArrDelay", predictionCol="prediction", metricName="rmse")

        # Set up cross-validation
        cv = CrossValidator(
            estimator=pipeline,
            estimatorParamMaps=param_grid,
            evaluator=evaluator,
            numFolds=5
        )

        # Train the model with cross-validation
        cv_model = cv.fit(train_data)

        # Generate predictions on the hold-out dataset
        predictions = cv_model.transform(test_data)

        # Evaluate the model using multiple metrics
        metrics = {}
        for metric_name, label in metric_labels:
            metric_evaluator = RegressionEvaluator(labelCol="ArrDelay", predictionCol="prediction",
                                                   metricName=metric_name)
            metrics[metric_name] = metric_evaluator.evaluate(predictions)
            print(f"{name} - {label} on test data: {metrics[metric_name]}")

        # Store model-specific metrics
        all_metrics[name] = metrics

        # Save the best model if a save path is provided
        if model_save_path:
            # avgMetrics[i] is the cross-validated score of param_grid[i]; pick the winning entry.
            # (Reading the param map off the fitted PipelineModel does not yield the tuned
            # hyperparameters of the regressor stage.)
            avg_metrics = list(cv_model.avgMetrics)
            pick = max if evaluator.isLargerBetter() else min
            best_index = pick(range(len(avg_metrics)), key=avg_metrics.__getitem__)
            best_params = param_grid[best_index]
            readable_params = {param.name: value for param, value in best_params.items()}

            print(f"Re-Training {name} model with params {readable_params}...")
            # fit(dataset, params) fits a copy of the pipeline with the given overrides applied,
            # this time on the complete training DataFrame (tuning set + hold-out set).
            pipeline_model = pipeline.fit(train_df, best_params)
            path = f"{model_save_path}/retrained_{name}"
            pipeline_model.write().overwrite().save(path)
            print(f"Selected best model retrained saved to: {path}")
    return all_metrics


In [None]:
import json
import os
from pyspark import StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.ml import Pipeline

def test_train_and_predict(processed_schema_path, processed_train_path_parquet, processed_test_path_parquet, model_path, debug=False, skip_training=False):
    """
    Orchestrates the modelling stage: configures Spark, loads the preprocessed train/test Parquet
    data with the stored schema, optionally trains the models, and validates them on the test set.

    Parameters:
    processed_schema_path (str): Path to the JSON file containing the DataFrame schema definition.
                                Applied to both the train and the test reader.

    processed_train_path_parquet (str): Path to the preprocessed training data in Parquet format.
                                       Passed to build_and_train_model unless skip_training=True.

    processed_test_path_parquet (str): Path to the preprocessed test data in Parquet format.
                                      Passed to validate().

    model_path (str): Base directory path for:
                      - Saving the trained models if training is performed
                      - Loading the models during validation
                      - Storing predictions in a 'predictions' subdirectory

    debug (bool, optional): If True, runs the pipeline on a 10% sample (seed 42) of the training
                           data and uses 1 partition instead of 10. Defaults to False.

    skip_training (bool, optional): If True, skips the model training phase and validates whatever
                                   models are already stored under model_path. Defaults to False.

    Returns:
    None
    """
    # Spark configuration
    spark = (SparkSession.builder
             .appName("Optimización con recursos limitados")
             .master("local[*]")
             .config("spark.executor.memory", "18g")
             .config("spark.driver.memory", "42g")
             .config("spark.sql.shuffle.partitions", "10")
             .config("spark.python.worker.timeout", "600")
             .config("spark.network.timeout", "600s")
             .config("spark.executor.heartbeatInterval", "60s")
             .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
             .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
             .getOrCreate())

    # Load schema from JSON
    print("Loading schema from JSON...")
    with open(processed_schema_path, 'r') as f:
        schema_json = f.read()
    schema = StructType.fromJson(json.loads(schema_json))

    # Load DataFrames from Parquet
    print("Loading train and test datasets from Parquet...")
    train_df = spark.read.schema(schema).parquet(processed_train_path_parquet)
    test_df = spark.read.schema(schema).parquet(processed_test_path_parquet)

    # Sample and repartition data
    if debug:
        # Seeded so that debug runs are reproducible
        train_df = train_df.sample(fraction=0.1, seed=42).repartition(1)
    else:
        train_df = train_df.repartition(10)

    # train_df.persist(StorageLevel.DISK_ONLY)
    # test_df.persist(StorageLevel.DISK_ONLY)

    # Confirm data loading
    print("Train and test datasets loaded successfully!")
    print(f"Train dataset count: {train_df.count()}")
    print(f"Test dataset count: {test_df.count()}")

    # Train the model (only announce training when it actually happens)
    if not skip_training:
        print("Training the model...")
        metrics = build_and_train_model(train_df, model_path)
        print(f"Training completed. Metrics: {metrics}")
    else:
        print(f"Skipping training, using models stored under {model_path}")

    # Predict using the trained model
    print("Making predictions on the test dataset...")
    output_path = model_path + "/predictions/"

    validate(test_df, model_path, output_path)
        
        

# Example paths (all relative to the current working directory)
processed_schema_path = "data/processed/schema.json"
processed_train_path_parquet = "data/processed/train.parquet"
processed_test_path_parquet = "data/processed/test.parquet"
# Used both as the directory the models are saved to and as the base of the predictions folder
model_path = "data/models/trained_model"

# Call the function: full training data (debug=False) and training enabled (skip_training=False).
# This runs as soon as the cell is executed.
test_train_and_predict(processed_schema_path, processed_train_path_parquet, processed_test_path_parquet, model_path, debug=False, skip_training=False)


In [None]:
# spark-submit notebook.py --mode train --input path/to/train.csv --model path/to/save_model
# spark-submit notebook.py --mode predict --input path/to/test.csv --model path/to/save_model --output path/to/predictions