In [1]:
# =================== 1. Setup Spark and Import Libraries ===================
from pyspark.sql import SparkSession
from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor, LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
import argparse
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import *


# Initialize Spark Session
spark = SparkSession.builder.appName("MachineLearningProject").getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/07 03:30:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# =================== 1. Data Reading ===================
def load_data(spark, input_path, mode):
    """
    Load the input dataset, drop forbidden columns, and validate its structure.

    Args:
        spark (SparkSession): The active Spark session.
        input_path (str): Path to the input CSV file.
        mode (str): Mode of operation ("train" or "predict").
    
    Returns:
        DataFrame: Processed Spark DataFrame.
    """
    forbidden_columns = [
        "ArrTime", "ActualElapsedTime", "AirTime", "TaxiIn", "Diverted",
        "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay"
    ]

    try:
        # Read the dataset
        data = spark.read.csv(input_path, header=True, inferSchema=True)

        # Drop forbidden columns
        data = data.drop(*forbidden_columns)

        # Check if the dataset is empty
        if data.count() == 0:
            raise ValueError("The dataset is empty.")

        # Validate the presence of the target variable for training
        if mode == "train" and "ArrDelay" not in data.columns:
            raise ValueError("The target variable 'ArrDelay' is missing.")
    
    except Exception as e:
        print(f"Error reading the dataset: {e}")
        spark.stop()
        raise
    
    return data


In [3]:
# =================== 2. Exploratory Data Analysis (EDA) ===================
def eda(data):
    """
    Perform exploratory data analysis on the dataset, including univariate and multivariate analysis.

    Args:
        data (DataFrame): Spark DataFrame to analyze.

    Returns:
        DataFrame: DataFrame with a feature vector column added for further processing.
    """
    # Univariate analysis: Display statistical summary
    print("Statistical Summary:")
    data.describe().show()

    # Multivariate analysis: Correlations and patterns
    # Select numeric columns for correlation analysis
    numeric_cols = [col for col, dtype in data.dtypes if dtype in ('int', 'double')]

    if numeric_cols:
        # Assemble numeric columns into a single feature vector
        vector_col = "features_vector"
        assembler = VectorAssembler(inputCols=numeric_cols, outputCol=vector_col)
        data = assembler.transform(data)

        # Display correlation matrix for the numeric features
        from pyspark.ml.stat import Correlation
        try:
            correlation_matrix = Correlation.corr(data, vector_col).head()[0]
            print(f"Correlation matrix:\n{correlation_matrix}")
        except Exception as e:
            print(f"Error calculating correlations: {e}")
    else:
        print("No numeric columns available for correlation analysis.")

    return data


In [4]:
# =================== 3. Data Processing ===================
def process_data(data, mode):
    """
    Process the dataset: handle missing values and perform feature engineering.

    Args:
        data (DataFrame): Spark DataFrame to process.
        mode (str): Mode of operation ("train" or "predict").
    
    Returns:
        DataFrame: Processed Spark DataFrame with new features added.
    """
    # Validate the target variable for training mode
    if mode == "train" and "ArrDelay" not in data.columns:
        raise ValueError("The target variable 'ArrDelay' is missing.")

    # Handle missing values
    if mode == "train":
        # Drop rows where the target variable or features are null
        data = data.dropna(subset=["ArrDelay"])

    # Example: Fill null values in specific columns with a default value
    # Replace 'column_name' with actual column names as needed
    # Uncomment this if specific columns require filling
    # data = data.fillna({"column_name": 0})

    # Transform special variables
    # Feature engineering: Create time-based features
    if "DepTime" in data.columns:
        data = data.withColumn("DepHour", (col("DepTime") / 100).cast("int"))  # Extract hour from departure time

    if "FlightDate" in data.columns:
        data = data.withColumn("DayOfWeek", date_format(col("FlightDate"), "u").cast("int"))  # Convert to day of the week

    # Feature engineering: Create flight distance categories
    if "Distance" in data.columns:
        data = data.withColumn(
            "DistanceCategory",
            when(col("Distance") < 500, "Short")  # Short flights
            .when((col("Distance") >= 500) & (col("Distance") < 1500), "Medium")  # Medium flights
            .otherwise("Long")  # Long flights
        )
    
    return data


In [5]:
# =================== 4. Feature Engineering ===================
def feature_engineering(data, additional_dataset_path=None):
    """
    Perform feature engineering, including creating new features and optionally integrating additional datasets.

    Args:
        data (DataFrame): Spark DataFrame for feature engineering.
        additional_dataset_path (str): Optional path to an additional dataset for integration.
    
    Returns:
        DataFrame: Enhanced Spark DataFrame with new features.
    """
    # Create new features based on existing columns
    if "existing_column" in data.columns:
        data = data.withColumn("new_feature", col("existing_column") * 2)  # Example transformation

    # Optional: Integrate additional datasets
    if additional_dataset_path:
        try:
            additional_dataset = spark.read.csv(additional_dataset_path, header=True, inferSchema=True)
            
            # Example: Join the datasets on a common column
            if "common_column" in data.columns and "common_column" in additional_dataset.columns:
                data = data.join(additional_dataset, on="common_column", how="left")
        except Exception as e:
            print(f"Error integrating additional dataset: {e}")

    return data


In [6]:
def build_and_train_model(data, pipeline, model_save_path=None):
    """
    Build, train, evaluate, and optionally save the model using cross-validation with three models.

    Args:
        data (DataFrame): Spark DataFrame with features and labels.
        pipeline (Pipeline): Preprocessing pipeline to use before modeling.
        model_save_path (str): Path to save the trained model (optional).
    
    Returns:
        dict: Evaluation metrics for the trained model.
    """
    # Split data into training and testing sets
    train_data = data

    if "prediction" in train_data.columns:
        train_data = train_data.drop("prediction")

    # Define the models: RandomForestRegressor, DecisionTreeRegressor, LinearRegression
    rf = RandomForestRegressor(featuresCol="features", labelCol="ArrDelay")
    dt = DecisionTreeRegressor(featuresCol="features", labelCol="ArrDelay")
    lr = LinearRegression(featuresCol="features", labelCol="ArrDelay")

    # Initialize metrics dictionary
    all_metrics = {}

    # Evaluate each model separately
    models = [rf, dt, lr]
    model_names = ['Random Forest', 'Decision Tree', 'Linear Regression']
    
    for model, name in zip(models, model_names):
        print(f"Training {name} model...")
        
        # Add the current model to the pipeline
        pipeline.setStages(pipeline.getStages() + [model])

        # Hyperparameter tuning with cross-validation for the current model
        param_grid_builder = ParamGridBuilder()

        if isinstance(model, RandomForestRegressor):
            param_grid_builder.addGrid(model.numTrees, [10, 50, 100])
        elif isinstance(model, DecisionTreeRegressor):
            param_grid_builder.addGrid(model.maxDepth, [5, 10, 20])
        elif isinstance(model, LinearRegression):
            param_grid_builder.addGrid(model.regParam, [0.1, 0.3, 0.5])

        # Construir la grilla de parámetros
        param_grid = param_grid_builder.build()


        evaluator = RegressionEvaluator(labelCol="ArrDelay", predictionCol="prediction", metricName="rmse")

        # Set up cross-validation
        cv = CrossValidator(
            estimator=pipeline,
            estimatorParamMaps=param_grid,
            evaluator=evaluator,
            numFolds=5
        )

        # Train the model with cross-validation
        cv_model = cv.fit(train_data)

        if "prediction" in data.columns:
            data = data.drop("prediction")

        # Generate predictions on the test dataset
        predictions = cv_model.transform(test_data)

        # Evaluate the model using multiple metrics
        metrics = {}
        # Root Mean Square Error (RMSE)
        rmse_evaluator = RegressionEvaluator(labelCol="ArrDelay", predictionCol="prediction", metricName="rmse")
        metrics['rmse'] = rmse_evaluator.evaluate(predictions)
        print(f"{name} - Root Mean Square Error (RMSE) on test data: {metrics['rmse']}")

        # Mean Absolute Error (MAE)
        mae_evaluator = RegressionEvaluator(labelCol="ArrDelay", predictionCol="prediction", metricName="mae")
        metrics['mae'] = mae_evaluator.evaluate(predictions)
        print(f"{name} - Mean Absolute Error (MAE) on test data: {metrics['mae']}")

        # R-Squared (R²)
        r2_evaluator = RegressionEvaluator(labelCol="ArrDelay", predictionCol="prediction", metricName="r2")
        metrics['r2'] = r2_evaluator.evaluate(predictions)
        print(f"{name} - R-Squared (R²) on test data: {metrics['r2']}")

        # Store model-specific metrics
        all_metrics[name] = metrics

        # Save the best model if a save path is provided
        if model_save_path:
            cv_model.bestModel.write().overwrite().save(f"{model_save_path}_{name}")
            print(f"Best {name} model saved to: {model_save_path}_{name}")

    return all_metrics


In [7]:
def predict(data, model_path, output_path):
    """
    Load the trained model and generate predictions for the given data.

    Args:
        data (DataFrame): Spark DataFrame for predictions.
        model_path (str): Path to load the trained model.
        output_path (str): Path to save predictions (CSV).
    
    Returns:
        None
    """
    # Load the trained model
    model = PipelineModel.load(model_path)

    # Drop the existing 'prediction' column if it exists
    if "prediction" in data.columns:
        print("Dropping existing 'prediction' column...")
        data = data.drop("prediction")

    # Make predictions on the input data
    predictions = model.transform(data)

    # Save predictions to the specified output path
    predictions.select("features_vector", "prediction").write.csv(output_path, header=True)
    print(f"Predictions saved to: {output_path}")


In [8]:
def main():
    """
    Main function to execute the pipeline workflow.
    Accepts command-line arguments for dynamic input/output handling.
    """
    parser = argparse.ArgumentParser(description="Flight Delay Prediction Application")
    parser.add_argument("--mode", type=str, required=True, choices=["train", "predict"], help="Mode: train or predict")
    parser.add_argument("--input", type=str, required=True, help="Path to input CSV file")
    parser.add_argument("--model", type=str, required=True, help="Path to save/load the model")
    parser.add_argument("--output", type=str, help="Path to save predictions (required for predict mode)")

    args = parser.parse_args()

    # Start Spark Session
    spark = SparkSession.builder.appName("FlightDelayPipeline").getOrCreate()

    try:
        # Workflow
        data = load_data(spark, args.input, args.mode)  # Load the dataset
        data = eda(data)
        data = process_data(data, args.mode)        # Preprocess the dataset
        pipeline, _ = feature_engineering(data)        # Perform feature engineering

        if args.mode == "train":
            # Train the model, evaluate it, and optionally save it
            metrics = build_and_train_model(data, pipeline, args.model)
            print(f"Training completed. Evaluation metrics: {metrics}")
        elif args.mode == "predict":
            if not args.output:
                raise ValueError("Output path is required for prediction mode.")
            # Use the trained model to generate predictions
            predict(data, args.model, args.output)

    finally:
        # Stop Spark Session
        spark.stop()

# Add this block to execute the script when running it as a standalone script
#if __name__ == "__main__":
#    main()


In [9]:
import json
from pyspark import StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.functions import col, isnan
from pyspark.storagelevel import StorageLevel

def test_train_and_predict(processed_schema_path, processed_train_path_parquet, processed_test_path_parquet, model_path):
    """
    Function to test the build_and_train_model and predict functions using train and test datasets.

    Args:
        processed_schema_path (str): Path to the schema JSON file.
        processed_train_path_parquet (str): Path to the train.parquet file.
        processed_test_path_parquet (str): Path to the test.parquet file.
        model_path (str): Path to save or load the trained model.

    Returns:
        None
    """
    

    # Configuración de Spark
    spark = SparkSession.builder \
        .appName("Optimización con recursos limitados") \
        .master("local[*]") \
        .config("spark.executor.memory", "2g") \
        .config("spark.driver.memory", "2g") \
        .config("spark.sql.shuffle.partitions", "100") \
        .getOrCreate()

        
    pipeline = Pipeline(stages=[])
        
    try:
        # Carga el esquema desde el archivo JSON
        print("Loading schema from JSON...")
        with open(processed_schema_path, 'r') as f:
            schema_json = f.read()

        schema = StructType.fromJson(json.loads(schema_json))

        # Carga los DataFrames de Parquet usando el esquema
        print("Loading train and test datasets from Parquet...")
        train_df = spark.read.parquet(processed_train_path_parquet, schema=schema)
        test_df = spark.read.parquet(processed_test_path_parquet, schema=schema)


        train_df = train_df.sample(fraction=0.1)  # Usa una muestra del 10%
        train_df = train_df.repartition(10)  # Reparticionar para evitar carga de memoria


        # Persistir los DataFrames en disco
        train_df.persist(StorageLevel.DISK_ONLY)
        test_df.persist(StorageLevel.DISK_ONLY)

        # Confirmación de datos cargados
        print("Train and test datasets loaded successfully!")
        print(f"Train dataset count: {train_df.count()}")
        print(f"Test dataset count: {test_df.count()}")

        train_df = train_df.repartition(200)  # Ajusta el número de particiones según el tamaño de los datos

        # Train the model
        print("Training the model...")
        metrics = build_and_train_model(train_df, pipeline, model_path)
        print(f"Training completed. Metrics: {metrics}")

        # Predict using the trained model
        print("Making predictions on the test dataset...")
        output_path = model_path + "_predictions"
        predict(test_df, model_path, output_path)
        print(f"Predictions saved to: {output_path}")

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        train_df.unpersist()
        test_df.unpersist()
        spark.stop()

# # Rutas de ejemplo
# processed_schema_path = "data/processed/schema.json"  # Ruta al archivo JSON del esquema
# processed_train_path_parquet = "data/processed/train.parquet"  # Ruta al archivo Parquet de train
# processed_test_path_parquet = "data/processed/test.parquet"  # Ruta al archivo Parquet de test
# model_path = "data/models/trained_model"

# # Llamada a la función
# test_train_and_predict(processed_schema_path, processed_train_path_parquet, processed_test_path_parquet, model_path)


In [10]:
import json
import os
from pyspark import StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.ml import Pipeline

def test_train_and_predict(processed_schema_path, processed_train_path_parquet, processed_test_path_parquet, model_path):
    """
    Function to test the build_and_train_model and predict functions using train and test datasets.

    Args:
        processed_schema_path (str): Path to the schema JSON file.
        processed_train_path_parquet (str): Path to the train.parquet file.
        processed_test_path_parquet (str): Path to the test.parquet file.
        model_path (str): Path to save or load the trained model.

    Returns:
        None
    """
    # Spark configuration
    spark = SparkSession.builder \
        .appName("Optimización con recursos limitados") \
        .master("local[*]") \
        .config("spark.executor.memory", "2g") \
        .config("spark.driver.memory", "2g") \
        .config("spark.sql.shuffle.partitions", "100") \
        .getOrCreate()

    try:
        # Load schema from JSON
        print("Loading schema from JSON...")
        with open(processed_schema_path, 'r') as f:
            schema_json = f.read()
        schema = StructType.fromJson(json.loads(schema_json))

        # Load DataFrames from Parquet
        print("Loading train and test datasets from Parquet...")
        train_df = spark.read.schema(schema).parquet(processed_train_path_parquet)
        test_df = spark.read.schema(schema).parquet(processed_test_path_parquet)

        # Sample and repartition data
        train_df = train_df.sample(fraction=0.1).repartition(10)
        train_df.persist(StorageLevel.DISK_ONLY)
        test_df.persist(StorageLevel.DISK_ONLY)

        # Confirm data loading
        print("Train and test datasets loaded successfully!")
        print(f"Train dataset count: {train_df.count()}")
        print(f"Test dataset count: {test_df.count()}")

        # Train the model
        print("Training the model...")
        pipeline = Pipeline(stages=[])

        metrics = build_and_train_model(train_df, pipeline, model_path)
        print(f"Training completed. Metrics: {metrics}")

        # Predict using the trained model
        print("Making predictions on the test dataset...")
        output_path = model_path + "_predictions"
        os.makedirs(output_path, exist_ok=True)  # Ensure output directory exists
        predict(test_df, model_path, output_path)
        print(f"Predictions saved to: {output_path}")

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        if 'train_df' in locals():
            train_df.unpersist()
        if 'test_df' in locals():
            test_df.unpersist()
        spark.stop()

# Example paths
processed_schema_path = "data/processed/schema.json"
processed_train_path_parquet = "data/processed/train.parquet"
processed_test_path_parquet = "data/processed/test.parquet"
model_path = "data/models/trained_model"

# Call the function
test_train_and_predict(processed_schema_path, processed_train_path_parquet, processed_test_path_parquet, model_path)


25/01/07 03:30:26 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Loading schema from JSON...
Loading train and test datasets from Parquet...


25/01/07 03:30:27 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Train and test datasets loaded successfully!


                                                                                

Train dataset count: 178394


                                                                                

Test dataset count: 447562
Training the model...
Training Random Forest model...


                                                                                

An error occurred: name 'test_data' is not defined


In [11]:
# spark-submit notebook.py --mode train --input path/to/train.csv --model path/to/save_model
# spark-submit notebook.py --mode predict --input path/to/test.csv --model path/to/save_model --output path/to/predictions