In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=61c01ebf377b4888b6f207c83a8b4ba30b049557e968c849eae7cc04f6d70a8b
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
# Install the sparkxgb package.
# NOTE(review): according to the captured install log, sparkxgb requires
# pyspark==3.1.1 and py4j==0.10.9, so running this cell after a plain
# `pip install pyspark` changes which PySpark version ends up installed.
# None of the cells visible here import sparkxgb - confirm it is still needed.
!pip install sparkxgb

Collecting sparkxgb
  Downloading sparkxgb-0.1.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyspark==3.1.1 (from sparkxgb)
  Downloading pyspark-3.1.1.tar.gz (212.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.3/212.3 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9 (from pyspark==3.1.1->sparkxgb)
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sparkxgb, pyspark
  Building wheel for sparkxgb (setup.py) ... [?25l[?25hdone
  Created wheel for sparkxgb: filename=sparkxgb-0.1-py3-none-any.whl size=5629 sha256=4dbe328c2466e0a889c42b89442f4bc20280ad312b53d816597d2a3f78f624ef
  Stored in directory: /root/.cache/pip/wheels/b7/0c/a1/786408e13056fabeb8a72134e101b1e142fc95905c7b0e2

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col, isnan
from pyspark.ml.feature import VectorAssembler, PolynomialExpansion
from pyspark.ml.regression import RandomForestRegressor, LinearRegression, GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import Imputer

# --- Spark session ------------------------------------------------------------
spark = (
    SparkSession.builder
    .appName("FlightDelayPred")
    .getOrCreate()
)

# --- Input data ---------------------------------------------------------------
data_path = "/content/drive/MyDrive/Datasets/1987.csv"  # Replace with your file path
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Columns fed to the models, and the column being predicted.
features = ['Month', 'CRSDepTime', 'DepTime', 'Distance', 'DepDelay', 'CRSElapsedTime','DayofMonth','FlightNum']
target = 'ArrDelay'

# Force every modelling column (all features plus the target) to float,
# regardless of the type inferSchema picked for it.
for column_name in features + [target]:
    df = df.withColumn(column_name, col(column_name).cast(FloatType()))

# Keep only rows whose target is present: neither null nor NaN.
df = df.where(col(target).isNotNull() & ~isnan(col(target)))

# Median imputer over the feature columns.
# NOTE(review): this object is not added to any of the pipelines below, so it
# has no effect on training; missing feature values are instead handled by the
# assembler's handleInvalid="skip".
imputer = Imputer(strategy='median', inputCols=features)

# Packs the feature columns into one vector column named "features";
# handleInvalid="skip" filters out rows that contain invalid feature values.
assembler = VectorAssembler(handleInvalid="skip", outputCol="features", inputCols=features)

# --- Estimators ---------------------------------------------------------------
# Degree-5 polynomial expansion of the assembled vector, consumed by poly_lr.
polyExpansion = PolynomialExpansion(inputCol="features", outputCol="polyFeatures", degree=5)
poly_lr = LinearRegression(labelCol=target, featuresCol="polyFeatures")

# Estimators that work directly on the assembled "features" vector.
rf = RandomForestRegressor(labelCol=target, featuresCol="features")
lr = LinearRegression(labelCol=target, featuresCol="features")
gbt = GBTRegressor(labelCol=target, featuresCol="features")

# --- Pipelines ----------------------------------------------------------------
# Every pipeline starts with the shared assembler; the polynomial pipeline has
# the expansion stage between the assembler and its regressor.
poly_pipeline = Pipeline(stages=[assembler, polyExpansion, poly_lr])
rf_pipeline = Pipeline(stages=[assembler, rf])
lr_pipeline = Pipeline(stages=[assembler, lr])
gbt_pipeline = Pipeline(stages=[assembler, gbt])

# Only the random forest is run here; the full comparison would be
# [rf_pipeline, lr_pipeline, gbt_pipeline, poly_pipeline].
models = [rf_pipeline]

# Hold out a quarter of the rows for evaluation. Fitting and scoring on the
# same DataFrame reports in-sample error, which overstates how well the model
# will do on unseen flights. The fixed seed keeps the split reproducible.
train_df, test_df = df.randomSplit([0.75, 0.25], seed=42)

# The evaluators do not depend on the model being scored, so build them once.
rmse_evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="rmse")
r2_evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="r2")

for model in models:
    # Train the model on the training split only
    trained_model = model.fit(train_df)

    # Make predictions on the held-out split
    predictions = trained_model.transform(test_df)

    # Both metrics only read the label and the prediction. Cache that narrow
    # slice so the second evaluate() call reuses it instead of re-running the
    # whole pipeline transform, and release it once both metrics are computed.
    scored = predictions.select(target, "prediction").cache()
    try:
        rmse = rmse_evaluator.evaluate(scored)
        r2 = r2_evaluator.evaluate(scored)
    finally:
        scored.unpersist()

    # A LinearRegression that is preceded by more than the assembler stage is
    # the polynomial pipeline; label it as such so it is not confused with the
    # plain linear model.
    stages = model.getStages()
    if isinstance(stages[-1], LinearRegression) and len(stages) > 2:
        model_name = "Polynomial Linear Regression"
    else:
        model_name = type(stages[-1]).__name__
    print(f"{model_name} - Root Mean Squared Error (RMSE): {rmse}, R2: {r2}")

# Stop the Spark session
spark.stop()

RandomForestRegressor - Root Mean Squared Error (RMSE): 16.817192711743353, R2: 0.5754737491427673


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType
from pyspark.sql.functions import col, isnan
from pyspark.ml.feature import VectorAssembler, PolynomialExpansion
from pyspark.ml.regression import RandomForestRegressor, LinearRegression, GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import Imputer

# --- Spark session ------------------------------------------------------------
spark = (
    SparkSession.builder
    .appName("FlightDelayPred")
    .getOrCreate()
)

# --- Input data ---------------------------------------------------------------
data_path = "/content/drive/MyDrive/SENG550/2008.csv"  # Replace with your file path
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Columns fed to the models, and the column being predicted.
features = ['Month', 'CRSDepTime', 'DepTime', 'Distance', 'DepDelay', 'CRSElapsedTime', 'DayofMonth', 'FlightNum']
target = 'ArrDelay'

# Force every modelling column (all features plus the target) to float,
# regardless of the type inferSchema picked for it.
for column_name in features + [target]:
    df = df.withColumn(column_name, col(column_name).cast(FloatType()))

# Keep only rows whose target is present: neither null nor NaN.
df = df.where(col(target).isNotNull() & ~isnan(col(target)))

# Reproducible split: 75% of the rows for training, 25% held out for testing.
train_df, test_df = df.randomSplit([0.75, 0.25], seed=42)

# Mean imputer over the feature columns.
# NOTE(review): this object is not added to any of the pipelines below, so it
# has no effect on training; missing feature values are instead handled by the
# assembler's handleInvalid="skip".
imputer = Imputer(strategy='mean', inputCols=features)

# Packs the feature columns into one vector column named "features";
# handleInvalid="skip" filters out rows that contain invalid feature values.
assembler = VectorAssembler(handleInvalid="skip", outputCol="features", inputCols=features)

# --- Estimators ---------------------------------------------------------------
# Degree-5 polynomial expansion of the assembled vector, consumed by poly_lr.
polyExpansion = PolynomialExpansion(inputCol="features", outputCol="polyFeatures", degree=5)
poly_lr = LinearRegression(labelCol=target, featuresCol="polyFeatures")

# Estimators that work directly on the assembled "features" vector.
rf = RandomForestRegressor(labelCol=target, featuresCol="features")
lr = LinearRegression(labelCol=target, featuresCol="features")
gbt = GBTRegressor(labelCol=target, featuresCol="features")

# --- Pipelines ----------------------------------------------------------------
# Every pipeline starts with the shared assembler; the polynomial pipeline has
# the expansion stage between the assembler and its regressor.
poly_pipeline = Pipeline(stages=[assembler, polyExpansion, poly_lr])
rf_pipeline = Pipeline(stages=[assembler, rf])
lr_pipeline = Pipeline(stages=[assembler, lr])
gbt_pipeline = Pipeline(stages=[assembler, gbt])

# Pipelines to train and evaluate, in the order their results are printed.
models = [rf_pipeline, lr_pipeline, gbt_pipeline, poly_pipeline]

# The evaluators do not depend on the model being scored, so build them once.
rmse_evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="rmse")
r2_evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="r2")

for model in models:
    # Train the model on the training set
    trained_model = model.fit(train_df)

    # Make predictions on the test set
    predictions = trained_model.transform(test_df)

    # Both metrics only read the label and the prediction. Cache that narrow
    # slice so the second evaluate() call reuses it instead of re-running the
    # whole pipeline transform, and release it once both metrics are computed.
    scored = predictions.select(target, "prediction").cache()
    try:
        rmse = rmse_evaluator.evaluate(scored)
        r2 = r2_evaluator.evaluate(scored)
    finally:
        scored.unpersist()

    # The plain and the polynomial pipelines both end in a LinearRegression
    # stage, so the class name alone prints two indistinguishable
    # "LinearRegression" rows. A LinearRegression preceded by more than the
    # assembler stage is the polynomial pipeline; give it its own label.
    stages = model.getStages()
    if isinstance(stages[-1], LinearRegression) and len(stages) > 2:
        model_name = "Polynomial Linear Regression"
    else:
        model_name = type(stages[-1]).__name__
    print(f"{model_name} - Root Mean Squared Error (RMSE): {rmse}, R2: {r2}")

# Stop the Spark session
spark.stop()


RandomForestRegressor - Root Mean Squared Error (RMSE): 20.4463633834094, R2: 0.734799143742128
LinearRegression - Root Mean Squared Error (RMSE): 14.094089967195988, R2: 0.8739865648412362
GBTRegressor - Root Mean Squared Error (RMSE): 15.991982332136542, R2: 0.8377639460490582
LinearRegression - Root Mean Squared Error (RMSE): 14.072926181899728, R2: 0.8743647260362339
