In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from pyspark.sql import SparkSession

In [None]:
# Start (or reuse) a local Spark session named 'project' using 4 local threads.
spark = (
    SparkSession.builder
    .master('local[4]')
    .appName('project')
    .getOrCreate()
)

In [None]:
# Import the necessary libraries
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Read the regression dataset from Drive: the first row supplies the column
# names and the column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/drive/MyDrive/Project_knowit/finalRegression.csv")
)


In [None]:
# Show the DataFrame's column names and inferred types.
df

DataFrame[_c0: int, year: int, month: int, airport: string, arr_del15: double, carrier_ct: double, weather_ct: double, nas_ct: double, security_ct: double, late_aircraft_ct: double, arr_diverted: double, carrier_delay: double, weather_delay: double, nas_delay: double, security_delay: double, late_aircraft_delay: double, arr_delay_scaled: double]

In [None]:
# Print a tabular preview of the loaded rows.
df.show()

+---+----+-----+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|year|month|airport|           arr_del15|          carrier_ct|          weather_ct|              nas_ct|         security_ct|    late_aircraft_ct|        arr_diverted|       carrier_delay|       weather_delay|           nas_delay|      security_delay| late_aircraft_delay|    arr_delay_scaled|
+---+----+-----+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|2022|   11|    ABY| -0.3317854945166775|-0.37592621851215446|-0.17641855560199077| -0.261866990041179

In [None]:
# Candidate feature names: every column except the last one
# (`arr_delay_scaled`, the target). No data is split here.
# NOTE(review): this list still contains `_c0` (looks like a row index) and
# `airport` (a string column), and the modelling cells visible in this
# notebook use `required_features` instead of this variable.
feature_cols = df.columns[:-1]

In [None]:
# Inspect the collected column names.
feature_cols

['_c0',
 'year',
 'month',
 'airport',
 'arr_del15',
 'carrier_ct',
 'weather_ct',
 'nas_ct',
 'security_ct',
 'late_aircraft_ct',
 'arr_diverted',
 'carrier_delay',
 'weather_delay',
 'nas_delay',
 'security_delay',
 'late_aircraft_delay']

In [None]:
# Number of rows in the dataset.
df.count()

78439

In [None]:
# Preview the first 5 rows.
df.show(5)

+---+----+-----+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|year|month|airport|           arr_del15|          carrier_ct|          weather_ct|              nas_ct|         security_ct|    late_aircraft_ct|        arr_diverted|       carrier_delay|       weather_delay|           nas_delay|      security_delay| late_aircraft_delay|    arr_delay_scaled|
+---+----+-----+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|2022|   11|    ABY| -0.3317854945166775|-0.37592621851215446|-0.17641855560199077| -0.261866990041179

In [None]:
# Print per-column summary statistics.
df.summary().show()

+-------+------------------+------------------+------------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|summary|               _c0|              year|             month|airport|           arr_del15|          carrier_ct|          weather_ct|              nas_ct|         security_ct|    late_aircraft_ct|        arr_diverted|       carrier_delay|       weather_delay|           nas_delay|      security_delay| late_aircraft_delay|    arr_delay_scaled|
+-------+------------------+------------------+------------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------

In [None]:
# Collect the whole Spark DataFrame into a pandas DataFrame on the driver so
# it can be inspected with pandas tooling.
pdf = df.toPandas()

In [None]:
# Column dtypes and non-null counts of the pandas copy.
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78439 entries, 0 to 78438
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   _c0                  78439 non-null  int32  
 1   year                 78439 non-null  int32  
 2   month                78439 non-null  int32  
 3   airport              78439 non-null  object 
 4   arr_del15            78439 non-null  float64
 5   carrier_ct           78439 non-null  float64
 6   weather_ct           78439 non-null  float64
 7   nas_ct               78439 non-null  float64
 8   security_ct          78439 non-null  float64
 9   late_aircraft_ct     78439 non-null  float64
 10  arr_diverted         78439 non-null  float64
 11  carrier_delay        78439 non-null  float64
 12  weather_delay        78439 non-null  float64
 13  nas_delay            78439 non-null  float64
 14  security_delay       78439 non-null  float64
 15  late_aircraft_delay  78439 non-null 

In [None]:
# Frequency of each distinct value of the target column.
pdf['arr_delay_scaled'].value_counts()

-0.313633    4505
-0.312277     184
-0.312096     171
-0.312186     157
-0.312005     136
             ... 
 1.111773       1
 0.007052       1
 1.828906       1
 1.934414       1
 7.617141       1
Name: arr_delay_scaled, Length: 12720, dtype: int64

In [None]:
# Count missing values per column in the pandas copy.
pdf.isnull().sum()

_c0                    0
year                   0
month                  0
airport                0
arr_del15              0
carrier_ct             0
weather_ct             0
nas_ct                 0
security_ct            0
late_aircraft_ct       0
arr_diverted           0
carrier_delay          0
weather_delay          0
nas_delay              0
security_delay         0
late_aircraft_delay    0
arr_delay_scaled       0
dtype: int64

In [None]:
from pyspark.sql.functions import isnull,when,count,col

In [None]:
# Null check on the Spark side: for every column, count the rows whose value
# is null. when() produces a non-null marker only for null rows, and count()
# counts the non-null markers.
df.select(
    *(count(when(isnull(name), name)).alias(name) for name in df.columns)
).show()

+---+----+-----+-------+---------+----------+----------+------+-----------+----------------+------------+-------------+-------------+---------+--------------+-------------------+----------------+
|_c0|year|month|airport|arr_del15|carrier_ct|weather_ct|nas_ct|security_ct|late_aircraft_ct|arr_diverted|carrier_delay|weather_delay|nas_delay|security_delay|late_aircraft_delay|arr_delay_scaled|
+---+----+-----+-------+---------+----------+----------+------+-----------+----------------+------------+-------------+-------------+---------+--------------+-------------------+----------------+
|  0|   0|    0|      0|        0|         0|         0|     0|          0|               0|           0|            0|            0|        0|             0|                  0|               0|
+---+----+-----+-------+---------+----------+----------+------+-----------+----------------+------------+-------------+-------------+---------+--------------+-------------------+----------------+



In [None]:
# Numeric columns fed to the model; `_c0` and the string column `airport`
# are left out.
# NOTE(review): the five *_delay columns presumably add up to the arrival
# delay that `arr_delay_scaled` is derived from. If so, this is target leakage
# and would explain the near-perfect R2 reported at the end of the notebook.
# Confirm, and consider dropping them.
required_features = [
    'year',
    'month',
    'arr_del15',
    'carrier_ct',
    'weather_ct',
    'nas_ct',
    'security_ct',
    'late_aircraft_ct',
    'arr_diverted',
    'carrier_delay',
    'weather_delay',
    'nas_delay',
    'security_delay',
    'late_aircraft_delay',
]

In [None]:
# Display the selected feature columns from the pandas copy.
pdf[required_features]

Unnamed: 0,year,month,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,late_aircraft_ct,arr_diverted,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2022,11,-0.331785,-0.375926,-0.176419,-0.261867,-0.204229,-0.302070,-0.217510,-0.319421,-0.218920,-0.232233,-0.163283,-0.291977
1,2022,11,-0.311897,-0.338844,-0.282602,-0.248319,-0.204229,-0.279732,-0.217510,-0.275219,-0.250517,-0.217853,-0.163283,-0.282159
2,2022,11,-0.338415,-0.375717,-0.282602,-0.286970,-0.204229,-0.284927,-0.217510,-0.319172,-0.250517,-0.242053,-0.163283,-0.265262
3,2022,11,-0.245603,-0.268870,-0.282602,-0.233377,-0.204229,-0.177390,-0.217510,-0.236480,-0.250517,-0.220308,-0.163283,-0.092414
4,2022,11,0.410709,0.224722,0.041687,0.455564,-0.204229,0.491035,0.090656,1.034204,-0.094964,0.133576,-0.163283,0.672502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78434,2019,1,0.105756,-0.122007,-0.014274,0.071048,-0.204229,0.322024,-0.217510,-0.169680,-0.012326,0.031514,-0.163283,0.292327
78435,2019,1,-0.252233,-0.318522,-0.183593,-0.237959,-0.204229,-0.161632,0.090656,-0.239211,-0.212844,-0.202772,-0.163283,-0.232611
78436,2019,1,-0.172680,-0.278088,0.041687,-0.051678,-0.204229,-0.176351,-0.217510,-0.241695,-0.130207,-0.072301,-0.163283,-0.209092
78437,2019,1,-0.113015,-0.232835,0.710355,-0.057854,-0.204229,-0.133232,-0.217510,-0.201714,0.518742,-0.058623,-0.163283,-0.137396


In [None]:

# Combine the selected feature columns into a single vector column, 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=required_features)

In [None]:
# Append the assembled 'features' vector column to every row of the dataset.
transformed_data=assembler.transform(df)

In [None]:
# Preview the first 5 rows, now including the 'features' column.
transformed_data.show(5)

+---+----+-----+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|year|month|airport|           arr_del15|          carrier_ct|          weather_ct|              nas_ct|         security_ct|    late_aircraft_ct|        arr_diverted|       carrier_delay|       weather_delay|           nas_delay|      security_delay| late_aircraft_delay|    arr_delay_scaled|            features|
+---+----+-----+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|2022|   11|    ABY| -0.331785494516677

In [None]:
# Randomly split the rows into roughly 75% training and 25% test data; the
# fixed seed keeps the split repeatable.
training_data, test_data = transformed_data.randomSplit(weights=[0.75, 0.25], seed=0)

In [None]:
# Linear regression estimator that predicts `arr_delay_scaled` from the
# assembled `features` vector; no other settings are overridden here.
reg_model = LinearRegression(
    labelCol="arr_delay_scaled",
    featuresCol="features",
)


In [None]:
# Train the model using the training set.
# fit() returns the fitted model; `reg_model` itself remains the estimator.
model = reg_model.fit(training_data)


In [None]:
# Make predictions using the testing set: the result is the test rows with an
# added `prediction` column.
predictions_test = model.transform(test_data)

In [None]:
# Make predictions using the training set, so train and test error can be
# compared later.
predictions_train = model.transform(training_data)

In [None]:
# Preview the test rows together with their predictions.
predictions_test.show()

+---+----+-----+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|year|month|airport|           arr_del15|          carrier_ct|          weather_ct|              nas_ct|         security_ct|    late_aircraft_ct|        arr_diverted|       carrier_delay|       weather_delay|           nas_delay|      security_delay| late_aircraft_delay|    arr_delay_scaled|            features|          prediction|
+---+----+-----+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--

In [None]:
# Preview the training rows together with their predictions.
predictions_train.show()

+---+----+-----+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|year|month|airport|           arr_del15|          carrier_ct|          weather_ct|              nas_ct|         security_ct|    late_aircraft_ct|        arr_diverted|       carrier_delay|       weather_delay|           nas_delay|      security_delay| late_aircraft_delay|    arr_delay_scaled|            features|          prediction|
+---+----+-----+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--

In [None]:
# Pull only the test-set predictions into pandas for display.
# NOTE(review): this collects every prediction to the driver just to render a
# truncated table; `.show(n)` or `.limit(n)` would be enough for a quick look.
predictions_test.select('prediction').toPandas()

Unnamed: 0,prediction
0,-0.308751
1,-0.284159
2,-0.184165
3,-0.219425
4,0.332347
...,...
19580,-0.298534
19581,-0.309112
19582,0.054156
19583,-0.239587


In [None]:
# Pull only the training-set predictions into pandas for display.
# NOTE(review): this collects every prediction to the driver just to render a
# truncated table; `.show(n)` or `.limit(n)` would be enough for a quick look.
predictions_train.select('prediction').toPandas()

Unnamed: 0,prediction
0,-0.287414
1,-0.302964
2,-0.198812
3,0.669487
4,-0.219877
...,...
58849,0.092761
58850,0.060484
58851,-0.247995
58852,-0.105057


In [None]:
# Evaluator configured for mean squared error (MSE) between the label and the
# model's prediction.
evaluator = RegressionEvaluator(
    metricName="mse",
    labelCol="arr_delay_scaled",
    predictionCol="prediction",
)


In [None]:
# Mean squared error on the held-out test set.
# The metric is passed explicitly so the result does not depend on which
# metric the shared `evaluator` variable was last configured with (it is
# re-assigned several times in this notebook).
mse = evaluator.evaluate(predictions_test, {evaluator.metricName: "mse"})
print('Mean Squared Error:', mse)

Mean Squared Error: 1.0812988825184839e-07


In [None]:
# Evaluator configured for mean absolute error (MAE) between the label and the
# model's prediction.
evaluator = RegressionEvaluator(
    metricName="mae",
    labelCol="arr_delay_scaled",
    predictionCol="prediction",
)


In [None]:
# Mean absolute error on the held-out test set.
# The metric is passed explicitly so the result does not depend on which
# metric the shared `evaluator` variable was last configured with (it is
# re-assigned several times in this notebook).
mae = evaluator.evaluate(predictions_test, {evaluator.metricName: "mae"})
print('Mean Absolute Error:', mae)

Mean Absolute Error: 2.3900823821832687e-06


In [None]:

# Evaluator configured for root mean squared error (RMSE) between the label
# and the model's prediction.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="arr_delay_scaled",
    predictionCol="prediction",
)


In [None]:
# RMSE on the test and training predictions, to compare generalisation error
# against fit error.
# The metric is passed explicitly so the result does not depend on which
# metric the shared `evaluator` variable was last configured with (it is
# re-assigned several times in this notebook).
rmse_test = evaluator.evaluate(predictions_test, {evaluator.metricName: "rmse"})
rmse_train = evaluator.evaluate(predictions_train, {evaluator.metricName: "rmse"})

In [None]:
# Report the train and test RMSE, separated by a divider line.
print("RMSE for train:", rmse_train)
print("----------------------------------")
print("RMSE for test:", rmse_test)

RMSE for train: 7.453354152324654e-06
----------------------------------
RMSE for test: 0.00032883109380326


In [None]:
# Evaluator configured for the R2 (coefficient of determination) score between
# the label and the model's prediction.
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="arr_delay_scaled",
    predictionCol="prediction",
)


In [None]:
# R2 on the test and training predictions.
# The metric is passed explicitly so the result does not depend on which
# metric the shared `evaluator` variable was last configured with (it is
# re-assigned several times in this notebook).
r2_test = evaluator.evaluate(predictions_test, {evaluator.metricName: "r2"})
r2_train = evaluator.evaluate(predictions_train, {evaluator.metricName: "r2"})


In [None]:
# Report the test and train R2 scores, separated by a divider line.
# NOTE(review): scores this close to 1.0 are suspicious; check whether any
# feature is a direct component of the target (target leakage).
print("R2 Score for test:", r2_test)
print("----------------------------------")
print("R2 Score for train:", r2_train)


R2 Score for test: 0.9999998849727723
----------------------------------
R2 Score for train: 0.9999999999455341
