In [1]:
#Local Spark master with 2 worker threads ("local[2]"); use "local[*]" for one worker thread per logical core
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *

# Create the Spark session through the builder so this cell is safe to re-run:
# getOrCreate() returns the session already running in this kernel instead of
# raising "Cannot run multiple SparkContexts at once", which is what a second
# direct SparkContext(...) call does.
# NOTE: when a session already exists it is reused as-is, so the master setting
# below only takes effect for a freshly created session.
spark = SparkSession.builder.master("local[2]").getOrCreate()

# Keep the low-level context available under the name later cells may expect.
sc = spark.sparkContext
import pandas as pd 
import numpy as np 

In [3]:
#Regression related libraries 

from pyspark.sql.types import *
from pyspark.sql.functions import *

from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [6]:
#Data loading
# Read the flights CSV using its header row for column names and letting Spark
# infer the column types, then keep only the columns used further down.
flight_columns = [
    "DayofMonth",
    "DayOfWeek",
    "OriginAirportID",
    "DestAirportID",
    "DepDelay",
    "ArrDelay",
]
csv = spark.read.csv("data/flights.csv", header=True, inferSchema=True)
data = csv.select(*flight_columns)
data.show(5)

+----------+---------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+---------------+-------------+--------+--------+
|        19|        5|          11433|        13303|      -3|       1|
|        19|        5|          14869|        12478|       0|      -8|
|        19|        5|          14057|        14869|      -4|     -15|
|        19|        5|          15016|        11433|      28|      24|
|        19|        5|          11193|        12892|      -6|     -11|
+----------+---------+---------------+-------------+--------+--------+
only showing top 5 rows



In [11]:
#Splitting
# Randomly split the rows into roughly 70% training / 30% test data.
# randomSplit samples a fresh split on every execution unless a seed is given,
# which makes the counts, the fitted model and the predictions change between
# runs. A fixed seed keeps the split repeatable for the same input data.
SPLIT_SEED = 42
splits = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train, test = splits
print("Train: ",train.count(),"Test: ",test.count())

Train:  1892133 Test:  810085


In [13]:
#Training transformation
# Pack the predictor columns into a single vector column named "features" and
# expose ArrDelay, cast to integer, as the regression target column "label".
feature_columns = ["DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", "DepDelay"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

label_column = col("ArrDelay").cast("Int").alias("label")
training = assembler.transform(train).select(col("features"), label_column)
training.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,1.0,10140.0,...|  -11|
|[1.0,1.0,10140.0,...|  -18|
|[1.0,1.0,10140.0,...|  -17|
|[1.0,1.0,10140.0,...|   -9|
|[1.0,1.0,10140.0,...|    4|
|[1.0,1.0,10140.0,...|   94|
|[1.0,1.0,10140.0,...|  -23|
|[1.0,1.0,10140.0,...|  -14|
|[1.0,1.0,10140.0,...|  -12|
|[1.0,1.0,10140.0,...|   -6|
|[1.0,1.0,10140.0,...|  -10|
|[1.0,1.0,10140.0,...|    5|
|[1.0,1.0,10140.0,...|   14|
|[1.0,1.0,10140.0,...|   -9|
|[1.0,1.0,10140.0,...|   -5|
|[1.0,1.0,10140.0,...|   -6|
|[1.0,1.0,10140.0,...|    6|
|[1.0,1.0,10140.0,...|   13|
|[1.0,1.0,10140.0,...|   19|
|[1.0,1.0,10140.0,...|   38|
+--------------------+-----+
only showing top 20 rows



In [14]:
#Testing transformation
# Run the test rows through the same assembler; the observed ArrDelay, cast to
# integer, is kept under the name "trueLabel".
true_label_column = col("ArrDelay").cast("Int").alias("trueLabel")
testing = assembler.transform(test).select(col("features"), true_label_column)
testing.show()

+--------------------+---------+
|            features|trueLabel|
+--------------------+---------+
|[1.0,1.0,10140.0,...|      -12|
|[1.0,1.0,10140.0,...|       -9|
|[1.0,1.0,10140.0,...|      -14|
|[1.0,1.0,10140.0,...|      -11|
|[1.0,1.0,10140.0,...|      -11|
|[1.0,1.0,10140.0,...|      -12|
|[1.0,1.0,10140.0,...|       19|
|[1.0,1.0,10140.0,...|       23|
|[1.0,1.0,10140.0,...|       41|
|[1.0,1.0,10140.0,...|       -6|
|[1.0,1.0,10140.0,...|       -8|
|[1.0,1.0,10140.0,...|       -5|
|[1.0,1.0,10140.0,...|       -1|
|[1.0,1.0,10140.0,...|        2|
|[1.0,1.0,10140.0,...|       38|
|[1.0,1.0,10140.0,...|      -13|
|[1.0,1.0,10140.0,...|       -1|
|[1.0,1.0,10140.0,...|      -10|
|[1.0,1.0,10140.0,...|      -23|
|[1.0,1.0,10140.0,...|      106|
+--------------------+---------+
only showing top 20 rows



In [16]:
# Configure a linear regression that reads the "features" vector and the
# "label" target, with at most 10 iterations and a regularization
# parameter of 0.3, then fit it on the training DataFrame.
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
)
model = lr.fit(training)
print("Model trained!")

Model trained!


In [17]:
# Apply the fitted model to the test DataFrame, then show the model's
# "prediction" column next to the features and the observed "trueLabel".
prediction = model.transform(testing)

comparison_columns = ["features", "prediction", "trueLabel"]
predicted = prediction.select(*comparison_columns)
predicted.show()

+--------------------+-------------------+---------+
|            features|         prediction|trueLabel|
+--------------------+-------------------+---------+
|[1.0,1.0,10140.0,...|-3.5644858296428996|      -12|
|[1.0,1.0,10140.0,...|  4.313355516359376|       -9|
|[1.0,1.0,10140.0,...| -8.744321912223267|      -14|
|[1.0,1.0,10140.0,...| -6.750766069439763|      -11|
|[1.0,1.0,10140.0,...| -4.757210226656259|      -11|
|[1.0,1.0,10140.0,...| -3.760432305264507|      -12|
|[1.0,1.0,10140.0,...| 17.171904043962286|       19|
|[1.0,1.0,10140.0,...| 17.171904043962286|       23|
|[1.0,1.0,10140.0,...| 31.126794943446818|       41|
|[1.0,1.0,10140.0,...|-11.742157107599814|       -6|
|[1.0,1.0,10140.0,...| -7.755045422032806|       -8|
|[1.0,1.0,10140.0,...| -3.767933736465798|       -5|
|[1.0,1.0,10140.0,...|-1.7743778936822934|       -1|
|[1.0,1.0,10140.0,...|-0.7775999722905413|        2|
|[1.0,1.0,10140.0,...| 37.099961040596035|       38|
|[1.0,1.0,10140.0,...|-13.737076846965373|    