In [None]:
# Mount Google Drive at /content/drive; the flights CSV is later read from
# /content/drive/MyDrive/Data/flights.csv.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# innstall java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz

# unzip the spark file to the current folder
!tar xf spark-3.0.0-bin-hadoop3.2.tgz

# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"


# install findspark using pip
!pip install -q findspark

import findspark
findspark.init()
findspark.find()

'/content/spark-3.0.0-bin-hadoop3.2'

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session named "Colab" with its UI on port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# Display the session summary as the cell output
spark

## Loading Data

In [None]:
# Load the flights CSV from the mounted Drive: the first row is the header,
# column types are inferred, and the literal string 'NA' is read as null.
flights = spark.read.csv(
    '/content/drive/MyDrive/Data/flights.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA',
)

# Report how many rows were loaded
n_records = flights.count()
print("The data contain %d records." % n_records)

# Preview the first five rows
flights.show(5)

# List each column together with its inferred type
print(flights.dtypes)

The data contain 50000 records.
+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows

[('mon', 'int'), ('dom', 'int'), ('dow', 'int'), ('carrier', 'string'), ('flight', 'int'), ('org', 'string'), ('mile', 'int'), ('depart', 'double'), ('duration', 'int'), ('delay', 'int')]


### Encoding flight origin

The org column in the flights data is a categorical variable giving the airport from which a flight departs.

* ORD — O'Hare International Airport (Chicago)
* SFO — San Francisco International Airport
* JFK — John F Kennedy International Airport (New York)
* LGA — La Guardia Airport (New York)
* SMF — Sacramento
* SJC — San Jose
* TUS — Tucson International Airport
* OGG — Kahului (Hawaii)

The data are in a variable called flights. The cell below first uses a string indexer to create a column of indexed values corresponding to the strings in org, and then one-hot encodes that column.

In [None]:
from pyspark.ml.feature import StringIndexer
# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoder

# Start from a frame without the derived columns so this cell can be re-run:
# without this, a second run fails because 'org_idx' / 'org_dummy' already
# exist on `flights`. Dropping a column that is not present is a no-op.
flights_base = flights.drop('org_idx', 'org_dummy')

# Index the categorical origin-airport column 'org' into numeric 'org_idx'
org_indexer = StringIndexer(inputCol='org', outputCol='org_idx').fit(flights_base)
flights = org_indexer.transform(flights_base)

flights.show(5)

# Create an instance of the one hot encoder
onehot = OneHotEncoder(inputCols=['org_idx'], outputCols=['org_dummy'])

# Fit the encoder and apply it to the flights data
# (`onehot` is rebound to the fitted model, as before)
onehot = onehot.fit(flights)
flights = onehot.transform(flights)

# Check the results: one row per airport, ordered by its index
flights.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

+---+---+---+-------+------+---+----+------+--------+-----+-------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|org_idx|
+---+---+---+-------+------+---+----+------+--------+-----+-------+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|    2.0|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|    0.0|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|    1.0|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|    0.0|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|    0.0|
+---+---+---+-------+------+---+----+------+--------+-----+-------+
only showing top 5 rows

+---+-------+-------------+
|org|org_idx|    org_dummy|
+---+-------+-------------+
|ORD|    0.0|(7,[0],[1.0])|
|SFO|    1.0|(7,[1],[1.0])|
|JFK|    2.0|(7,[2],[1.0])|
|LGA|    3.0|(7,[3],[1.0])|
|SJC|    4.0|(7,[4],[1.0])|
|SMF|    5.0|(7,[5],[1.0])|
|TUS|    6.0|(7,[6],[1.0])|
|OGG|    7.0|    (7,[],[])|
+---+-------+-------------+



## Feature Manipulation & Linear Regression

In [None]:
# Spark's column-wise round (this name shadows Python's built-in round
# for the rest of the notebook; later cells rely on it)
from pyspark.sql.functions import round
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Reload the raw flights data from the CSV file
flights = spark.read.csv(
    '/content/drive/MyDrive/Data/flights.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA',
)

# Replace the 'mile' column with 'km' (distance rounded to 0 decimal places)
km_col = round(flights.mile * 1.60934, 0)
flights = flights.withColumn('km', km_col).drop('mile')

# Pack the single predictor 'km' into the 'features' vector column
assembler = VectorAssembler(inputCols=['km'], outputCol='features')
flights = assembler.transform(flights)

# 80:20 train/test split with a fixed seed
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=17)
flights_train.show(5)

# Fit a linear regression of 'duration' on the training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Predict on the test data and preview label vs. prediction (untruncated)
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, False)

# Calculate the RMSE on the test predictions (displayed as the cell output)
RegressionEvaluator(labelCol='duration').evaluate(predictions)

+---+---+---+-------+------+---+------+--------+-----+------+--------+
|mon|dom|dow|carrier|flight|org|depart|duration|delay|    km|features|
+---+---+---+-------+------+---+------+--------+-----+------+--------+
|  0|  1|  2|     AA|    59|JFK|   7.0|     385|  -16|4162.0|[4162.0]|
|  0|  1|  2|     AA|    73|ORD|  9.08|     560|   39|6828.0|[6828.0]|
|  0|  1|  2|     AA|   150|SFO| 23.42|     325|   22|4352.0|[4352.0]|
|  0|  1|  2|     AA|   154|ORD| 17.25|     135|   49|1395.0|[1395.0]|
|  0|  1|  2|     AA|   181|JFK|  17.0|     379|  -10|3983.0|[3983.0]|
+---+---+---+-------+------+---+------+--------+-----+------+--------+
only showing top 5 rows

+--------+------------------+
|duration|prediction        |
+--------+------------------+
|370     |345.9288489263871 |
|310     |347.29222263984207|
|115     |133.62126454782072|
|80      |85.29724070425121 |
|240     |213.53011275309635|
+--------+------------------+
only showing top 5 rows



17.07662043948225

## Interpreting Simple Model

In [None]:
# Pull the fitted parameters out of the simple km -> duration model.
inter = regression.intercept        # intercept (average minutes on ground)
coefs = regression.coefficients     # coefficient vector (a single entry, for 'km')
minutes_per_km = coefs[0]           # average minutes per km
avg_speed = 60 / minutes_per_km     # average speed in km per hour

# Print the four quantities, one per line, in the order computed above
for value in (inter, coefs, minutes_per_km, avg_speed):
    print(value)

44.24454333244136
[0.07574298408082997]
0.07574298408082997
792.152576613173


## Multiple Linear Regression

In [None]:
# Imports repeated here so the cell does not depend on which earlier cells ran.
# Note: this `round` is Spark's column function, not Python's built-in.
from pyspark.ml.feature import Bucketizer, OneHotEncoder, StringIndexer
from pyspark.sql.functions import round

# Read data from CSV file
flights = spark.read.csv('/content/drive/MyDrive/Data/flights.csv',
                         sep=',',
                         header=True,
                         inferSchema=True,
                         nullValue='NA')

########## DEPART ############
# Create buckets at 3 hour intervals through the day
buckets = Bucketizer(splits=[0, 3, 6, 9, 12, 15, 18, 21, 24], inputCol='depart', outputCol='depart_bucket')

# Bucket the departure times
bucketed = buckets.transform(flights)

# One-hot encode the bucketed departure times
onehot = OneHotEncoder(inputCols=['depart_bucket'], outputCols=['depart_dummy'])
flights = onehot.fit(bucketed).transform(bucketed)

########## MILE ############

# Convert 'mile' to 'km' and drop 'mile' column
flights = flights.withColumn('km', round(flights.mile * 1.60934, 0)).drop('mile')

########## ORG / MON / DOW ############

# Index and one-hot encode each remaining categorical column in turn, adding
# '<col>_idx' and '<col>_dummy'. The processing order fixes the column order of
# the resulting frame. The indices come from StringIndexer, so for 'mon' and
# 'dow' they are not the original month/day numbers.
for cat_col in ['org', 'mon', 'dow']:
    idx_col = cat_col + '_idx'
    dummy_col = cat_col + '_dummy'

    flights = StringIndexer(inputCol=cat_col, outputCol=idx_col).fit(flights).transform(flights)

    # `onehot` ends up bound to the last fitted encoder model (for 'dow')
    onehot = OneHotEncoder(inputCols=[idx_col], outputCols=[dummy_col]).fit(flights)
    flights = onehot.transform(flights)

# Check the results
flights.show(5)

+---+---+---+-------+------+---+------+--------+-----+-------------+-------------+------+-------+-------------+-------+---------------+-------+-------------+
|mon|dom|dow|carrier|flight|org|depart|duration|delay|depart_bucket| depart_dummy|    km|org_idx|    org_dummy|mon_idx|      mon_dummy|dow_idx|    dow_dummy|
+---+---+---+-------+------+---+------+--------+-----+-------------+-------------+------+-------+-------------+-------+---------------+-------+-------------+
| 11| 20|  6|     US|    19|JFK|  9.48|     351| null|          3.0|(7,[3],[1.0])|3465.0|    2.0|(7,[2],[1.0])|    6.0| (11,[6],[1.0])|    6.0|    (6,[],[])|
|  0| 22|  2|     UA|  1107|ORD| 16.33|      82|   30|          5.0|(7,[5],[1.0])| 509.0|    0.0|(7,[0],[1.0])|    1.0| (11,[1],[1.0])|    2.0|(6,[2],[1.0])|
|  2| 20|  4|     UA|   226|SFO|  6.17|      82|   -8|          2.0|(7,[2],[1.0])| 542.0|    1.0|(7,[1],[1.0])|    4.0| (11,[4],[1.0])|    3.0|(6,[3],[1.0])|
|  9| 13|  1|     AA|   419|ORD| 10.33|     195|   -

## Multiple Linear Regression : Train Test Split

In [None]:
# Create an assembler object combining distance and the one-hot encoded columns
assembler = VectorAssembler(inputCols=['km','org_dummy','depart_dummy','dow_dummy','mon_dummy'], outputCol='features')

# Consolidate predictor columns. Any 'features' column left by a previous run
# of this cell is dropped first, so re-executing the cell does not fail with
# "output column already exists"; the drop is a no-op on the first run.
flights = assembler.transform(flights.drop('features'))

# Split into training and testing sets in a 80:20 ratio
flights_train, flights_test = flights.randomSplit([0.8,0.2], seed = 17)

flights_train.show(5)

+---+---+---+-------+------+---+------+--------+-----+-------------+-------------+------+-------+-------------+-------+--------------+-------+-------------+--------------------+
|mon|dom|dow|carrier|flight|org|depart|duration|delay|depart_bucket| depart_dummy|    km|org_idx|    org_dummy|mon_idx|     mon_dummy|dow_idx|    dow_dummy|            features|
+---+---+---+-------+------+---+------+--------+-----+-------------+-------------+------+-------+-------------+-------+--------------+-------+-------------+--------------------+
|  0|  1|  2|     AA|    59|JFK|   7.0|     385|  -16|          2.0|(7,[2],[1.0])|4162.0|    2.0|(7,[2],[1.0])|    1.0|(11,[1],[1.0])|    2.0|(6,[2],[1.0])|(32,[0,3,10,17,22...|
|  0|  1|  2|     AA|    73|ORD|  9.08|     560|   39|          3.0|(7,[3],[1.0])|6828.0|    0.0|(7,[0],[1.0])|    1.0|(11,[1],[1.0])|    2.0|(6,[2],[1.0])|(32,[0,1,11,17,22...|
|  0|  1|  2|     AA|   150|SFO| 23.42|     325|   22|          7.0|    (7,[],[])|4352.0|    1.0|(7,[1],[1.0])

## Multiple Linear Regression : Evaluating Model

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Train a linear regression for 'duration' on the training split
lr = LinearRegression(labelCol='duration')
regression = lr.fit(flights_train)

# Score the held-out test split
predictions = regression.transform(flights_test)

# Test-set RMSE
rmse_evaluator = RegressionEvaluator(labelCol='duration')
rmse = rmse_evaluator.evaluate(predictions)
print("The test RMSE is", rmse)

# Fitted coefficients, one per entry of the 'features' vector
coeffs = regression.coefficients
print(coeffs)

The test RMSE is 10.740502378624523
[0.0744284401685256,26.996312105104003,19.94020032553227,51.76183943628122,45.45584247148976,17.25480651546479,14.855117875045766,16.800119688054526,-15.516868438024465,0.9376712941270354,3.9209465544338573,6.800981397048632,4.536562586593437,8.743745607877418,8.551832715240087,0.1399407485836122,0.08227287343586619,-0.12619923034732058,0.35431900572898783,0.4297011162497779,0.27432770544091817,-3.314805768755545,-1.1507984385493892,-3.6063230364191026,-1.4785231963021643,-1.353519806369124,-3.4516962545128136,0.8448188549985467,-2.724930340267113,-3.2054982432226864,-3.2564766790851403,-1.9150868176271394]


## Multiple Linear Regression : Regularization

Use Lasso regression (regularized with an L1 penalty) to create a more parsimonious model. Many of the coefficients in the resulting model will be set to zero. This means that only a subset of the predictors actually contribute to the model. Despite the simpler model, it still produces a good RMSE on the testing data.

You'll use a specific value for the regularization strength. Later you'll learn how to find the best value using cross validation.

The data (same as previous exercise) are available as flights, randomly split into flights_train and flights_test.

For **Ridge**

* LinearRegression(labelCol='consumption', elasticNetParam=0, regParam=0.1)

For **Lasso**

* LinearRegression(labelCol='consumption', elasticNetParam=1, regParam=0.1)

In [None]:
# Lasso: elasticNetParam=1 with regularization strength regParam=1
lasso = LinearRegression(labelCol='duration', regParam=1, elasticNetParam=1)
regression = lasso.fit(flights_train)

# Test-set RMSE of the regularized model
lasso_predictions = regression.transform(flights_test)
rmse = RegressionEvaluator(labelCol='duration').evaluate(lasso_predictions)
print("The test RMSE is", rmse)

# Fitted coefficients of the regularized model
coeffs = regression.coefficients
print(coeffs)


The test RMSE is 11.749089740115453
[0.07356941786680918,5.454471650523119,0.0,29.166723897433588,21.805904602903258,0.0,-2.340766656658043,0.0,0.0,0.0,0.0,0.0,0.0,1.1309839788797809,1.0811408456785048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]


## Multiple Linear Regression Pipeline

In [None]:
# Reload the raw flights data from the CSV file
flights = spark.read.csv(
    '/content/drive/MyDrive/Data/flights.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA',
)

# Replace the 'mile' column with 'km' (distance rounded to 0 decimal places).
# `round` here is the Spark column function imported in an earlier cell.
km_col = round(flights.mile * 1.60934, 0)
flights = flights.withColumn('km', km_col).drop('mile')

# 80:20 train/test split with a fixed seed
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=17)

In [None]:
# Evaluator that scores predictions against the 'duration' label; it is passed
# to the cross-validator and used to score the best model on the test split.
evaluator = RegressionEvaluator(labelCol = 'duration')

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Convert categorical strings to index values
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode index values. 'dow' is already an integer column, so it is
# passed to the encoder directly without a string indexer.
onehot = OneHotEncoder(
    inputCols=['org_idx', 'dow'],
    outputCols=['org_dummy', 'dow_dummy']
)

# Assemble predictors into a single column
assembler = VectorAssembler(inputCols=['km', 'org_dummy', 'dow_dummy'], outputCol='features')

# A linear regression object
regression = LinearRegression(featuresCol='features', labelCol='duration')

# Construct a pipeline so every stage is re-fit inside each cross-validation fold
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Parameter grid: 4 regularization strengths x 3 elastic-net mixing values
params = (
    ParamGridBuilder()
    .addGrid(regression.regParam, [0.01, 0.1, 1.0, 10.0])
    .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)
print('Number of models to be tested: ', len(params))

# Create cross-validator. The seed pins the fold assignment so the selected
# model and its metrics are reproducible across runs; it matches the seed used
# for the train/test splits.
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=params, evaluator=evaluator, numFolds=5, seed=17)

# Train and test model on multiple folds of the training data
# (`cv` is rebound to the fitted cross-validator model, as before)
cv = cv.fit(flights_train)

Number of models to be tested:  12


In [None]:
# Best pipeline found by cross validation
best_model = cv.bestModel

# Show the fitted stages that make up the best pipeline
print(best_model.stages)

# Score the best pipeline on the held-out test split (displayed as the cell output)
test_predictions = best_model.transform(flights_test)
evaluator.evaluate(test_predictions)

[StringIndexerModel: uid=StringIndexer_a8956839cb39, handleInvalid=error, OneHotEncoderModel: uid=OneHotEncoder_9a48040dddcc, dropLast=true, handleInvalid=error, numInputCols=2, numOutputCols=2, VectorAssembler_74f429109db2, LinearRegressionModel: uid=LinearRegression_a5660f6c037d, numFeatures=14]


11.163705452721718