In [1]:
# !apt update
# !apt-get install openjdk-11-jdk-headless -qq > /dev/null
# !wget -q http://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
# !tar -xvf spark-3.3.0-bin-hadoop3.tgz
# !pip install -q findspark
# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.3.0-bin-hadoop3"
import findspark
findspark.init()

# Demo Pipeline Linear Regression

### Dataset: flights.csv
- You'll build a regression model to predict flight duration
- With dow, org, mile as a predictor

First thing to do is start a Spark Session

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# %cd '/content/gdrive/My Drive/LDS9/Practice/Chapter7/'

/content/gdrive/My Drive/LDS9/Practice/Chapter7


In [4]:
import pyspark

In [5]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

In [6]:
spark = SparkSession.builder.appName('lr_demo').getOrCreate()

In [7]:
# Use Spark to read in the Ecommerce Customers csv file.
data = spark.read.csv("flights.csv",inferSchema=True,header=True)

In [8]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [9]:
data.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



In [10]:
data.head()

Row(mon=11, dom=20, dow=6, carrier='US', flight=19, org='JFK', mile=2153, depart=9.48, duration=351, delay='NA')

In [11]:
data.count()

50000

In [12]:
# Remove the 'flight' column
data = data.drop('flight')

In [13]:
# Number of records with missing 'delay' values
data.filter('delay IS NULL').count()

0

In [14]:
# Remove records with missing 'delay' values
data = data.filter('delay IS NOT NULL')

In [15]:
# Remove records with missing values in any column and get the number of remaining rows
data = data.na.drop()
data.count()

50000

In [16]:
# Import the required function
from pyspark.sql.functions import round

In [17]:
# Convert 'mile' to 'km' and drop 'mile' column
data = data.withColumn('km', round(data.mile * 1.60934, 0))

In [18]:
final_data = data[["dow", "org", "km", "duration"]]
final_data.count()

50000

In [19]:
final_data = final_data.na.drop()
final_data.show(5)

+---+---+------+--------+
|dow|org|    km|duration|
+---+---+------+--------+
|  6|JFK|3465.0|     351|
|  2|ORD| 509.0|      82|
|  4|SFO| 542.0|      82|
|  1|ORD|1989.0|     195|
|  5|ORD| 415.0|      65|
+---+---+------+--------+
only showing top 5 rows



In [20]:
final_data.count()

50000

# Thực hiện Pipeline
- ...

In [21]:
train_data,test_data = final_data.randomSplit([0.8,0.2])

In [32]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.regression import LinearRegression


In [29]:
# Create an indexer
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

onehot1 = OneHotEncoder(inputCol="org_idx", outputCol="org_vec")
onehot2 = OneHotEncoder(inputCol="dow", outputCol="dow_vec")

assembler = VectorAssembler(inputCols=['km', 'org_vec', 'dow_vec'],
                            outputCol='features')

regression = LinearRegression(featuresCol='features',
                              labelCol='duration',
                              predictionCol='prediction')

In [33]:
pipeline_m = Pipeline(stages = [indexer,onehot1,onehot2,assembler,regression]) 

In [34]:
pipeline = pipeline_m.fit(train_data)

In [35]:
predictions = pipeline.transform(test_data) 

In [36]:
predictions.select('prediction','duration').show(5)

+-----------------+--------+
|       prediction|duration|
+-----------------+--------+
|79.88943132599744|      67|
|79.88943132599744|      87|
|79.88943132599744|      95|
|81.37595366057552|      58|
|81.37595366057552|      88|
+-----------------+--------+
only showing top 5 rows



In [37]:
from pyspark.ml.evaluation import RegressionEvaluator

In [38]:
eval = RegressionEvaluator(labelCol='duration', predictionCol='prediction')

In [39]:
eval.evaluate(predictions, {eval.metricName: 'rmse'})

11.41778763496601

In [40]:
eval.evaluate(predictions, {eval.metricName: 'r2'})

0.9831784331672464

In [41]:
lrModel = pipeline.stages[-1]

In [42]:
lrModel.summary.rootMeanSquaredError

11.16770162877805

In [43]:
lrModel.summary.r2

0.9834434185689179

In [45]:
pipeline.save('Pipeline_flight_50k')

Py4JJavaError: An error occurred while calling o693.save.
: java.io.IOException: Path Pipeline_flight_50k already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.super$save(Pipeline.scala:344)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$4(Pipeline.scala:344)
	at org.apache.spark.ml.MLEvents.withSaveInstanceEvent(events.scala:174)
	at org.apache.spark.ml.MLEvents.withSaveInstanceEvent$(events.scala:169)
	at org.apache.spark.ml.util.Instrumentation.withSaveInstanceEvent(Instrumentation.scala:42)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$3(Pipeline.scala:344)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.$anonfun$save$3$adapted(Pipeline.scala:344)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.PipelineModel$PipelineModelWriter.save(Pipeline.scala:344)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)


In [46]:
from pyspark.ml import PipelineModel

In [47]:
pipeline2 = PipelineModel.load('Pipeline_flight_50k')

In [49]:
unlabeled_data = test_data.drop('label')

In [50]:
predictions2 = pipeline2.transform(unlabeled_data)

In [51]:
predictions2.select('prediction','duration').show(5)

+-----------------+--------+
|       prediction|duration|
+-----------------+--------+
|79.88943132599744|      67|
|79.88943132599744|      87|
|79.88943132599744|      95|
|81.37595366057552|      58|
|81.37595366057552|      88|
+-----------------+--------+
only showing top 5 rows

