In [2]:
from mleap import pyspark

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import *

In [3]:
from mleap.pyspark.spark_support import SimpleSparkSerializer

In [4]:
# Relative path to the input CSV that the next cell loads into a DataFrame.
file_name = "../data/data.csv"

In [5]:
# Read the CSV into a DataFrame: the first row supplies the column names and
# the column types are inferred from the values.
data = (
    sqlContext.read
    .format('csv')
    .options(header='true', inferschema='true')
    .load(file_name)
)

In [6]:
# Mark the DataFrame for caching (later cells query it repeatedly), then
# check the types of data: display each column's name and inferred type.
data.cache()
data.dtypes

[('temperature', 'double'),
 ('exhaust_vacuum', 'double'),
 ('ambient_pressure', 'double'),
 ('relative_humidity', 'double'),
 ('energy_output', 'double')]

In [7]:
# Print the schema as a tree, including the nullability of each column.
data.printSchema()

root
 |-- temperature: double (nullable = true)
 |-- exhaust_vacuum: double (nullable = true)
 |-- ambient_pressure: double (nullable = true)
 |-- relative_humidity: double (nullable = true)
 |-- energy_output: double (nullable = true)



In [8]:
# Summary statistics for the raw data. The per-column counts in the output are
# not all equal, which means some rows contain missing values.
data.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+
|summary|       temperature|    exhaust_vacuum|  ambient_pressure| relative_humidity|     energy_output|
+-------+------------------+------------------+------------------+------------------+------------------+
|  count|              9571|              9570|              9572|              9571|              9572|
|   mean|19.653347612579715|54.306858934169455|1013.2582490597486| 73.30461289311452|454.36104575846235|
| stddev| 7.453012545430897|12.707639410263813| 5.938325792904679|14.608793746127796| 17.06487829976061|
|    min|              1.81|             25.36|            992.89|             25.56|            420.26|
|    max|             37.11|             81.56|            1033.3|            100.16|            495.76|
+-------+------------------+------------------+------------------+------------------+------------------+



In [9]:
# Remove rows with missing values; `data` itself is left untouched.
dataFiltered = data.dropna()

In [10]:
# Summary statistics after dropping incomplete rows; every column should now
# report the same count.
dataFiltered.describe().show()

+-------+------------------+------------------+------------------+------------------+-----------------+
|summary|       temperature|    exhaust_vacuum|  ambient_pressure| relative_humidity|    energy_output|
+-------+------------------+------------------+------------------+------------------+-----------------+
|  count|              9568|              9568|              9568|              9568|             9568|
|   mean|19.651231187291014|54.305803720735966|1013.2590781772483| 73.30897784280928|454.3650094063547|
| stddev| 7.452473229611075|12.707892998326807| 5.938783705811638|14.600268756728957|17.06699499980342|
|    min|              1.81|             25.36|            992.89|             25.56|           420.26|
|    max|             37.11|             81.56|            1033.3|            100.16|           495.76|
+-------+------------------+------------------+------------------+------------------+-----------------+



In [11]:
# Input columns used as model features (the remaining column, energy_output,
# is the regression label).
features = [
    "temperature",
    "exhaust_vacuum",
    "ambient_pressure",
    "relative_humidity",
]

In [12]:
# Pack the individual feature columns into a single vector column.
feature_assembler = VectorAssembler(
    inputCols=features,
    outputCol="unscaled_features",
)
# Standardize the assembled vector: subtract the mean and scale by the
# standard deviation.
feature_scaler = StandardScaler(
    inputCol="unscaled_features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True,
)

In [13]:
# Feature-engineering stages, in execution order: assemble the raw columns
# into a vector, then standardize that vector.
estimators = [feature_assembler, feature_scaler]
featurePipeline = Pipeline(stages=estimators)
# Fit on the null-free data; the result is a fitted PipelineModel that the
# regression pipeline reuses as its first stage.
sparkFeaturePipelineModel = featurePipeline.fit(dataFiltered)

In [14]:
# Linear regression on the standardized features, with energy_output as the
# label and predictions written to energy_prediction.
linearRegression = LinearRegression(
    featuresCol="scaled_features",
    labelCol="energy_output",
    predictionCol="energy_prediction",
)
# Stage list (not a Pipeline object): the already-fitted feature pipeline
# followed by the regressor.
pipeline = [sparkFeaturePipelineModel, linearRegression]

# Fit the combined pipeline on the null-free data.
linearRegressionModel = Pipeline(stages=pipeline).fit(dataFiltered)

In [15]:
# Serialize the fitted pipeline to an MLeap bundle. The second argument is a
# transformed DataFrame, passed so MLeap can capture the pipeline's output schema.
# The URI scheme selects the bundle format: MLeap's documented form for a zip
# archive is "jar:file:", so a ".zip" target must use it rather than a bare "file:".
# NOTE(review): the NoClassDefFoundError (resource/package$) recorded in this
# cell's output is a JVM classpath failure, not a Python error. Presumably the
# MLeap Spark jar was loaded without its transitive dependencies; confirm that
# Spark is launched with the MLeap package (e.g. via --packages) before re-running.
linearRegressionModel.serializeToBundle("jar:file:/tmp/LinearRegression.zip", linearRegressionModel.transform(dataFiltered))

Py4JJavaError: An error occurred while calling o126.serializeToBundle.
: java.lang.NoClassDefFoundError: resource/package$
	at ml.combust.mleap.spark.SimpleSparkSerializer.serializeToBundle(SimpleSparkSerializer.scala:20)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: resource.package$
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	... 12 more
