## Display Environment Vars

In [1]:
!export

export CLICOLOR='1'
export GIT_PAGER='cat'
export HOME='/root'
export JPY_PARENT_PID='18'
export MPLBACKEND='module://ipykernel.pylab.backend_inline'
export PAGER='cat'
export PATH='/opt/conda/bin:/root/spark-2.0.1-SNAPSHOT-bin-fluxcapacitor/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin'
export PWD='/root/notebooks'
export PYSPARK_SUBMIT_ARGS='--master spark://10.0.1.217:7077 --packages com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.1 --jars /root/lib/jpmml-sparkml-package-1.0-SNAPSHOT.jar --py-files /root/lib/jpmml.py pyspark-shell'
export SHELL='/bin/bash'
export SPARK_HOME='/root/spark-2.0.1-SNAPSHOT-bin-fluxcapacitor'
export SPARK_MASTER='spark://10.0.1.217:7077'
export SPARK_SUBMIT_ARGS='--packages com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.1 --jars /root/lib/jpmml-sparkml-package-1.0-SNAPSHOT.jar --py-files /root/lib/jpmml.py'
export TERM='xterm-color'
export USER='root'


## Setup Spark and SQL Contexts

In [2]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext

sparkContext = SparkContext.getOrCreate()
sqlContext = SQLContext(sparkContext)

sqlContext

<pyspark.sql.context.SQLContext at 0x7f23627cc518>

## Setup S3 Credentials

In [9]:
hadoopConf = sparkContext._jsc.hadoopConfiguration()
# Set your AWS Credentials here
myAccessKey = ""
mySecretKey = ""
hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoopConf.set("fs.s3a.awsAccessKeyId", myAccessKey)
hadoopConf.set("fs.s3a.awsSecretAccessKey", mySecretKey)
hadoopConf.set("fs.s3a.fast.upload", "false")

## Load Dataset into Spark Cluster

In [6]:
data = sqlContext.read.csv("s3a://fluxcapacitor.com/datasets/R/wine.csv", header=True, inferSchema=True)
data.take(10)

[Row(fixed_acidity=7.4, volatile_acidity=0.7, citric_acid=0.0, residual_sugar=1.9, chlorides=0.076, free_sulfur_dioxide=11.0, total_sulfur_dioxide=34.0, density=0.9978, pH=3.51, sulphates=0.56, alcohol=9.4, quality=5, color='red'),
 Row(fixed_acidity=7.8, volatile_acidity=0.88, citric_acid=0.0, residual_sugar=2.6, chlorides=0.098, free_sulfur_dioxide=25.0, total_sulfur_dioxide=67.0, density=0.9968, pH=3.2, sulphates=0.68, alcohol=9.8, quality=5, color='red'),
 Row(fixed_acidity=7.8, volatile_acidity=0.76, citric_acid=0.04, residual_sugar=2.3, chlorides=0.092, free_sulfur_dioxide=15.0, total_sulfur_dioxide=54.0, density=0.997, pH=3.26, sulphates=0.65, alcohol=9.8, quality=5, color='red'),
 Row(fixed_acidity=11.2, volatile_acidity=0.28, citric_acid=0.56, residual_sugar=1.9, chlorides=0.075, free_sulfur_dioxide=17.0, total_sulfur_dioxide=60.0, density=0.998, pH=3.16, sulphates=0.58, alcohol=9.8, quality=6, color='red'),
 Row(fixed_acidity=7.4, volatile_acidity=0.7, citric_acid=0.0, residu

## Build Decision Tree (Regression) with Spark ML Pipeline

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RFormula
from pyspark.ml.regression import DecisionTreeRegressor


formula = RFormula(formula = "quality ~ .")
regressor = DecisionTreeRegressor()
pipeline = Pipeline(stages = [formula, regressor])
pipelineModel = pipeline.fit(data)

pipelineModel

PipelineModel_476daec78a4b5418d603

## Convert Spark ML Model and Pipeline to PMML

In [8]:
from jpmml import toPMMLBytes

pmmlBytes = toPMMLBytes(sparkContext, data, pipelineModel)

str(pmmlBytes)

'b\'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\\n<PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">\\n\\t<Header>\\n\\t\\t<Application/>\\n\\t\\t<Timestamp>2016-10-11T14:51:44Z</Timestamp>\\n\\t</Header>\\n\\t<DataDictionary>\\n\\t\\t<DataField name="quality" optype="continuous" dataType="double"/>\\n\\t\\t<DataField name="volatile_acidity" optype="continuous" dataType="double"/>\\n\\t\\t<DataField name="citric_acid" optype="continuous" dataType="double"/>\\n\\t\\t<DataField name="residual_sugar" optype="continuous" dataType="double"/>\\n\\t\\t<DataField name="chlorides" optype="continuous" dataType="double"/>\\n\\t\\t<DataField name="free_sulfur_dioxide" optype="continuous" dataType="double"/>\\n\\t\\t<DataField name="total_sulfur_dioxide" optype="continuous" dataType="double"/>\\n\\t\\t<DataField name="density" optype="continuous" dataType="double"/>\\n\\t\\t<DataField name="pH" optype="continuous" dataType="double"/>\\n\\t\\t<DataField name="sulphates" optype="con

In [10]:
!git status

fatal: Not a git repository (or any of the parent directories): .git
