In [1]:
from pyspark import SparkContext, SparkConf, SQLContext
import os
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark2pmml import PMMLBuilder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler
import logging
import shutil
import site
import sys
import wget
import re
import pandas as pd
import numpy as np

In [2]:
data_parquet = os.environ.get('data_parquet',
                              'data.parquet')  # input file name (parquet)
master = os.environ.get('master',
                        "local[*]")  # URL to Spark master
model_target = os.environ.get('model_target',
                              "model.xml")  # model output file name
data_dir = os.environ.get('data_dir',
                          '../data/')  # temporary directory for data
input_columns = os.environ.get('input_columns',
                               '["x", "y", "z"]')  # input columns to consider

In [3]:
parameters = list(
  map(
      lambda s: re.sub('$', '"', s),
      map(
          lambda s: s.replace('=', '="'),
          filter(
              lambda s: s.find('=') > -1 and bool(re.match('[A-Za-z0-9_]*=[.\/A-Za-z0-9]*', s)),
              sys.argv
          )
      )
  )
)

for parameter in parameters:
    logging.warning('Parameter: '+parameter) 
    exec(parameter)

In [4]:
conf = SparkConf().setMaster(master)

In [5]:
conf.set("spark.jars", 'jpmml-sparkml-executable-1.6.5.jar')

<pyspark.conf.SparkConf at 0x162fd611040>

In [6]:
sc = SparkContext.getOrCreate(conf)
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession

In [7]:
df = spark.read.parquet(data_dir + data_parquet)

In [8]:
# register a corresponding query table
df.createOrReplaceTempView('df')

In [9]:
from pyspark.sql.types import DoubleType
df = df.withColumn("x", df.x.cast(DoubleType()))
df = df.withColumn("y", df.y.cast(DoubleType()))
df = df.withColumn("z", df.z.cast(DoubleType()))

In [10]:
splits = df.randomSplit([0.8, 0.2])
df_train = splits[0]
df_test = splits[1]

In [11]:
indexer = StringIndexer(inputCol="class", outputCol="label")

vectorAssembler = VectorAssembler(inputCols=eval(input_columns),
                                  outputCol="features")

normalizer = MinMaxScaler(inputCol="features", outputCol="features_norm")

In [12]:
def HyperParameterTuning(maxIterV,regParamV,elasticNetParamV):
    lr = LogisticRegression(maxIter=maxIterV, regParam=regParamV, elasticNetParam=elasticNetParamV)
    pipeline = Pipeline(stages=[indexer, vectorAssembler, normalizer, lr])
    model = pipeline.fit(df_train)
    prediction = model.transform(df_train)
    binEval = MulticlassClassificationEvaluator(). \
        setMetricName("accuracy"). \
        setPredictionCol("prediction"). \
        setLabelCol("label")
    res=binEval.evaluate(prediction)
    return res

In [13]:
df = pd.DataFrame(columns=["maxIter", "regParam","elasticNetParam","evaluation"])
columns = list(df)
data = []
index=0
import numpy as np
for maxIterV in [10, 100, 1000]: # np.arange(10, 20, 5):
    for regParamV in  [0.01, 0.5, 2.0]: 
        for elasticNetParamV in [0.0, 0.5, 1.0]:
            index+=1
            maxIterb= np.int16(maxIterV).item()          
            regParamb=np.float32(regParamV).item()  
            elasticNetParamb=np.float32(elasticNetParamV).item()  
            resV=HyperParameterTuning(maxIterb,regParamb,elasticNetParamb)
            #resV=0
            stuff_in_string = "maxIter= {} regParam= {} elasticNetParam={:.2f},  evaluation = {} ".format(maxIterV, regParamV,elasticNetParamV,resV)
            values = [maxIterV,regParamV,elasticNetParamV,resV]
            zipped = zip(columns, values)
            a_dictionary = dict(zipped)
            data.append(a_dictionary) 
            print(index,stuff_in_string)
df = df.append(data, True)

1 maxIter= 10 regParam= 0.01 elasticNetParam=0.00,  evaluation = 0.32804948772472453 
2 maxIter= 10 regParam= 0.01 elasticNetParam=0.50,  evaluation = 0.3338012030134169 
3 maxIter= 10 regParam= 0.01 elasticNetParam=1.00,  evaluation = 0.3167897976393593 
4 maxIter= 10 regParam= 0.5 elasticNetParam=0.00,  evaluation = 0.2222156851209037 
5 maxIter= 10 regParam= 0.5 elasticNetParam=0.50,  evaluation = 0.2067227549959797 
6 maxIter= 10 regParam= 0.5 elasticNetParam=1.00,  evaluation = 0.2067227549959797 
7 maxIter= 10 regParam= 2.0 elasticNetParam=0.00,  evaluation = 0.2067227549959797 
8 maxIter= 10 regParam= 2.0 elasticNetParam=0.50,  evaluation = 0.2067227549959797 
9 maxIter= 10 regParam= 2.0 elasticNetParam=1.00,  evaluation = 0.2067227549959797 
10 maxIter= 100 regParam= 0.01 elasticNetParam=0.00,  evaluation = 0.3460722760599209 
11 maxIter= 100 regParam= 0.01 elasticNetParam=0.50,  evaluation = 0.35314635355819096 
12 maxIter= 100 regParam= 0.01 elasticNetParam=1.00,  evaluation 

In [14]:
df.head()

Unnamed: 0,maxIter,regParam,elasticNetParam,evaluation
0,10,0.01,0.0,0.328049
1,10,0.01,0.5,0.333801
2,10,0.01,1.0,0.31679
3,10,0.5,0.0,0.222216
4,10,0.5,0.5,0.206723


In [15]:
print(df[df.evaluation == df.evaluation.min()])

   maxIter  regParam  elasticNetParam  evaluation
4       10       0.5              0.5    0.206723
5       10       0.5              1.0    0.206723
6       10       2.0              0.0    0.206723
7       10       2.0              0.5    0.206723
8       10       2.0              1.0    0.206723
13     100       0.5              0.5    0.206723
14     100       0.5              1.0    0.206723
15     100       2.0              0.0    0.206723
16     100       2.0              0.5    0.206723
17     100       2.0              1.0    0.206723
22    1000       0.5              0.5    0.206723
23    1000       0.5              1.0    0.206723
24    1000       2.0              0.0    0.206723
25    1000       2.0              0.5    0.206723
26    1000       2.0              1.0    0.206723
