In [None]:
from pyspark.sql.session import SparkSession
from pyspark.ml.classification import LinearSVC, LogisticRegression, OneVsRest
from pyspark.ml.feature import StandardScaler, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
# Relative path of the CSV file that is loaded into the DataFrame `df`.
# It is read with a header row and "," as the delimiter.
inputFile = "../data/csh101.ann.features.csv"

Main program: load the data, build the pipeline, then train and evaluate the model.

In [None]:
# Start (or reuse) the Spark session used by every later cell.
spark = SparkSession.builder.appName("CasasSVN").getOrCreate()

# Load the CSV into a DataFrame: the first row is the header and the
# column types are inferred by Spark.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .csv(inputFile)
)

# Random 0.9 / 0.1 split into training and test data; the fixed seed keeps
# the split repeatable between runs.
splits = df.randomSplit([0.9, 0.1], seed=12345)
training, test = splits

In [None]:
# Every column except the target column "activity" is used as a feature.
# list.remove raises ValueError if "activity" is not present.
featureCols = list(df.columns)
featureCols.remove("activity")

# Pack the feature columns into a single vector column "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

# Encode the string target "activity" as the numeric column "label".
labelIndexer = StringIndexer(inputCol="activity", outputCol="label")

# Rescale "features" into "scaledFeatures": scaling by standard deviation
# is on (withStd=True), mean-centering is off (withMean=False).
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=False,
    withStd=True,
)

In [None]:
# Linear SVM used as the binary base learner inside One-vs-Rest.
lsvc = LinearSVC(labelCol="label", aggregationDepth=2, featuresCol="scaledFeatures")

# Hyper-parameter grid explored by the cross-validator. It currently holds a
# single combination (maxIter=100, regParam=0.1); add values to the lists to
# widen the search.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lsvc.maxIter, [100])
    .addGrid(lsvc.regParam, [0.1])
    .build()
)

# One-vs-Rest wrapper that turns the binary SVM into a multiclass classifier.
# OneVsRest hands its OWN featuresCol/labelCol to the wrapped classifier when
# fitting, overriding whatever was set on `lsvc`. Its default featuresCol is
# "features" (the unscaled vector), so the scaled column must be named here
# or the StandardScaler stage would have no effect on the model.
ovr = OneVsRest(classifier=lsvc, featuresCol="scaledFeatures", labelCol="label")

# Pipeline: assemble features -> index label -> scale features -> classify.
pipeline = Pipeline(stages=[assembler, labelIndexer, scaler, ovr])

# Accuracy of "prediction" against "label"; used both for model selection
# during cross-validation and for scoring on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")

# 2-fold cross-validation over the grid, evaluating up to 2 models in
# parallel. The fixed seed makes the fold assignment reproducible (same value
# as the seed used for the train/test split).
cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=2,
    parallelism=2,
    seed=12345,
)

In [None]:
# Train: run the cross-validation on the training split.
cvModel = cv.fit(training)

# Test: predict on the held-out split and score the predictions with the
# accuracy evaluator.
predictions = cvModel.transform(test)
accuracy = evaluator.evaluate(predictions)

# Report the misclassification rate, i.e. 1 - accuracy.
print("Test Error = %g" % (1.0 - accuracy))

In [None]:
# Shut down the Spark session created at the start of the notebook.
spark.stop()