In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the SparkSession used by every later cell. getOrCreate() hands back
# an already-active session when one exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName('Sample multiclass classification')
    .getOrCreate()
)

In [3]:
# Bare expression: have the notebook render the SparkSession created above.
spark


In [6]:
# Read the sample multiclass dataset with the "libsvm" reader.
# NOTE: the path is relative, so it resolves against the kernel's working directory.
data_path = "../../data/mllib/sample_multiclass_classification_data.txt"
data = spark.read.format("libsvm").load(data_path)

In [7]:
# Bare expression: show the DataFrame's repr (column names and types).
data

DataFrame[label: double, features: vector]

In [8]:
# Preview the first five rows as a list of Row objects.
data.take(5)

[Row(label=1.0, features=SparseVector(4, {0: -0.2222, 1: 0.5, 2: -0.7627, 3: -0.8333})),
 Row(label=1.0, features=SparseVector(4, {0: -0.5556, 1: 0.25, 2: -0.8644, 3: -0.9167})),
 Row(label=1.0, features=SparseVector(4, {0: -0.7222, 1: -0.1667, 2: -0.8644, 3: -0.8333})),
 Row(label=1.0, features=SparseVector(4, {0: -0.7222, 1: 0.1667, 2: -0.6949, 3: -0.9167})),
 Row(label=0.0, features=SparseVector(4, {0: 0.1667, 1: -0.4167, 2: 0.4576, 3: 0.5}))]

In [9]:
# Randomly partition the rows into a training part (weight 0.6) and a
# test part (weight 0.4); the explicit seed keeps the split repeatable.
splits = data.randomSplit(weights=[0.6, 0.4], seed=1234)
train, test = splits

In [10]:
# Layer sizes of the neural network, ordered from input to output.
layers = [
    4,  # input layer: one unit per feature
    5,  # first intermediate layer
    4,  # second intermediate layer
    3,  # output layer: one unit per class
]


In [12]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [13]:
# Configure the multilayer perceptron estimator (nothing is trained yet).
trainer = MultilayerPerceptronClassifier(
    layers=layers,   # network topology defined in an earlier cell
    maxIter=100,
    blockSize=128,
    seed=1234,
)


In [14]:
# Fit the configured classifier on the training split to obtain the model.
model = trainer.fit(dataset=train)


In [15]:
# Evaluate the fitted model on the held-out test split.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Append predictions to the test rows, keeping only the columns the evaluator reads.
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")

accuracy = evaluator.evaluate(predictionAndLabels)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.9019607843137255
