In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql               import SparkSession 


# --- Spark session configuration ---------------------------------------------
# Tunables for the session built below.
app_name    = "classification_sexe_cycliste_3"
nb_cores    = 3   # value passed to spark.cores.max
paralelisme = 3   # multiplier: spark.default.parallelism = nb_cores * paralelisme
memory      = 3   # gigabytes; formatted as "<memory>g" for executor and driver

# Build the session with a parenthesised chain (no backslash continuations, so
# trailing whitespace cannot break the expression).  Each key is set exactly
# once: the original chain set "spark.cores.max" twice with the same value.
# NOTE: getOrCreate() hands back an existing session when one is already
# running -- restart the kernel if these settings do not seem to apply.
spark = (
    SparkSession.builder
    .config("spark.app.name", app_name)
    .config("spark.cores.max", str(nb_cores))
    .config("spark.mesos.coarse", "True")
    .config("spark.executor.memory", "{}g".format(memory))
    .config("spark.driver.memory", "{}g".format(memory))
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1024m")
    .config("spark.driver.maxResultSize", "1g")
    .config("spark.default.parallelism", str(nb_cores * paralelisme))
    # "spark.storage.memoryFraction" is a legacy key that the unified memory
    # manager ignores; "spark.memory.storageFraction" is its current
    # counterpart (0.5 is also the documented default).
    .config("spark.memory.storageFraction", "0.5")
    .getOrCreate()
)

In [7]:
# Read the LIBSVM-formatted sample file into a DataFrame.
data = spark.read.format("libsvm").load("./sample_libsvm_data.txt")

# Map the raw "label" column to indices in "indexedLabel".
# The indexer is fitted on the full dataset so that every label value gets an
# index, even one that would end up only in the test split.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# Detect categorical features inside the "features" vector and index them.
# maxCategories=4: a feature with more than 4 distinct values is treated as
# continuous and left unindexed.
featureIndexer = VectorIndexer(
    inputCol="features", outputCol="indexedFeatures", maxCategories=4
).fit(data)

# 70% training / 30% held out for testing.
# The seed is pinned so that re-running the notebook yields the same split;
# without it every run trains and evaluates on different rows.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

In [8]:
# Print the column names and types of the loaded DataFrame as a sanity check.
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [6]:
# Train and evaluate a random forest on the prepared data.
# Requires labelIndexer, featureIndexer, trainingData and testData from the
# data-preparation cell -- run that cell first.

# Random forest with 10 trees, reading the indexed label / feature columns.
# The seed is pinned so the trained forest does not change between runs.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    numTrees=10,
    seed=42,
)

# Translate the numeric "prediction" column back to the original label values,
# using the label list learned by the fitted labelIndexer.
labelConverter = IndexToString(
    inputCol="prediction",
    outputCol="predictedLabel",
    labels=labelIndexer.labels,
)

# Chain indexers, forest and label converter into a single pipeline.
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter])

# Fit the pipeline on the training split (this also runs the indexers).
model = pipeline.fit(trainingData)

# Score the held-out split.
predictions = model.transform(testData)

# Show a few example rows: predicted label next to the true label.
predictions.select("predictedLabel", "label", "features").show(5)

# Accuracy is computed on the indexed label vs. the numeric prediction;
# the reported test error is its complement.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy"
)
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# Stage index 2 is the fitted forest (same position as `rf` in the stages list).
rfModel = model.stages[2]
print(rfModel)  # summary only

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           0.0|  0.0|(692,[123,124,125...|
|           0.0|  0.0|(692,[127,128,129...|
|           0.0|  0.0|(692,[153,154,155...|
|           1.0|  1.0|(692,[124,125,126...|
|           1.0|  1.0|(692,[124,125,126...|
+--------------+-----+--------------------+
only showing top 5 rows

Test Error = 0
RandomForestClassificationModel (uid=RandomForestClassifier_d4d0c3832229) with 10 trees
