In [1]:
# Spark 2.0
from pyspark.sql               import SparkSession 

# Machine learning
from pyspark.ml                import Pipeline
from pyspark.ml.feature        import StringIndexer
from pyspark.ml.feature        import VectorAssembler
from pyspark.ml.evaluation     import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier


# 0) settings

In [2]:
app_name    = "classification_sexe_cycliste_avec_pipeline"
nb_cores    = 3
paralelisme = 3
memory      = 3
spark = SparkSession.builder\
.config("spark.app.name" , app_name )\
.config("spark.cores.max" , "%s"%(nb_cores) )\
.config("spark.default.parallelism", "%s"%(nb_cores*paralelisme) )\
.config("spark.storage.memoryFraction", "0.5" )\
.getOrCreate()


In [4]:
url_fichier = "./../data/Villes/ville_1.csv"
data        = spark.read.option("header", "true").option("inferSchema", "true").csv(url_fichier)

(trainingData, testData) = data.randomSplit([0.7, 0.3])

colonnes_explicatives = [ 'vitesse_a_pied','vitesse_a_velo', 'sportif', 'casseur', 
                         'statut-num', 'salaire', 'age', 'sportivite', 'velo_perf_minimale']



In [5]:
encode_statut     = StringIndexer(inputCol="statut" , outputCol="statut-num" )
encode_y          = StringIndexer(inputCol="sexe" , outputCol="label"      )
assemble_features = VectorAssembler(inputCols=colonnes_explicatives  , outputCol="features",  )

In [17]:
algorithme = RandomForestClassifier()
etapes   = [encode_statut, encode_y, assemble_features, algorithme]

In [7]:
pipeline = Pipeline()
pipeline = pipeline.setStages(etapes)

In [8]:
modele       = pipeline.fit(trainingData)
predictions  = modele.transform(testData)

In [9]:
predictions.printSchema()

root
 |-- id: integer (nullable = true)
 |-- vitesse_a_pied: double (nullable = true)
 |-- vitesse_a_velo: double (nullable = true)
 |-- home: string (nullable = true)
 |-- travail: string (nullable = true)
 |-- sportif: boolean (nullable = true)
 |-- casseur: boolean (nullable = true)
 |-- statut: string (nullable = true)
 |-- salaire: double (nullable = true)
 |-- sexe: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- sportivite: double (nullable = true)
 |-- velo_perf_minimale: double (nullable = true)
 |-- statut-num: double (nullable = false)
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [26]:
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy  = evaluator.evaluate(predictions)
error     = 1 - accuracy

In [14]:
print("RandomForest => Accuracy = %.2f, Error = %.2f" % (accuracy, error))

RandomForest => Accuracy = 0.84, Error = 0.16


In [21]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder


# Définition de la grille des paramètres à chercher
paramGrid  = ParamGridBuilder()
paramGrid  = paramGrid.addGrid(algorithme.numTrees     , [1, 5, 10] )
paramGrid  = paramGrid.addGrid(algorithme.maxDepth     , [1, 5, 10] )
paramGrid  = paramGrid.build()

# Insertion de la grille dans un objet de cross-validation
# qui s'appuie sur le pipeline :

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=4) 


cvModel = crossval.fit(trainingData)

In [22]:
predictions = cvModel.transform(testData)
evaluator = MulticlassClassificationEvaluator()
evaluator.evaluate(predictions)

0.8392857142857143

In [None]:
predictions.take(2)

In [31]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [42]:
predictions

DataFrame[id: int, vitesse_a_pied: double, vitesse_a_velo: double, home: string, travail: string, sportif: boolean, casseur: boolean, statut: string, salaire: double, sexe: string, age: int, sportivite: double, velo_perf_minimale: double, statut-num: double, label: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [44]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [45]:
metriques = {
"f1"                 : MulticlassClassificationEvaluator()                     ,
"precision"          : MulticlassClassificationEvaluator(metricName="accuracy") ,
"precision ponderee" : MulticlassClassificationEvaluator(metricName="weightedPrecision") ,
"rappel pondere"     :  MulticlassClassificationEvaluator(metricName="weightedRecall")  ,    
"AUC"                :  BinaryClassificationEvaluator()                         }

for mesure, evaluator in metriques.items():
    valeur = evaluator.evaluate(predictions)
    print("{mesure: <18} : {valeur:.3f}".format(mesure=mesure, valeur=valeur))

f1                 : 0.780
precision          : 0.782
precision ponderee : 0.781
rappel pondere     : 0.782
AUC                : 0.879


In [41]:
algorithme = RandomForestClassifier()
modele = algorithme.fit(trainingData.withColumnRenamed("sexe-num", "labels"))

IllegalArgumentException: 'Field "features" does not exist.\nAvailable fields: id, vitesse_a_pied, vitesse_a_velo, home, travail, sportif, casseur, statut, salaire, sexe, age, sportivite, velo_perf_minimale'

In [None]:
BinaryClassificationMetrics()