# -1) Imports

In [1]:
# Python
## modules
import time  
import sys  
import numpy as np 
import copy
## fonctions
from datetime                  import datetime
from dateutil.relativedelta    import relativedelta

# Spark 2.0
from pyspark.sql               import SparkSession 

# Fonctions
from pyspark.sql               import Row
from pyspark.sql.types         import *

# Machine learning
from pyspark.ml                import Pipeline
from pyspark.ml.feature        import OneHotEncoder
from pyspark.ml.feature        import StringIndexer
from pyspark.ml.feature        import VectorIndexer
from pyspark.ml.feature        import VectorAssembler
from pyspark.ml.evaluation     import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

# Ancienne librairie de ML
from pyspark.mllib.evaluation  import MulticlassMetrics

# 0) settings

In [2]:
app_name    = "classification_sexe_cycliste_2"
nb_cores    = 3
paralelisme = 3
memory      = 3
start_load  = time.time()


spark = SparkSession.builder\
.config("spark.app.name"                  , app_name                                   )\
.config("spark.cores.max"                 , "%s"%(nb_cores)                            )\
.config("spark.mesos.coarse"             , "True"                                      )\
.config("spark.executor.memory"          , "%sg"%memory                                )\
.config("spark.driver.memory"            , "%sg"%memory                                )\
.config("spark.serializer"               , "org.apache.spark.serializer.KryoSerializer")\
.config("spark.kryoserializer.buffer.max", "1024m"                                     )\
.config("spark.driver.maxResultSize"     , "10g"                                       )\
.config("spark.cores.max"                , "%s"%(nb_cores)                             )\
.config("spark.default.parallelism"      , "%s"%(nb_cores*paralelisme)                 )\
.config("spark.storage.memoryFraction"   , "0.5"                                       )\
.getOrCreate()


In [3]:
spark

# 1) Structure et import du fichier

In [4]:
!git status 


fatal: not a git repository (or any parent up to mount point /home/jovyan)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


In [9]:
url_fichier = "./../data/Villes/ville_1.csv"
data = spark.read.option("header", "true").option("inferSchema", "true").csv(url_fichier)

In [10]:
data.count()

1083

In [11]:
type(data)

pyspark.sql.dataframe.DataFrame

In [12]:
colonnes = data.columns

In [13]:
data.printSchema()

root
 |-- id: integer (nullable = true)
 |-- vitesse_a_pied: double (nullable = true)
 |-- vitesse_a_velo: double (nullable = true)
 |-- home: string (nullable = true)
 |-- travail: string (nullable = true)
 |-- sportif: boolean (nullable = true)
 |-- casseur: boolean (nullable = true)
 |-- statut: string (nullable = true)
 |-- salaire: double (nullable = true)
 |-- sexe: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- sportivite: double (nullable = true)
 |-- velo_perf_minimale: double (nullable = true)



# 2) one hot encoding sur le sexe

In [14]:
from  pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import IndexToString

In [15]:
one_hot_encoding = StringIndexer(inputCol="sexe", outputCol="sexe-num")

In [16]:
model_one_hot_encoding = one_hot_encoding.fit(data)

In [17]:
data_avec_sexe_en_binaire = model_one_hot_encoding.transform(data)

In [18]:
data_avec_sexe_en_binaire.take(5)

[Row(id=5251, vitesse_a_pied=0.02, vitesse_a_velo=0.05, home='(lon:26.60 lat:28.13)', travail='(lon:21.08 lat:14.11)', sportif=False, casseur=False, statut='reserviste', salaire=29800.610034665042, sexe='F', age=18, sportivite=0.1, velo_perf_minimale=0.4, sexe-num=1.0),
 Row(id=5252, vitesse_a_pied=0.14974625830876215, vitesse_a_velo=0.37436564577190534, home='(lon:0.26 lat:42.61)', travail='(lon:36.35 lat:33.28)', sportif=False, casseur=False, statut='professeur', salaire=23595.44383981423, sexe='F', age=28, sportivite=0.7487312915438107, velo_perf_minimale=0.4, sexe-num=1.0),
 Row(id=5253, vitesse_a_pied=0.6309711587089704, vitesse_a_velo=1.6825897565572543, home='(lon:3.34 lat:13.95)', travail='(lon:24.75 lat:48.15)', sportif=False, casseur=False, statut='technicien_de_surface', salaire=18530.14776280135, sexe='H', age=65, sportivite=2.103237195696568, velo_perf_minimale=0.4, sexe-num=0.0),
 Row(id=5254, vitesse_a_pied=0.04009596300649916, vitesse_a_velo=0.10692256801733109, home='(

# 3) création d'une colonne "features" qui comprend les paramètres explicatifs du sexe

In [19]:
colonnes = data_avec_sexe_en_binaire.columns
colonnes

['id',
 'vitesse_a_pied',
 'vitesse_a_velo',
 'home',
 'travail',
 'sportif',
 'casseur',
 'statut',
 'salaire',
 'sexe',
 'age',
 'sportivite',
 'velo_perf_minimale',
 'sexe-num']

In [20]:
colonnes_a_enlever = ['sexe', 'sexe-num', 'id']
colonnes_sans_y = [x for x in colonnes if x not in colonnes_a_enlever]
sorted(colonnes_sans_y)

['age',
 'casseur',
 'home',
 'salaire',
 'sportif',
 'sportivite',
 'statut',
 'travail',
 'velo_perf_minimale',
 'vitesse_a_pied',
 'vitesse_a_velo']

In [21]:
colonnes_a_garder = list(set(colonnes) - set(colonnes_a_enlever))
sorted(colonnes_a_garder)

['age',
 'casseur',
 'home',
 'salaire',
 'sportif',
 'sportivite',
 'statut',
 'travail',
 'velo_perf_minimale',
 'vitesse_a_pied',
 'vitesse_a_velo']

# 2) gestion des  colonnes catégorielles / numériques

In [22]:
def indexStringColumns(df, cols):
    """
    Convertit les colonnes de string en numériques.
    (pbm = donne l'idée d'un ordre naturel)
    Parameters:
        df : matrice à modifier 
            dataframe
        cols : noms des colonnes à indexer
            list de chaine de caractère
            
    Return: dataframe
    """
    from pyspark.ml.feature import StringIndexer
    from collections import OrderedDict
    newdf = df
    string_indexers = OrderedDict()
    for old_col in cols:
        
        new_col = old_col+"-num"
        indexer = StringIndexer(inputCol=old_col, outputCol=new_col)
        
        model   = indexer.fit(newdf)
        newdf   = model.transform(newdf)
        newdf   = newdf.drop(old_col)
        newdf   = newdf.withColumnRenamed(new_col, old_col)
        string_indexers[old_col] = model
    return newdf, string_indexers


In [23]:
def oneHotEncodeColumns(df, cols):
    """
    Convertit une colonne contenant n modalité 
    en n colonne ne comprenant qu'une seule valeur.
    (Supprime les effets d'ordre des valeurs numériques)
    Parameters:
        df : matrice à modifier 
            dataframe
        cols : noms des colonnes à indexer
            list de chaine de caractère
            
    Return: dataframe
    """
    from pyspark.ml.feature import OneHotEncoder
    from collections import OrderedDict
    newdf = df
    one_hot_encoders = OrderedDict()
    for old_col in cols:
        new_col   = old_col+"-onehot"
        onehotenc = OneHotEncoder(inputCol=old_col, outputCol=new_col, dropLast=False)
        newdf     = onehotenc.transform(newdf)
        newdf     = newdf.drop(old_col)
        newdf     = newdf.withColumnRenamed(new_col, old_col)
        one_hot_encoders[old_col] = onehotenc
    return newdf, one_hot_encoders

# 2.1) catégories => numériques

In [24]:
colY     = "sexe"
colY_num = "sexe-num"

In [25]:
typeString = [x[0] for x in data.dtypes if x[1]=='string']
typeString

['home', 'travail', 'statut', 'sexe']

In [26]:
colString_without_Y = copy.copy(typeString)
colString_without_Y.remove(colY)
colString_without_Y

['home', 'travail', 'statut']

In [27]:
data_avec_sexe_en_binaire.printSchema()

root
 |-- id: integer (nullable = true)
 |-- vitesse_a_pied: double (nullable = true)
 |-- vitesse_a_velo: double (nullable = true)
 |-- home: string (nullable = true)
 |-- travail: string (nullable = true)
 |-- sportif: boolean (nullable = true)
 |-- casseur: boolean (nullable = true)
 |-- statut: string (nullable = true)
 |-- salaire: double (nullable = true)
 |-- sexe: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- sportivite: double (nullable = true)
 |-- velo_perf_minimale: double (nullable = true)
 |-- sexe-num: double (nullable = false)



In [28]:
data2, dico_indexers = indexStringColumns(data_avec_sexe_en_binaire, colString_without_Y)
data2.printSchema()

root
 |-- id: integer (nullable = true)
 |-- vitesse_a_pied: double (nullable = true)
 |-- vitesse_a_velo: double (nullable = true)
 |-- sportif: boolean (nullable = true)
 |-- casseur: boolean (nullable = true)
 |-- salaire: double (nullable = true)
 |-- sexe: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- sportivite: double (nullable = true)
 |-- velo_perf_minimale: double (nullable = true)
 |-- sexe-num: double (nullable = false)
 |-- home: double (nullable = false)
 |-- travail: double (nullable = false)
 |-- statut: double (nullable = false)



# 2.2) numériques => one hot encoding

In [29]:
colString_without_Y

['home', 'travail', 'statut']

In [30]:
data3, dico_encoders = oneHotEncodeColumns(data2, colString_without_Y)
data3.take(1)

[Row(id=5251, vitesse_a_pied=0.02, vitesse_a_velo=0.05, sportif=False, casseur=False, salaire=29800.610034665042, sexe='F', age=18, sportivite=0.1, velo_perf_minimale=0.4, sexe-num=1.0, home=SparseVector(1083, {818: 1.0}), travail=SparseVector(1083, {728: 1.0}), statut=SparseVector(6, {5: 1.0}))]

In [31]:
data3.printSchema()

root
 |-- id: integer (nullable = true)
 |-- vitesse_a_pied: double (nullable = true)
 |-- vitesse_a_velo: double (nullable = true)
 |-- sportif: boolean (nullable = true)
 |-- casseur: boolean (nullable = true)
 |-- salaire: double (nullable = true)
 |-- sexe: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- sportivite: double (nullable = true)
 |-- velo_perf_minimale: double (nullable = true)
 |-- sexe-num: double (nullable = false)
 |-- home: vector (nullable = true)
 |-- travail: vector (nullable = true)
 |-- statut: vector (nullable = true)



# 2.3) plusieurs colonnes => un vecteur

In [32]:
data3.columns

['id',
 'vitesse_a_pied',
 'vitesse_a_velo',
 'sportif',
 'casseur',
 'salaire',
 'sexe',
 'age',
 'sportivite',
 'velo_perf_minimale',
 'sexe-num',
 'home',
 'travail',
 'statut']

In [33]:
colY_num

'sexe-num'

In [34]:
data3.columns

['id',
 'vitesse_a_pied',
 'vitesse_a_velo',
 'sportif',
 'casseur',
 'salaire',
 'sexe',
 'age',
 'sportivite',
 'velo_perf_minimale',
 'sexe-num',
 'home',
 'travail',
 'statut']

In [35]:
# Choix des colonnes
features = data3.columns

# on supprime les colonnes dont toutes les valeurs sont différentes
features.remove("home")
features.remove("travail")

features.remove(colY)
features.remove(colY_num)
# Assembleur
print(features)
assemblor = VectorAssembler(inputCols=features, outputCol="features", )
# Application
data4 = assemblor.transform(data3)
data4.take(2)

['id', 'vitesse_a_pied', 'vitesse_a_velo', 'sportif', 'casseur', 'salaire', 'age', 'sportivite', 'velo_perf_minimale', 'statut']


[Row(id=5251, vitesse_a_pied=0.02, vitesse_a_velo=0.05, sportif=False, casseur=False, salaire=29800.610034665042, sexe='F', age=18, sportivite=0.1, velo_perf_minimale=0.4, sexe-num=1.0, home=SparseVector(1083, {818: 1.0}), travail=SparseVector(1083, {728: 1.0}), statut=SparseVector(6, {5: 1.0}), features=SparseVector(15, {0: 5251.0, 1: 0.02, 2: 0.05, 5: 29800.61, 6: 18.0, 7: 0.1, 8: 0.4, 14: 1.0})),
 Row(id=5252, vitesse_a_pied=0.14974625830876215, vitesse_a_velo=0.37436564577190534, sportif=False, casseur=False, salaire=23595.44383981423, sexe='F', age=28, sportivite=0.7487312915438107, velo_perf_minimale=0.4, sexe-num=1.0, home=SparseVector(1083, {991: 1.0}), travail=SparseVector(1083, {38: 1.0}), statut=SparseVector(6, {4: 1.0}), features=SparseVector(15, {0: 5252.0, 1: 0.1497, 2: 0.3744, 5: 23595.4438, 6: 28.0, 7: 0.7487, 8: 0.4, 13: 1.0}))]

In [36]:
nom_des_features = []
for col in assemblor.getInputCols():
    if col in dico_indexers.keys():
        nom_des_features.extend(dico_indexers[col].labels)
    else:
        nom_des_features.append(col)
indices_et_noms_des_features = {indice: col for indice, col in enumerate(nom_des_features)}
indices_et_noms_des_features

{0: 'id',
 1: 'vitesse_a_pied',
 2: 'vitesse_a_velo',
 3: 'sportif',
 4: 'casseur',
 5: 'salaire',
 6: 'age',
 7: 'sportivite',
 8: 'velo_perf_minimale',
 9: 'cadre',
 10: 'employe',
 11: 'technicien_de_surface',
 12: 'éboueur',
 13: 'professeur',
 14: 'reserviste'}

# 2.4) projection, et cleaning

In [37]:
data5 = data4.select("features", colY_num)
data5.schema

StructType(List(StructField(features,VectorUDT,true),StructField(sexe-num,DoubleType,false)))

In [38]:
data6 = data5.dropDuplicates()
data6.count()

1083

# 4) création d'un vecteur avec les colonnes à garder

In [39]:
if False:
    from pyspark.ml.feature import VectorAssembler
    instance = VectorAssembler(inputCols=colonnes_a_garder, outputCol="features")
    data_avec_col_features = instance.transform()
    data_avec_col_features.take(5)

# 2.5) équilibrage des classes

In [40]:
data6.groupBy(colY_num).count().show()

+--------+-----+
|sexe-num|count|
+--------+-----+
|     0.0|  560|
|     1.0|  523|
+--------+-----+



In [41]:
if False:
    nb_examples = 10000
    income_0             = data6.filter("income == 0")
    _10000_income_0      = income_0.sample(False, nb_examples/float(income_0.count()))

    income_1             = data6.filter("income == 1")
    _10000_income_1      = income_1.sample(False, nb_examples/float(income_1.count()))

    classes_equilibrees  = _10000_income_0.union(_10000_income_1)
    print (_10000_income_0.count(), _10000_income_1.count(), classes_equilibrees.count())
else:
    classes_equilibrees = data6

# 3) Apprentissage

In [42]:
data6.printSchema()

root
 |-- features: vector (nullable = true)
 |-- sexe-num: double (nullable = false)



In [43]:
classes_equilibrees.take(1)

[Row(features=SparseVector(15, {0: 5558.0, 1: 1.2136, 2: 3.034, 5: 21831.9187, 6: 74.0, 7: 6.068, 8: 0.4, 13: 1.0}), sexe-num=1.0)]

In [44]:
classes_equilibrees.printSchema()

root
 |-- features: vector (nullable = true)
 |-- sexe-num: double (nullable = false)



In [45]:
# Division en jeu de test, et jeu d'apprentissage
(trainingData, testData) = classes_equilibrees.randomSplit([0.7, 0.3])

In [46]:
trainingData.columns

['features', 'sexe-num']

In [47]:
rf          = RandomForestClassifier(labelCol=colY_num)
model       = rf.fit(trainingData)
predictions = model.transform(testData)

evaluator = MulticlassClassificationEvaluator(  labelCol      = colY_num , 
                                                predictionCol = "prediction"   , 
                                                metricName    = "accuracy"     )
accuracy  = evaluator.evaluate(predictions)
error     = 1 - accuracy




In [48]:
print("RandomForest => Accuracy = %g, Error = %s" % (accuracy, error))

RandomForest => Accuracy = 0.736527, Error = 0.26347305389221554


In [49]:
variables_les_plus_importantes =model.featureImportances

In [50]:
r = []
for indice, valeur in enumerate(variables_les_plus_importantes.toArray()):
    r.append({"colonne" : indices_et_noms_des_features[indice]  ,
              "valeur"  : valeur                                })
    

In [51]:
import pandas as pd

In [52]:
df = pd.DataFrame(r)
df.sort_values("valeur", ascending=False)

Unnamed: 0,colonne,valeur
1,vitesse_a_pied,0.312833
2,vitesse_a_velo,0.282324
7,sportivite,0.155653
5,salaire,0.154906
0,id,0.03315
6,age,0.02335
8,velo_perf_minimale,0.009925
14,reserviste,0.005595
11,technicien_de_surface,0.005478
13,professeur,0.003709


In [53]:
if False:
    from pyspark.ml.feature import OneHotEncoder

    newdf       = copy.copy(data)
    inputCol_1  = newdf.columns[1]
    outputCol_1 = col+"-num"
    indexer     = StringIndexer(inputCol=inputCol_1, outputCol=outputCol_1)
    model       = indexer.fit(newdf)
    newdf_1     = model.transform(newdf)
    newdf_1.select(inputCol_1, outputCol_1).dropDuplicates().show()

    inputCol_2   = outputCol_1
    outputCol_2  = col+"-onehot"
    onehotenc    = OneHotEncoder(inputCol=inputCol_2, outputCol=outputCol_2, dropLast=False)
    newdf_2      = onehotenc.transform(newdf_1)
    newdf_2.select(inputCol_1, outputCol_1, outputCol_2).dropDuplicates().show()

# 3.1) variation du nombre d'arbres

In [54]:
nb_max_arbre = 20
for ntree in range(1,nb_max_arbre, 5) :
    rf          = RandomForestClassifier(labelCol=colY_num, numTrees=ntree)
    model       = rf.fit(trainingData)
    predictions = model.transform(testData)

    evaluator = MulticlassClassificationEvaluator(  labelCol      = colY_num , 
                                                    predictionCol = "prediction"   , 
                                                    metricName    = "accuracy"     )
    accuracy  = evaluator.evaluate(predictions)
    error     = 1 - accuracy

    print("%s arbre => Accuracy = %g, Error = %s" % (ntree, accuracy, error))


1 arbre => Accuracy = 0.832335, Error = 0.16766467065868262
6 arbre => Accuracy = 0.730539, Error = 0.2694610778443114
11 arbre => Accuracy = 0.724551, Error = 0.27544910179640714
16 arbre => Accuracy = 0.814371, Error = 0.18562874251497008


In [159]:
evaluator.isLargerBetter()

True

# 3.2) variation de la profondeur

In [55]:
test_forets = [1, 10, 20]
test_depth  = [5, 10, 20]
for ntree in test_forets:
    for depth in test_depth:
        rf          = RandomForestClassifier(labelCol=colY_num, numTrees=ntree, maxDepth=depth)
        model       = rf.fit(trainingData)
        predictions = model.transform(testData)

        evaluator = MulticlassClassificationEvaluator(  labelCol      = colY_num , 
                                                        predictionCol = "prediction"   , 
                                                        metricName    = "accuracy"     )
        accuracy  = 1 - evaluator.evaluate(predictions)

        print("%s arbre, depth = %s => Error = %g" % (ntree, depth, accuracy))


1 arbre, depth = 5 => Error = 0.167665
1 arbre, depth = 10 => Error = 0.0658683
1 arbre, depth = 20 => Error = 0.0479042
10 arbre, depth = 5 => Error = 0.203593
10 arbre, depth = 10 => Error = 0.125749
10 arbre, depth = 20 => Error = 0.101796
20 arbre, depth = 5 => Error = 0.263473
20 arbre, depth = 10 => Error = 0.164671
20 arbre, depth = 20 => Error = 0.137725


In [57]:
from time import time as now

# 3.3) Etendue de la forêt
(on peut aller prendre un café)

In [58]:
test_forets = [ 20, 30,  50]
test_depth  = [ 30] # limite à 30 de profondeurs 
for ntree in test_forets:
    for depth in test_depth:
        debut       = now()
        # modélisation :
        rf          = RandomForestClassifier(labelCol=colY_num, numTrees=ntree, maxDepth=depth,)
        model       = rf.fit(trainingData)
        predictions = model.transform(testData)
        # mesure de la performance :
        evaluator   = MulticlassClassificationEvaluator(  labelCol      = colY_num , 
                                                          predictionCol = "prediction"   , 
                                                          metricName    = "accuracy"     )
        accuracy     = 1 - evaluator.evaluate(predictions)
        duree        = now()-debut
        print("{0} arbre, depth = {1} => Error = {2:3.4}, duree = {3:5} sec ".format (ntree, depth, accuracy, duree))
        

20 arbre, depth = 30 => Error = 0.1377, duree = 19.171584367752075 sec 
30 arbre, depth = 30 => Error = 0.1587, duree = 23.561111211776733 sec 
50 arbre, depth = 30 => Error = 0.1078, duree = 32.50536870956421 sec 


# 3.4) Grille de recherche sur les hyperparamètres

In [106]:
#Import des librairies de tuning des algorithmes
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
# choix d’un algorithme
algorithme = RandomForestClassifier()
# Définition de la grille des paramètres à chercher
paramGrid  = ParamGridBuilder()
paramGrid  = paramGrid.addGrid(rf.numTrees     , [10, 30] )
paramGrid  = paramGrid.addGrid(rf.maxDepth     , [10, 30] )
paramGrid  = paramGrid.build()

In [107]:
tvs_grille = TrainValidationSplit( estimator          = algorithme,
                                   estimatorParamMaps = paramGrid,
                                   evaluator          = BinaryClassificationEvaluator(),
                                   trainRatio         = 0.8)


In [111]:
modele      = tvs_grille.fit(trainingData.withColumnRenamed(colY_num, "label"))

predictions = modele.transform(testData)

evaluator.evaluate(predictions)

0.7365269461077845

In [110]:
predictions.select("features", colY_num, "prediction").show()

+--------------------+--------+----------+
|            features|sexe-num|prediction|
+--------------------+--------+----------+
|(15,[0,1,2,5,6,7,...|     0.0|       1.0|
|(15,[0,1,2,5,6,7,...|     1.0|       1.0|
|(15,[0,1,2,5,6,7,...|     1.0|       0.0|
|(15,[0,1,2,5,6,7,...|     0.0|       0.0|
|(15,[0,1,2,5,6,7,...|     1.0|       1.0|
|(15,[0,1,2,5,6,7,...|     0.0|       0.0|
|(15,[0,1,2,5,6,7,...|     0.0|       0.0|
|(15,[0,1,2,5,6,7,...|     0.0|       0.0|
|(15,[0,1,2,5,6,7,...|     0.0|       0.0|
|(15,[0,1,2,5,6,7,...|     0.0|       0.0|
|(15,[0,1,2,5,6,7,...|     1.0|       1.0|
|(15,[0,1,2,5,6,7,...|     0.0|       0.0|
|(15,[0,1,2,5,6,7,...|     0.0|       0.0|
|(15,[0,1,2,5,6,7,...|     0.0|       1.0|
|(15,[0,1,2,5,6,7,...|     1.0|       1.0|
|(15,[0,1,2,5,6,7,...|     0.0|       0.0|
|(15,[0,1,2,5,6,7,...|     1.0|       1.0|
|(15,[0,1,2,5,6,7,...|     1.0|       1.0|
|(15,[0,1,2,5,6,7,...|     0.0|       0.0|
|(15,[0,1,2,5,6,7,...|     0.0|       1.0|
+----------

In [64]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


rf = RandomForestClassifier()
pipeline = Pipeline(stages=[rf])
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [10, 30]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2) 

cvModel = crossval.fit(trainingData.withColumnRenamed(colY_num, "label"))

In [74]:
cvModel.estimatorParamMaps.

Param(parent='CrossValidatorModel_8f41c212ebda', name='estimatorParamMaps', doc='estimator param maps')

In [70]:
cvModel.bestModel.transform(testData).collect()

[Row(features=SparseVector(15, {0: 6070.0, 1: 0.329, 2: 0.8774, 5: 20111.5601, 6: 72.0, 7: 1.0967, 8: 0.4, 12: 1.0}), sexe-num=0.0, rawPrediction=DenseVector([4.0276, 5.9724]), probability=DenseVector([0.4028, 0.5972]), prediction=1.0),
 Row(features=SparseVector(15, {0: 6077.0, 1: 0.7497, 2: 1.8742, 5: 18285.3522, 6: 35.0, 7: 3.7484, 8: 0.4, 12: 1.0}), sexe-num=1.0, rawPrediction=DenseVector([2.8234, 7.1766]), probability=DenseVector([0.2823, 0.7177]), prediction=1.0),
 Row(features=SparseVector(15, {0: 5558.0, 1: 1.2136, 2: 3.034, 5: 21831.9187, 6: 74.0, 7: 6.068, 8: 0.4, 13: 1.0}), sexe-num=1.0, rawPrediction=DenseVector([3.6274, 6.3726]), probability=DenseVector([0.3627, 0.6373]), prediction=1.0),
 Row(features=SparseVector(15, {0: 5627.0, 1: 0.7329, 2: 1.9544, 5: 28126.2401, 6: 32.0, 7: 2.443, 8: 0.4, 10: 1.0}), sexe-num=0.0, rawPrediction=DenseVector([6.7352, 3.2648]), probability=DenseVector([0.6735, 0.3265]), prediction=0.0),
 Row(features=SparseVector(15, {0: 5912.0, 1: 0.02, 

# 4) changement des classifieurs

In [71]:

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LogisticRegression

classifiers = { "NaiveBayes"         : NaiveBayes(labelCol=colY_num)         , 
                "GBTClassifier"      : GBTClassifier(labelCol=colY_num)      , 
                "LogisticRegression" : LogisticRegression(labelCol=colY_num) }
best_accuracy   = 0
best_classifier = ""
for classifierName,classifier in classifiers.items():
    debut       = now()
    model       = classifier.fit(trainingData)
    predictions = model.transform(testData)

    evaluator   = MulticlassClassificationEvaluator(  labelCol      = colY_num , 
                                                      predictionCol = "prediction"   , 
                                                      metricName    = "accuracy"     )
    accuracy    = evaluator.evaluate(predictions)
    error       = 1 - accuracy
    duree       = now() - debut
    print("{0:20} => Accuracy = {1:4.3}, Error = {2:4.3}, duree = {3:5.3} sec".format (classifierName ,  
                                                                                       accuracy       , 
                                                                                       error          , 
                                                                                       duree         ))
    if accuracy > best_accuracy:
        best_accuracy   = accuracy
        best_classifier = classifierName
        
print ("best_classifier = %s, best_accuracy = %s"%(best_classifier, best_accuracy))


NaiveBayes           => Accuracy = 0.614, Error = 0.386, duree =  41.6 sec
GBTClassifier        => Accuracy = 0.949, Error = 0.0509, duree = 2.57e+02 sec
LogisticRegression   => Accuracy = 0.994, Error = 0.00599, duree = 1.41e+02 sec
best_classifier = LogisticRegression, best_accuracy = 0.9940119760479041


In [72]:
print ("best_classifier = %s, best_accuracy = %s"%(best_classifier, best_accuracy))

best_classifier = LogisticRegression, best_accuracy = 0.9940119760479041
