# -1) Imports

In [21]:
# Python
## modules
import time  
import sys  
import numpy as np 
import copy
## fonctions
from datetime                  import datetime
from dateutil.relativedelta    import relativedelta

# Spark 1.6
from pyspark                   import SparkContext 
from pyspark                   import SparkConf    
from pyspark.sql               import SQLContext   
from pyspark.sql               import HiveContext  

# Spark 2.0
from pyspark.sql               import SparkSession 

# Fonctions
from pyspark.sql               import Row
from pyspark.sql.types         import *

# Machine learning
from pyspark.ml                import Pipeline
from pyspark.ml.feature        import OneHotEncoder
from pyspark.ml.feature        import StringIndexer
from pyspark.ml.feature        import VectorIndexer
from pyspark.ml.feature        import VectorAssembler
from pyspark.ml.evaluation     import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

# Ancienne librairie de ML
from pyspark.mllib.evaluation  import MulticlassMetrics

In [22]:
sc

In [23]:
spark

# 0) settings

In [24]:
app_name    = "Random forest RF"
nb_cores    = 3
paralelisme = 3
memory      = 3
start_load  = time.time()
spark_1_6   = False
spark_2     = True

In [25]:
assert( spark_1_6 & spark_2 == False)

In [26]:
#spark 1.6
if spark_1_6:
    conf = SparkConf()
    conf.setAppName(app_name)
    conf.set("spark.mesos.coarse"             , "True")
    conf.set("spark.executor.memory"          , "%sg"%memory)
    conf.set("spark.driver.memory"            , "%sg"%memory)
    conf.set("spark.serializer"               , "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.kryoserializer.buffer.max", "1024m")
    conf.set("spark.driver.maxResultSize"     , "10g")
    conf.set("spark.cores.max"                , "%s"%(nb_cores))
    conf.set("spark.default.parallelism"      , "%s"%(nb_cores*paralelisme))
    conf.set("spark.storage.memoryFraction"   , "0.5")
    sc         = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)


In [27]:
if spark_2:
    spark = SparkSession.builder\
    .config("spark.app.name"                  , app_name                                   )\
    .config("spark.cores.max"                 , "%s"%(nb_cores)                            )\
    .config("spark.mesos.coarse"             , "True"                                      )\
    .config("spark.executor.memory"          , "%sg"%memory                                )\
    .config("spark.driver.memory"            , "%sg"%memory                                )\
    .config("spark.serializer"               , "org.apache.spark.serializer.KryoSerializer")\
    .config("spark.kryoserializer.buffer.max", "1024m"                                     )\
    .config("spark.driver.maxResultSize"     , "10g"                                       )\
    .config("spark.cores.max"                , "%s"%(nb_cores)                             )\
    .config("spark.default.parallelism"      , "%s"%(nb_cores*paralelisme)                 )\
    .config("spark.storage.memoryFraction"   , "0.5"                                       )\
    .getOrCreate()


In [28]:
spark

# 1) Structure et import du fichier

In [29]:
url_fichier = "hdfs://steeves:8020/cycliste_cyclistes.csv"
data = spark.read.option("header", "true").option("inferSchema", "true").csv(url_fichier)

In [40]:
type(data)

pyspark.sql.dataframe.DataFrame

In [32]:
colonnes = data.columns

# 2) one hot encoding sur le sexe

In [41]:
from  pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import IndexToString

In [42]:
one_hot_encoding = StringIndexer(inputCol="sexe", outputCol="sexe-num")

In [44]:
model_one_hot_encoding = one_hot_encoding.fit(data)

In [45]:
data_avec_sexe_en_binaire = model_one_hot_encoding.transform(data)

In [46]:
data_avec_sexe_en_binaire.take(5)

[Row(cycliste='cycliste_azrc5', sportif=-0.5, age=24, sexe='femme', nb_km=14.9038044082, vitesse=12.0815937384, attente=0.5, sexe-num=0.0),
 Row(cycliste='cycliste_azrs0', sportif=-0.5, age=67, sexe='femme', nb_km=14.5112928217, vitesse=14.0640955863, attente=0.5, sexe-num=0.0),
 Row(cycliste='cycliste_azyqs', sportif=6.0, age=27, sexe='femme', nb_km=9.90271533363, vitesse=20.937700539, attente=0.5, sexe-num=0.0),
 Row(cycliste='cycliste_azqb1', sportif=2.0, age=46, sexe='homme', nb_km=9.73490180628, vitesse=24.0848358156, attente=0.5, sexe-num=1.0),
 Row(cycliste='cycliste_azulv', sportif=2.0, age=64, sexe='femme', nb_km=5.62040512786, vitesse=11.4958355607, attente=0.5, sexe-num=0.0)]

# 3) création d'une colonne "features" qui comprend les paramètres explicatifs du sexe

In [48]:
colonnes = data_avec_sexe_en_binaire.columns
colonnes

['cycliste',
 'sportif',
 'age',
 'sexe',
 'nb_km',
 'vitesse',
 'attente',
 'sexe-num']

In [51]:
colonnes_a_enlever = ['sexe', 'sexe-num', 'cycliste']
colonnes_sans_y = [x for x in colonnes if x not in colonnes_a_enlever]
colonnes_sans_y

['sportif', 'age', 'nb_km', 'vitesse', 'attente']

In [53]:
colonnes_a_garder = list(set(colonnes) - set(colonnes_a_enlever))
colonnes_a_garder

['vitesse', 'age', 'sportif', 'attente', 'nb_km']

# 4) création d'un vecteur avec les colonnes à garder

In [None]:
from pyspark.ml.feature import VectorAssembler

In [57]:
instance = VectorAssembler(inputCols=colonnes_a_garder, outputCol="features")

In [58]:
data_avec_col_features = instance.transform(data_avec_sexe_en_binaire)

In [59]:
data_avec_col_features.take(5)

[Row(cycliste='cycliste_azrc5', sportif=-0.5, age=24, sexe='femme', nb_km=14.9038044082, vitesse=12.0815937384, attente=0.5, sexe-num=0.0, features=DenseVector([12.0816, 24.0, -0.5, 0.5, 14.9038])),
 Row(cycliste='cycliste_azrs0', sportif=-0.5, age=67, sexe='femme', nb_km=14.5112928217, vitesse=14.0640955863, attente=0.5, sexe-num=0.0, features=DenseVector([14.0641, 67.0, -0.5, 0.5, 14.5113])),
 Row(cycliste='cycliste_azyqs', sportif=6.0, age=27, sexe='femme', nb_km=9.90271533363, vitesse=20.937700539, attente=0.5, sexe-num=0.0, features=DenseVector([20.9377, 27.0, 6.0, 0.5, 9.9027])),
 Row(cycliste='cycliste_azqb1', sportif=2.0, age=46, sexe='homme', nb_km=9.73490180628, vitesse=24.0848358156, attente=0.5, sexe-num=1.0, features=DenseVector([24.0848, 46.0, 2.0, 0.5, 9.7349])),
 Row(cycliste='cycliste_azulv', sportif=2.0, age=64, sexe='femme', nb_km=5.62040512786, vitesse=11.4958355607, attente=0.5, sexe-num=0.0, features=DenseVector([11.4958, 64.0, 2.0, 0.5, 5.6204]))]

# 2) gestion des  colonnes catégorielles / numériques

In [None]:
def oneHotEncodeColumns(df, cols):
    """
    Convertit une colonne contenant n modalité 
    en n colonne ne comprenant qu'une seule valeur.
    (Supprime les effets d'ordre des valeurs numériques)
    Parameters:
        df : matrice à modifier 
            dataframe
        cols : noms des colonnes à indexer
            list de chaine de caractère
            
    Return: dataframe
    """
    from pyspark.ml.feature import OneHotEncoder
    newdf = df
    for col in cols:
        onehotenc = OneHotEncoder(inputCol=col, outputCol=col+"-onehot", dropLast=False)
        newdf     = onehotenc.transform(newdf).drop(col)
        newdf     = newdf.withColumnRenamed(col+"-onehot", col)
    return newdf

# 2.1) catégories => numériques

In [None]:
colY

In [None]:
typeString = [x[0] for x in data.dtypes if x[1]=='string']

In [None]:
typeString

In [None]:
colString_without_Y = copy.copy(typeString)
colString_without_Y.remove(colY)

In [None]:
data.take(1)

In [None]:
data2 = indexStringColumns(data, typeString)
data2.take(1)

# 2.2) numériques => one hot encoding

In [None]:
typeDouble = [x[0] for x in data2.dtypes if x[1]=='double']

In [None]:
data3 = oneHotEncodeColumns(data2, colString_without_Y)
data3.take(1)

# 2.3) plusieurs colonnes => un vecteur

In [None]:
# Choix des colonnes
features = data3.columns
features.remove(colY)
# Assembleur
assemblor = VectorAssembler(inputCols=features, outputCol="features")
# Application
data4 = assemblor.transform(data3)
data4.take(2)

In [None]:
if False:
    label = [colY]

# 2.4) projection, et cleaning

In [None]:
data5 = data4.select("features", colY)
data5.schema

In [None]:
data5.take(2)

In [None]:
data5.count()

In [None]:
data6 = data5.dropDuplicates()
data6.count()

# 2.5) équilibrage des classes

In [None]:
data6.groupBy(colY).count().show()

In [None]:
nb_examples = 10000
income_0             = data6.filter("income == 0")
_10000_income_0      = income_0.sample(False, nb_examples/float(income_0.count()))

income_1             = data6.filter("income == 1")
_10000_income_1      = income_1.sample(False, nb_examples/float(income_1.count()))

classes_equilibrees  = _10000_income_0.union(_10000_income_1)

In [None]:
print _10000_income_0.count(), _10000_income_1.count(), classes_equilibrees.count()

# 3) Apprentissage

In [None]:
# Division en jeu de test, et jeu d'apprentissage
(trainingData, testData) = classes_equilibrees.randomSplit([0.7, 0.3])

# 3.1) variation du nombre d'arbres

In [None]:
nb_max_arbre = 20
for ntree in range(1,nb_max_arbre) :
    rf          = RandomForestClassifier(labelCol=colY, numTrees=ntree)
    model       = rf.fit(trainingData)
    predictions = model.transform(testData)

    evaluator = MulticlassClassificationEvaluator(  labelCol      = "income" , 
                                                    predictionCol = "prediction"   , 
                                                    metricName    = "accuracy"     )
    accuracy  = evaluator.evaluate(predictions)
    error     = 1 - accuracy

    print("%s arbre => Accuracy = %g, Error = %s" % (ntree, accuracy, error))


In [None]:
evaluator.isLargerBetter()

# 3.2) variation de la profondeur

In [None]:
test_forets = [1, 10, 20]
test_depth  = [5, 10, 20]
for ntree in test_forets:
    for depth in test_depth:
        rf          = RandomForestClassifier(labelCol=colY, numTrees=ntree, maxDepth=depth)
        model       = rf.fit(trainingData)
        predictions = model.transform(testData)

        evaluator = MulticlassClassificationEvaluator(  labelCol      = "income" , 
                                                        predictionCol = "prediction"   , 
                                                        metricName    = "accuracy"     )
        accuracy  = 1 - evaluator.evaluate(predictions)

        print("%s arbre, depth = %s => Error = %g" % (ntree, depth, accuracy))


In [None]:
from time import time as now

# 3.3) Etendue de la forêt
(on peut aller prendre un café)

In [None]:
test_forets = [ 20, 30,  50]
test_depth  = [ 30] # limite à 30 de profondeurs 
for ntree in test_forets:
    for depth in test_depth:
        debut       = now()
        # modélisation :
        rf          = RandomForestClassifier(labelCol=colY, numTrees=ntree, maxDepth=depth,)
        model       = rf.fit(trainingData)
        predictions = model.transform(testData)
        # mesure de la performance :
        evaluator   = MulticlassClassificationEvaluator(  labelCol      = "income" , 
                                                          predictionCol = "prediction"   , 
                                                          metricName    = "accuracy"     )
        accuracy     = 1 - evaluator.evaluate(predictions)
        duree        = now()-debut
        print("{0} arbre, depth = {1} => Error = {2:3.4}, duree = {3:5} sec ".format (ntree, depth, accuracy, duree))
    

# Tentative de récupération des features importance

In [None]:
if False:
    from pyspark.ml.feature import OneHotEncoder

    newdf       = copy.copy(data)
    inputCol_1  = newdf.columns[1]
    outputCol_1 = col+"-num"
    indexer     = StringIndexer(inputCol=inputCol_1, outputCol=outputCol_1)
    model       = indexer.fit(newdf)
    newdf_1     = model.transform(newdf)
    newdf_1.select(inputCol_1, outputCol_1).dropDuplicates().show()

    inputCol_2   = outputCol_1
    outputCol_2  = col+"-onehot"
    onehotenc    = OneHotEncoder(inputCol=inputCol_2, outputCol=outputCol_2, dropLast=False)
    newdf_2      = onehotenc.transform(newdf_1)
    newdf_2.select(inputCol_1, outputCol_1, outputCol_2).dropDuplicates().show()

# 4) changement des classifieurs

In [None]:

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LogisticRegression

classifiers = { "NaiveBayes"         : NaiveBayes(labelCol=colY)         , 
                "GBTClassifier"      : GBTClassifier(labelCol=colY)      , 
                "LogisticRegression" : LogisticRegression(labelCol=colY) }
best_accuracy   = 0
best_classifier = ""
for classifierName,classifier in classifiers.iteritems():
    debut       = now()
    model       = classifier.fit(trainingData)
    predictions = model.transform(testData)

    evaluator   = MulticlassClassificationEvaluator(  labelCol      = "income" , 
                                                      predictionCol = "prediction"   , 
                                                      metricName    = "accuracy"     )
    accuracy    = evaluator.evaluate(predictions)
    error       = 1 - accuracy
    duree       = now() - debut
    print("{0:20} => Accuracy = {1:4.3}, Error = {2:4.3}, duree = {3:5.3} sec".format (classifierName ,  
                                                                                       accuracy       , 
                                                                                       error          , 
                                                                                       duree         ))
    if accuracy > best_accuracy:
        best_accuracy   = accuracy
        best_classifier = classifierName
        
print "best_classifier = %s, best_accuracy = %s"%(best_classifier, best_accuracy)
