In [287]:
println("ready")

ready


# -2) imports

In [288]:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.{StructType, StructField, StringType};

# -1) refaire un spark context personnalisé

In [289]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()

In [290]:
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import spark.implicits._

In [291]:
val appName = "rennes"
val master  = "local[3]"
val conf    = new SparkConf()
conf.setAppName(appName)
conf.setMaster(master)


org.apache.spark.SparkConf@3e44c9a3

# Créer un sqlcontext

In [292]:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

# Possibilité 1 : appliquer un schéma à postériori

### Cyclistes

In [293]:
var cyclistes = sc.textFile("./logs/cycliste_cyclistes.csv")
// on retire le header
val header = cyclistes.first() 
cyclistes = cyclistes.filter(row => row != header) 
// on filtre sur les lignes pleines
cyclistes = cyclistes.filter(!_.isEmpty() )
System.out.println(cyclistes.count)

// on crée le schéma
var champs = List(  StructField("cycliste"     , StringType, true),
                    StructField("sportif"      , FloatType, true),
                    StructField("age"          , IntegerType, true),
                    StructField("genre"        , StringType, true),
                    StructField("nb_km_moyen"  , FloatType, true),
                    StructField("vitesse"      , FloatType, true),
                    StructField("attente"      , FloatType, true))
                    
var schema = StructType(champs)
// on cast les données dans le  même type que le schéma 
val rowRDD = cyclistes.map(_.split(",")).map(champs => Row(   champs(0), 
                                            champs(1).toFloat, 
                                            champs(2).toInt,
                                            champs(3),
                                            champs(4).toFloat,
                                            champs(5).toFloat,
                                            champs(5).toFloat))
// on crée une dataframe en appliquant le schéma sur les données castées
val df_cycliste = sqlContext.createDataFrame(rowRDD, schema)                                            

8071


In [294]:
df_cycliste.take(1)

Array([cycliste_azs78,4.0,42,femme,14.274417,19.646261,19.646261])

### trajets

In [295]:
import java.sql.Timestamp

In [296]:
var trajets = sc.textFile("./logs/trajets.csv")
val header = trajets.first() 
trajets    = trajets.filter(row => row != header) 
trajets    = trajets.filter(!_.isEmpty() )
System.out.println(trajets.count)
trajets.take(1)

78075


Array(arik8,cycliste_azom3,velo_arsd2_1486319951.69,1486320427.75,1486320436.3,8.54677605629,3.16227766017,azq69)

In [297]:
var champs = List(  StructField("station"     , StringType, true),
                    StructField("cycliste"      , StringType, true),
                    StructField("velo"          , StringType, true),
                    StructField("h_depart"        , TimestampType, true),
                    StructField("h_darrivee"        , TimestampType, true),
                    StructField("duree"         , FloatType, true),
                    StructField("distance"      , FloatType, true),
                    StructField("arrivee"      , StringType, true))
                    
var schema = StructType(champs)

val rowRDD = trajets.map(_.split(",")).map(champs => Row(   champs(0), 
                                            champs(1), 
                                            champs(2),
                                            new Timestamp(champs(3).substring(0,10).toLong*1000),
                                            new Timestamp(champs(4).substring(0,10).toLong*1000),
                                            champs(5).toFloat,
                                            champs(6).toFloat,
                                            champs(7)))
val df_trajets = sqlContext.createDataFrame(rowRDD, schema)                                            

In [298]:
df_trajets.take(1)

Array([arik8,cycliste_azom3,velo_arsd2_1486319951.69,2017-02-05 18:47:07.0,2017-02-05 18:47:16.0,8.546776,3.1622777,azq69])

# Possibilité 2 : inférer le schéma
=> plus lent car le fichier est lu deux fois

# options de lecture du csv

In [299]:
val options  = Map(  "header"                    -> "true"     ,
                     "ignoreLeadingWhiteSpace"   -> "true"     ,
                     "ignoreTrailingWhiteSpace"  -> "true"     ,
                     "inferSchema"               -> "true"     ,
                     "mode"                      -> "FAILFAST" )

### Cyclistes

In [300]:
val path     = "./logs/cycliste_cyclistes.csv"
val cyclistes = spark.read.options(options).csv(path)
cyclistes.show(2)
cyclistes.schema

+--------------+-------+---+-----+-------------+-------------+-------+
|      cycliste|sportif|age| sexe|        nb_km|      vitesse|attente|
+--------------+-------+---+-----+-------------+-------------+-------+
|cycliste_azs78|    4.0| 42|femme|14.2744166428|19.6462613202|    0.5|
|cycliste_azf23|    6.0| 15|homme|4.38667620022|37.5528123155|    0.5|
+--------------+-------+---+-----+-------------+-------------+-------+
only showing top 2 rows



StructType(StructField(cycliste,StringType,true), StructField(sportif,DoubleType,true), StructField(age,IntegerType,true), StructField(sexe,StringType,true), StructField(nb_km,DoubleType,true), StructField(vitesse,DoubleType,true), StructField(attente,DoubleType,true))

### Trajets

In [301]:
val path    = "./logs/trajets.csv"
val trajets = spark.read.options(options).csv(path)

In [302]:
trajets.show(2)
trajets.schema

+--------------+--------------+--------------------+---------------+--------------+-------------+-------------+---------------+
|station_depart|      cycliste|                velo|heure_de_depart|  heure_de_fin|        duree|     distance|station_arrivee|
+--------------+--------------+--------------------+---------------+--------------+-------------+-------------+---------------+
|         arik8|cycliste_azom3|velo_arsd2_148631...|1.48632042775E9|1.4863204363E9|8.54677605629|3.16227766017|          azq69|
|         ard67|cycliste_azuhb|velo_aelb6_148631...|1.48632042776E9|1.4863204363E9|8.54095602036|5.38516480713|          arti0|
+--------------+--------------+--------------------+---------------+--------------+-------------+-------------+---------------+
only showing top 2 rows



StructType(StructField(station_depart,StringType,true), StructField(cycliste,StringType,true), StructField(velo,StringType,true), StructField(heure_de_depart,DoubleType,true), StructField(heure_de_fin,DoubleType,true), StructField(duree,DoubleType,true), StructField(distance,DoubleType,true), StructField(station_arrivee,StringType,true))

### Créer des time stamps

In [303]:
// cast a column to a timestamp
val trajets2 = trajets.withColumn ("timestamp_de_depart",trajets("heure_de_depart").cast("timestamp"))
val trajets3 = trajets2.withColumn("timestamp_de_fin"  , trajets("heure_de_fin"   ).cast("timestamp"))
trajets3.select("timestamp_de_depart", "timestamp_de_fin", "duree").show(4, false)


+----------------------+---------------------+-------------+
|timestamp_de_depart   |timestamp_de_fin     |duree        |
+----------------------+---------------------+-------------+
|2017-02-05 18:47:07.75|2017-02-05 18:47:16.3|8.54677605629|
|2017-02-05 18:47:07.76|2017-02-05 18:47:16.3|8.54095602036|
|2017-02-05 18:47:07.76|2017-02-05 18:47:16.3|8.53870892525|
|2017-02-05 18:47:07.77|2017-02-05 18:47:16.3|8.53642320633|
+----------------------+---------------------+-------------+
only showing top 4 rows



### Velos

In [304]:
val path    = "./logs/velos_etats.csv"
val velos = spark.read.options(options).csv(path).withColumn("timestamp",'time.cast("timestamp")).drop('time)

In [305]:
velos.show(2, false)

+------------------------+---------+-----------+-----------+------------+----------------------+
|velo                    |n_message|station.nom|performance|nb_km_trajet|timestamp             |
+------------------------+---------+-----------+-----------+------------+----------------------+
|velo_ardc7_1486319950.92|10       |aeyuk      |0.0        |0.0         |2017-02-05 18:47:35.25|
|velo_ardc7_1486319950.92|9        |aeyuk      |0.0        |0.0         |2017-02-05 18:47:34.7 |
+------------------------+---------+-----------+-----------+------------+----------------------+
only showing top 2 rows



# Utilisation

### one-hot encoding : changer les catégories en nombre

In [306]:
import org.apache.spark.ml.feature.StringIndexer

In [307]:
val string_indexer       = new StringIndexer().setInputCol("sexe").setOutputCol("sexe-num")
val string_indexer_model = string_indexer.fit(cyclistes)
val cyclistes_doubles    = string_indexer_model.transform(cyclistes)
cyclistes_doubles.show(5)

+--------------+-------+---+-----+-------------+-------------+-------+--------+
|      cycliste|sportif|age| sexe|        nb_km|      vitesse|attente|sexe-num|
+--------------+-------+---+-----+-------------+-------------+-------+--------+
|cycliste_azs78|    4.0| 42|femme|14.2744166428|19.6462613202|    0.5|     1.0|
|cycliste_azf23|    6.0| 15|homme|4.38667620022|37.5528123155|    0.5|     0.0|
|cycliste_azifk|    2.0| 42|homme|10.3039046535|20.9549224279|    0.5|     0.0|
|cycliste_azfn4|    2.0| 75|femme|14.5216269084|9.46437654747|    0.5|     1.0|
|cycliste_azrg7|    0.0| 79|homme|1.41417358534|20.4831512318|    0.5|     0.0|
+--------------+-------+---+-----+-------------+-------------+-------+--------+
only showing top 5 rows



### définir   une colonne X qui est un vecteur appellée "features" 


In [308]:
import org.apache.spark.ml.feature.VectorAssembler

In [309]:
val colonnes_sans_y = cyclistes_doubles.columns.diff(Array("sexe", "sexe-num", "cycliste"))
colonnes_sans_y

Array(sportif, age, nb_km, vitesse, attente)

In [310]:
val x_y = new VectorAssembler()
x_y.setInputCols(colonnes_sans_y)
x_y.setOutputCol("features")

vecAssembler_85bb586d2331

In [311]:
val points=  x_y.transform(cyclistes_doubles)

#### On a créé les features

In [312]:
points.show(3)

+--------------+-------+---+-----+-------------+-------------+-------+--------+--------------------+
|      cycliste|sportif|age| sexe|        nb_km|      vitesse|attente|sexe-num|            features|
+--------------+-------+---+-----+-------------+-------------+-------+--------+--------------------+
|cycliste_azs78|    4.0| 42|femme|14.2744166428|19.6462613202|    0.5|     1.0|[4.0,42.0,14.2744...|
|cycliste_azf23|    6.0| 15|homme|4.38667620022|37.5528123155|    0.5|     0.0|[6.0,15.0,4.38667...|
|cycliste_azifk|    2.0| 42|homme|10.3039046535|20.9549224279|    0.5|     0.0|[2.0,42.0,10.3039...|
+--------------+-------+---+-----+-------------+-------------+-------+--------+--------------------+
only showing top 3 rows



# On renomme la colonne Y en "label"

In [313]:
val points_avec_label = points.withColumnRenamed("sexe-num", "label")

# On crée un jeu de test et d'apprentissage

In [314]:
val splits = points_avec_label.randomSplit(Array(0.8, 0.2))

# On met les jeux en mémoire

In [315]:
val jeu_d_apprentissage = splits(0).cache()

In [316]:
val jeu_de_test         = splits(1).cache()

In [317]:
jeu_d_apprentissage.show(3)
jeu_d_apprentissage.count()

+----------+-------+---+-----+-------------+-------------+-------+-----+--------------------+
|  cycliste|sportif|age| sexe|        nb_km|      vitesse|attente|label|            features|
+----------+-------+---+-----+-------------+-------------+-------+-----+--------------------+
|cycliste_a|    0.0| 71|femme|15.7469015273|9.12707675107|    0.5|  1.0|[0.0,71.0,15.7469...|
|cycliste_a|    2.0| 27|femme|3.43400008008| 25.058955128|    0.5|  1.0|[2.0,27.0,3.43400...|
|cycliste_a|    2.0| 35|femme|3.43400008008| 22.058955128|    0.5|  1.0|[2.0,35.0,3.43400...|
+----------+-------+---+-----+-------------+-------------+-------+-----+--------------------+
only showing top 3 rows



6447

In [318]:
jeu_de_test.show(3)
jeu_de_test.count()

+----------+-------+---+-----+--------------+-------------+-------+-----+--------------------+
|  cycliste|sportif|age| sexe|         nb_km|      vitesse|attente|label|            features|
+----------+-------+---+-----+--------------+-------------+-------+-----+--------------------+
|cycliste_a|    0.0| 80|femme| 3.43400008008| 15.558955128|    0.5|  1.0|[0.0,80.0,3.43400...|
|cycliste_a|    4.0| 22|homme| 12.7469015273|26.1694356681|    0.5|  0.0|[4.0,22.0,12.7469...|
|cycliste_a|    4.0| 66|homme|0.434000080076| 24.745273504|    0.5|  0.0|[4.0,66.0,0.43400...|
+----------+-------+---+-----+--------------+-------------+-------+-----+--------------------+
only showing top 3 rows



1624

# Régression logistique

In [319]:
import org.apache.spark.ml.classification.LogisticRegression

In [320]:
val lr = new LogisticRegression().setMaxIter(50).setRegParam(0.3).setElasticNetParam(0.8)


In [340]:
val lrModel = lr.fit(jeu_d_apprentissage)

In [341]:
val test = lrModel.transform(jeu_de_test)

In [342]:
bceval.evaluate(test)

0.5

# Arbre de décision

In [325]:
import org.apache.spark.ml.classification.DecisionTreeClassifier

In [326]:
val dt = new DecisionTreeClassifier()

In [327]:
val dtmodel = dt.fit(jeu_d_apprentissage)

In [328]:
val predictions = dtmodel.transform(jeu_de_test)

# Evaluer une classification binaire

In [329]:
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

In [330]:
val bceval = new BinaryClassificationEvaluator()

In [331]:
bceval.getMetricName

areaUnderROC

In [332]:
bceval.evaluate(predictions)


0.7385167079358143

# Random forest

In [333]:
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel
import org.apache.spark.mllib.util.MLUtils

In [334]:
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}

In [335]:
val numClasses = 2
val categoricalFeaturesInfo = Map[Int, Int]()
val numTrees = 3 // Use more in practice.
val featureSubsetStrategy = "auto" // Let the algorithm choose.
val impurity = "gini"
val maxDepth = 4
val maxBins = 32

val algo = new RandomForestClassifier()

In [336]:
val model  = algo.fit(jeu_d_apprentissage)

In [337]:
val predictions = model.transform(jeu_de_test)

In [338]:
predictions.show(5)

+--------------+-------+---+-----+--------------+-------------+-------+-----+--------------------+--------------------+--------------------+----------+
|      cycliste|sportif|age| sexe|         nb_km|      vitesse|attente|label|            features|       rawPrediction|         probability|prediction|
+--------------+-------+---+-----+--------------+-------------+-------+-----+--------------------+--------------------+--------------------+----------+
|    cycliste_a|    0.0| 80|femme| 3.43400008008| 15.558955128|    0.5|  1.0|[0.0,80.0,3.43400...|[12.5618331188626...|[0.62809165594313...|       0.0|
|    cycliste_a|    4.0| 22|homme| 12.7469015273|26.1694356681|    0.5|  0.0|[4.0,22.0,12.7469...|[13.2041582010960...|[0.66020791005480...|       0.0|
|    cycliste_a|    4.0| 66|homme|0.434000080076| 24.745273504|    0.5|  0.0|[4.0,66.0,0.43400...|[17.0555728101693...|[0.85277864050846...|       0.0|
|cycliste_azd12|    4.0| 16|femme| 6.44543733947|24.4005881653|    0.5|  1.0|[4.0,16.0,6

In [339]:
bceval.evaluate(predictions)

0.8236517733595503