In [1]:
println("ready")

ready


# -2) imports

In [2]:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.{StructType, StructField, StringType};

# -1) refaire un spark context personnalisé

In [23]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()

In [24]:
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import spark.implicits._

In [3]:
val appName = "rennes"
val master  = "local[3]"
val conf    = new SparkConf()
conf.setAppName(appName)
conf.setMaster(master)


org.apache.spark.SparkConf@63e18d18

# Créer un sqlcontext

In [26]:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

# Possibilité 1 : appliquer un schéma à postériori

### Cyclistes

In [106]:
var cyclistes = sc.textFile("./logs/cycliste_cyclistes.csv")
// on retire le header
val header = cyclistes.first() 
cyclistes = cyclistes.filter(row => row != header) 
// on filtre sur les lignes pleines
cyclistes = cyclistes.filter(!_.isEmpty() )
System.out.println(cyclistes.count)

// on crée le schéma
var champs = List(  StructField("cycliste"     , StringType, true),
                    StructField("sportif"      , FloatType, true),
                    StructField("age"          , IntegerType, true),
                    StructField("genre"        , StringType, true),
                    StructField("nb_km_moyen"  , FloatType, true),
                    StructField("vitesse"      , FloatType, true),
                    StructField("attente"      , FloatType, true))
                    
var schema = StructType(champs)
// on cast les données dans le  même type que le schéma 
val rowRDD = cyclistes.map(_.split(",")).map(champs => Row(   champs(0), 
                                            champs(1).toFloat, 
                                            champs(2).toInt,
                                            champs(3),
                                            champs(4).toFloat,
                                            champs(5).toFloat,
                                            champs(5).toFloat))
// on crée une dataframe en appliquant le schéma sur les données castées
val df_cycliste = sqlContext.createDataFrame(rowRDD, schema)                                            

300


In [167]:
df_cycliste.take(1)

Array([cycliste_azeuj,2.0,49,femme,14.564755,15.843843,15.843843])

### trajets

In [168]:
import java.sql.Timestamp

In [169]:
var trajets = sc.textFile("./logs/trajets.csv")
val header = trajets.first() 
trajets    = trajets.filter(row => row != header) 
trajets    = trajets.filter(!_.isEmpty() )
System.out.println(trajets.count)
trajets.take(1)

3445


Array(azjm1,cycliste_azeuj,velo_aetxc_1486044559.12,1486044561.54,1486044562.38,0.841557025909,1.0,arpn2)

In [170]:
var champs = List(  StructField("station"     , StringType, true),
                    StructField("cycliste"      , StringType, true),
                    StructField("velo"          , StringType, true),
                    StructField("h_depart"        , TimestampType, true),
                    StructField("h_darrivee"        , TimestampType, true),
                    StructField("duree"         , FloatType, true),
                    StructField("distance"      , FloatType, true),
                    StructField("arrivee"      , StringType, true))
                    
var schema = StructType(champs)

val rowRDD = trajets.map(_.split(",")).map(champs => Row(   champs(0), 
                                            champs(1), 
                                            champs(2),
                                            new Timestamp(champs(3).substring(0,10).toLong*1000),
                                            new Timestamp(champs(4).substring(0,10).toLong*1000),
                                            champs(5).toFloat,
                                            champs(6).toFloat,
                                            champs(7)))
val df_trajets = sqlContext.createDataFrame(rowRDD, schema)                                            

In [171]:
df_trajets.take(1)

Array([azjm1,cycliste_azeuj,velo_aetxc_1486044559.12,2017-02-02 14:09:21.0,2017-02-02 14:09:22.0,0.841557,1.0,arpn2])

# Possibilité 2 : inférer le schéma
=> plus lent car le fichier est lu deux fois

# options de lecture du csv

In [173]:
val options  = Map(  "header"                    -> "true"     ,
                     "ignoreLeadingWhiteSpace"   -> "true"     ,
                     "ignoreTrailingWhiteSpace"  -> "true"     ,
                     "inferSchema"               -> "true"     ,
                     "mode"                      -> "FAILFAST" )

### Cyclistes

In [177]:
val path     = "./logs/cycliste_cyclistes.csv"
val cyclistes = spark.read.options(options).csv(path)
cyclistes.show(2)
cyclistes.schema

+--------------+-------+---+-----+-------------+-------------+-------+
|      cycliste|sportif|age| sexe|        nb_km|      vitesse|attente|
+--------------+-------+---+-----+-------------+-------------+-------+
|cycliste_azeuj|    2.0| 49|femme| 14.564755189|15.8438435745|    0.5|
|cycliste_azex4|    2.0| 66|femme|7.68289414372|14.0118693694|    0.5|
+--------------+-------+---+-----+-------------+-------------+-------+
only showing top 2 rows



StructType(StructField(cycliste,StringType,true), StructField(sportif,DoubleType,true), StructField(age,IntegerType,true), StructField(sexe,StringType,true), StructField(nb_km,DoubleType,true), StructField(vitesse,DoubleType,true), StructField(attente,DoubleType,true))

### Trajets

In [179]:
val path    = "./logs/trajets.csv"
val trajets = spark.read.options(options).csv(path)

In [180]:
trajets.show(2)
trajets.schema

+--------------+--------------+--------------------+---------------+---------------+--------------+--------+---------------+
|station_depart|      cycliste|                velo|heure_de_depart|   heure_de_fin|         duree|distance|station_arrivee|
+--------------+--------------+--------------------+---------------+---------------+--------------+--------+---------------+
|         azjm1|cycliste_azeuj|velo_aetxc_148604...|1.48604456154E9|1.48604456238E9|0.841557025909|     1.0|          arpn2|
|         arikc|cycliste_azex4|velo_azjv1_148604...|1.48604456155E9|1.48604456262E9| 1.07023191452|     1.0|          arfm2|
+--------------+--------------+--------------------+---------------+---------------+--------------+--------+---------------+
only showing top 2 rows



StructType(StructField(station_depart,StringType,true), StructField(cycliste,StringType,true), StructField(velo,StringType,true), StructField(heure_de_depart,DoubleType,true), StructField(heure_de_fin,DoubleType,true), StructField(duree,DoubleType,true), StructField(distance,DoubleType,true), StructField(station_arrivee,StringType,true))

### Créer des time stamps

In [207]:
// cast a column to a timestamp
val trajets2 = trajets.withColumn ("timestamp_de_depart",trajets("heure_de_depart").cast("timestamp"))
val trajets3 = trajets2.withColumn("timestamp_de_fin"  , trajets("heure_de_fin"   ).cast("timestamp"))
trajets3.select("timestamp_de_depart", "timestamp_de_fin", "duree").show(4, false)


+----------------------+----------------------+--------------+
|timestamp_de_depart   |timestamp_de_fin      |duree         |
+----------------------+----------------------+--------------+
|2017-02-02 14:09:21.54|2017-02-02 14:09:22.38|0.841557025909|
|2017-02-02 14:09:21.55|2017-02-02 14:09:22.62|1.07023191452 |
|2017-02-02 14:09:21.55|2017-02-02 14:09:22.39|0.832741975784|
|2017-02-02 14:09:21.56|2017-02-02 14:09:22.39|0.83046913147 |
+----------------------+----------------------+--------------+
only showing top 4 rows



### Velos

In [217]:
val path    = "./logs/velos_etats.csv"
val velos = spark.read.options(options).csv(path).withColumn("timestamp",'time.cast("timestamp")).drop('time)

In [218]:
velos.show(2, false)

+------------------------+---------+-----------+-----------+-------------+----------------------+
|velo                    |n_message|station.nom|performance|nb_km_trajet |timestamp             |
+------------------------+---------+-----------+-----------+-------------+----------------------+
|velo_ari89_1486044398.55|16       |azigw      |1.0        |2.02238733734|2017-02-02 14:07:09.2 |
|velo_ari89_1486044398.55|14       |aztg7      |1.0        |14.4703218746|2017-02-02 14:07:08.16|
+------------------------+---------+-----------+-----------+-------------+----------------------+
only showing top 2 rows



In [227]:
velos.

Name: Unknown Error
Message: <console>:51: error: value last is not a member of org.apache.spark.sql.DataFrame
       velos.last()
             ^
StackTrace: 

In [225]:
velos.filter('velo === "velo_ari89_1486044398.55").sort('n_message asc).show(false)

+------------------------+---------+-----------+-----------+-------------+----------------------+
|velo                    |n_message|station.nom|performance|nb_km_trajet |timestamp             |
+------------------------+---------+-----------+-----------+-------------+----------------------+
|velo_ari89_1486044398.55|14       |aztg7      |1.0        |14.4703218746|2017-02-02 14:07:08.16|
|velo_ari89_1486044398.55|16       |azigw      |1.0        |2.02238733734|2017-02-02 14:07:09.2 |
+------------------------+---------+-----------+-----------+-------------+----------------------+



In [None]:


val veloDF = sqlContext.createDataFrame(rowRDD, schema)

veloDF.registerTempTable("velos")

In [36]:
var champs = List(  StructField("nom"          , StringType, true),
                    StructField("time"         , StringType, true),
                    StructField("station"      , StringType, true),
                    StructField("performance"  , StringType, true),
                    StructField("nb_km_trajet" , StringType, true))
var schema = StructType(champs)

val rowRDD = velos.map(_.split(",")).map(champs => Row(champs(0), champs(1), champs(2),champs(3),champs(4)))

val veloDF = sqlContext.createDataFrame(rowRDD, schema)

veloDF.registerTempTable("velos")

# 3) reading csv

In [29]:
// 1) création de la structure
var champs         = List(  StructField("nom"    , StringType, true) ,
                            StructField("heure"  , StringType, true) ,
                            StructField("velo"   , StringType, true) ,
                            StructField("action" , StringType, true) )
var schema         = StructType(champs)

// 2) lecture des données
val path           = "./logs_backup/cycliste_prises.csv"
val DataSet_prise  = sqlContext.read.schema(schema).csv(path)

// 3) nommage de la table
val table_prises   = "prises"
DataSet_prise.registerTempTable(table_prises)


In [30]:
var champs = List(  StructField("cyclise"  , StringType, true),
                    StructField("heure"    , StringType, true),
                    StructField("rendu"    , StringType, true),
                    StructField("duree"    , StringType, true),
                    StructField("velo"     , StringType, true))
var r_schema = StructType(champs)
val DataSet_rendu = sqlContext.read.schema(r_schema).csv("./logs_backup/cycliste_rendu.csv")
val table_rendu = "rendus"
DataSet_rendu.registerTempTable(table_rendu)

In [31]:
val requete = s"""  SELECT distinct  p.nom     , 
                                     p.heure   , 
                                     r.heure   ,
                                     r.rendu   ,
                                     r.duree   ,
                                     p.heure   , 
                                     r.velo
                    FROM        prises AS p 
                    INNER JOIN  rendus AS r 
                    ON          p.velo = r.velo
"""
sqlContext.sql(requete).show()

+--------------+-------------+-------------+-------------+-------------+-------------+----------+
|           nom|        heure|        heure|        rendu|        duree|        heure|      velo|
+--------------+-------------+-------------+-------------+-------------+-------------+----------+
|cycliste_aze10|1485439668.61|1485439670.98|1485439671.12|5.54097819705|1485439668.61|velo_artu3|
|cycliste_aze10|1485439719.74|1485439697.71|1485439697.95|5.65514540188|1485439719.74|velo_azk10|
|cycliste_aze10|1485439719.74|1485440167.85|1485440168.35|7.70852595438|1485439719.74|velo_azk10|
|cycliste_aze10|1485439757.55|1485440363.17|1485440363.83|10.7317745762|1485439757.55|velo_azru2|
|cycliste_aze10|1485439783.18| 1485439785.9|1485439786.53| 10.572863395|1485439783.18|velo_aryv3|
|cycliste_aze10|1485439783.18|1485440065.62|1485440066.32|4.89870374148|1485439783.18|velo_aryv3|
|cycliste_aze10|1485439821.97|1485439706.32|1485439706.32|            0|1485439821.97|velo_azrjc|
|cycliste_aze10|1485