In [1]:
println("ready")

ready


# -3) Créer une sparkSession

In [2]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()

# -3.1) Importer les méthodes de conversion de type

In [3]:
import spark.implicits._

In [4]:
sc

org.apache.spark.SparkContext@4b45af19

# -2) imports

In [5]:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf

# -1) refaire un spark context personnalisé

In [6]:
val appName = "rennes"
val master  = "local[3]"
val conf    = new SparkConf()
conf.setAppName(appName)
conf.setMaster(master)

org.apache.spark.SparkConf@1ac24d71

In [7]:
val spark3 = SparkSession.builder().master("local[3]").appName("Word Count").getOrCreate()
val sc3    = spark3.sparkContext
sc3.master

local[*]

In [8]:
val path      = "./logs_backup/cycliste_cyclistes.csv"
// cyclistes est une RDD
val cyclistes = sc.textFile(path)

In [9]:
cyclistes.count()

1764

In [10]:
cyclistes.take(2)

Array(cycliste_azetu,6,31,femme,8.11111516393,21.9593538999,100, cycliste_aztv4,2,19,femme,10.5299319612,22.1102025653,8)

# 1) Créer une première rdd

In [11]:
val velos = sc.textFile("./logs/velos_etats.csv")
velos.take(1)

Array(velo_ael10_1486042024.11,3,1486042054.48,aepfl,0.530143229629,0.688828593572)

In [12]:
val velos2    = velos.filter(!_.isEmpty() )
val nb_lignes = velos.count()
// Remarque : en scala il faut faire précéder les chaines par la lettre [s] pour pouvoir afficher la valeur de variables
println(s"fichier = ${velos.count()}, lignes non vides =  ${velos2.count()}")

fichier = 6390, lignes non vides =  3195


# 2) convertir une RDD en RowRDD
## 2.0) importer les librairies sql :Row et les types de conversion des strings

In [13]:
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types._;
import spark.implicits._

## 2.1) Transformer les lignes en tableau d'objet spécifiques : les 'Row'

In [14]:
import java.sql.Timestamp

In [15]:
 val rowRDD = velos2.map(_.split(",")).map(champs => Row( champs(0), 
                                                         champs(1).toInt, 
                                                         new Timestamp(champs(2).substring(0,10).toLong*1000),
                                                         champs(3),
                                                         champs(4).toFloat))
rowRDD.take(1)

Array([velo_ael10_1486042024.11,3,2017-02-02 13:27:34.0,aepfl,0.5301432])

# 3) définir le schéma de la table vélo

In [16]:
var champs = List(  StructField("nom"           , StringType    , true),
                    StructField("indice_message", IntegerType   , true),
                    StructField("time"          , TimestampType , true),
                    StructField("station"       , StringType    , true),
                    StructField("performance"   , FloatType     , true))
var schema = StructType(champs)

# 4) créer une DataFrame via spark Session

In [17]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().getOrCreate()

In [18]:
val veloDF = spark.createDataFrame(rowRDD, schema)
veloDF

[nom: string, indice_message: int ... 3 more fields]

In [19]:
veloDF.take(1)

Array([velo_ael10_1486042024.11,3,2017-02-02 13:27:34.0,aepfl,0.5301432])

# 3.1) créer un sqlContext

In [20]:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

In [21]:
sqlContext.tableNames()

Array()

# 2.5) Enregistrer la DataFrame en tant que table

In [22]:
val table_velos = "velos"
veloDF.registerTempTable(table_velos)

# 2.6) Requêter la table avec du SQL

##### requete 01 : compte le nb de lignes

In [23]:
val requete = s"Select * from ${table_velos}"
sqlContext.sql(requete).show()

+--------------------+--------------+--------------------+-------+-----------+
|                 nom|indice_message|                time|station|performance|
+--------------------+--------------+--------------------+-------+-----------+
|velo_ael10_148604...|             3|2017-02-02 13:27:...|  aepfl|  0.5301432|
|velo_ael10_148604...|             2|2017-02-02 13:27:...|  aepfl|  0.5301432|
|velo_ael10_148604...|             1|2017-02-02 13:27:...|  aryn3|  0.5301432|
|velo_ael10_148604...|             0|2017-02-02 13:27:...|  azyio|  0.5301432|
|velo_aej25_148604...|             3|2017-02-02 13:27:...|  arpsd| 0.74110186|
|velo_aej25_148604...|             2|2017-02-02 13:27:...|  aryuw| 0.74110186|
|velo_aej25_148604...|             1|2017-02-02 13:27:...|  azyu4| 0.74110186|
|velo_aej25_148604...|             0|2017-02-02 13:27:...|  azhkx| 0.74110186|
|velo_artdw_148604...|             3|2017-02-02 13:27:...|  azhkx|        1.0|
|velo_artdw_148604...|             2|2017-02-02 13:2

##### requete 02 : nb de nom distincts

In [24]:
val requete = s"Select count( distinct nom) from ${table_velos}"
sqlContext.sql(requete).show()

+-------------------+
|count(DISTINCT nom)|
+-------------------+
|                357|
+-------------------+



##### requete 03 : performance moyenne

In [25]:
// performance moyenne : 
val requete = s"Select mean( performance) from ${table_velos}"
sqlContext.sql(requete).show()

+------------------+
|  avg(performance)|
+------------------+
|0.7950239696506417|
+------------------+



##### requete 05 : conversion de timestamp en date

In [33]:
val requete = s"""
Select  nom,
        min(time) as premiere_sortie, 
        max(time) as derniere_sortie                
        from ${table_velos}
        group by nom
"""
sqlContext.sql(requete).show()

+--------------------+--------------------+--------------------+
|                 nom|     premiere_sortie|     derniere_sortie|
+--------------------+--------------------+--------------------+
|velo_ario0_148604...|2017-02-02 13:27:...|2017-02-02 13:27:...|
|velo_azrjn_148604...|2017-02-02 13:27:...|2017-02-02 13:27:...|
|velo_azid4_148604...|2017-02-02 13:27:...|2017-02-02 13:27:...|
|velo_arshx_148604...|2017-02-02 13:27:...|2017-02-02 13:27:...|
|velo_azjwn_148604...|2017-02-02 13:27:...|2017-02-02 13:27:...|
|velo_azrln_148604...|2017-02-02 13:28:...|2017-02-02 13:28:...|
|velo_azfx7_148604...|2017-02-02 13:28:...|2017-02-02 13:28:...|
|velo_artsw_148604...|2017-02-02 13:28:...|2017-02-02 13:28:...|
|velo_aryi1_148604...|2017-02-02 13:27:...|2017-02-02 13:27:...|
|velo_aetin_148604...|2017-02-02 13:27:...|2017-02-02 13:27:...|
|velo_ardx6_148604...|2017-02-02 13:27:...|2017-02-02 13:27:...|
|velo_arpc1_148604...|2017-02-02 13:28:...|2017-02-02 13:28:...|
|velo_aeqg2_148604...|201

# 3) reading csv

In [37]:
// 1) création de la structure
var champs         = List(  StructField("nom"    , StringType, true) ,
                            StructField("heure"  , StringType, true) ,
                            StructField("velo"   , StringType, true) ,
                            StructField("action" , StringType, true) )
var schema         = StructType(champs)

// 2) lecture des données
val path           = "./logs_backup/cycliste_prises.csv"
val DataSet_prise  = sqlContext.read.schema(schema).csv(path)

// 3) nommage de la table
val table_prises   = "prises"
DataSet_prise.registerTempTable(table_prises)


In [35]:
var champs = List(  StructField("cyclise"  , StringType, true),
                    StructField("heure"    , StringType, true),
                    StructField("rendu"    , StringType, true),
                    StructField("duree"    , StringType, true),
                    StructField("velo"     , StringType, true))
var r_schema      = StructType(champs)
val DataSet_rendu = sqlContext.read.schema(r_schema).csv("./logs_backup/cycliste_rendu.csv")
val table_rendu   = "rendus"
DataSet_rendu.registerTempTable(table_rendu)

In [36]:
val requete = s"""  SELECT distinct  p.nom     , 
                                     p.heure   , 
                                     r.heure   ,
                                     r.rendu   ,
                                     r.duree   ,
                                     p.heure   , 
                                     r.velo
                    FROM        prises AS p 
                    INNER JOIN  rendus AS r 
                    ON          p.velo = r.velo
"""
sqlContext.sql(requete).show()

+--------------+-------------+-------------+-------------+-------------+-------------+----------+
|           nom|        heure|        heure|        rendu|        duree|        heure|      velo|
+--------------+-------------+-------------+-------------+-------------+-------------+----------+
|cycliste_aze10|1485439668.61|1485439670.98|1485439671.12|5.54097819705|1485439668.61|velo_artu3|
|cycliste_aze10|1485439719.74|1485439697.71|1485439697.95|5.65514540188|1485439719.74|velo_azk10|
|cycliste_aze10|1485439719.74|1485440167.85|1485440168.35|7.70852595438|1485439719.74|velo_azk10|
|cycliste_aze10|1485439757.55|1485440363.17|1485440363.83|10.7317745762|1485439757.55|velo_azru2|
|cycliste_aze10|1485439783.18| 1485439785.9|1485439786.53| 10.572863395|1485439783.18|velo_aryv3|
|cycliste_aze10|1485439783.18|1485440065.62|1485440066.32|4.89870374148|1485439783.18|velo_aryv3|
|cycliste_aze10|1485439821.97|1485439706.32|1485439706.32|            0|1485439821.97|velo_azrjc|
|cycliste_aze10|1485