In [1]:
println("ready")

ready


# -3) vérifier que le spark context est disponible

In [2]:
sc

org.apache.spark.SparkContext@65ac2d42

# -2) imports

In [3]:
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf

# -1) refaire un spark context personnalisé

In [4]:
val appName = "rennes"
val master  = "local[3]"
val conf    = new SparkConf()
conf.setAppName(appName)
conf.setMaster(master)


org.apache.spark.SparkConf@40908e3f

In [5]:
sc.stop()
val sc = new SparkContext(conf)

In [7]:
sc.master

local[3]

# 1) Créer une première rdd

## 1.1) lecture d'un fichier

In [167]:
val cyclistes = sc.textFile("./logs_backup/cycliste_cyclistes.csv")

In [170]:
cyclistes.count()

1764

In [171]:
cyclistes.take(2)

Array(cycliste_azetu,6,31,femme,8.11111516393,21.9593538999,100, cycliste_aztv4,2,19,femme,10.5299319612,22.1102025653,8)

In [172]:
val velos = sc.textFile("./logs_backup/velos_etats.csv")

In [173]:
velos.take(1)

Array(velo_azem4,1485439420.58,azf38,0.95,97.3984151307)

# 1.2) filtrer

In [174]:
val velos2 = velos.filter(!_.isEmpty() )

In [176]:
val nb_lignes = velos.count()
println(s"fichier = ${velos.count()}, lignes non vides =  ${velos2.count()}")

fichier = 128668, lignes non vides =  64334


# 2) convertir un fichier en table sql

## 2.0) importer les librairies sql

In [179]:
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.{StructType, StructField, StringType};

# 2.1) créer un sqlContext

In [180]:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

# 2.2) définir le schéma de la table

In [181]:
var champs = List(  StructField("nom"          , StringType, true),
                    StructField("time"         , StringType, true),
                    StructField("station"      , StringType, true),
                    StructField("performance"  , StringType, true),
                    StructField("nb_km_trajet" , StringType, true))
var schema = StructType(champs)

## 2.3) Transformer les lignes en tableau d'objet spécifiques : les 'Row'

In [184]:
val rowRDD = velos2.map(_.split(",")).map(champs => Row(champs(0), champs(1), champs(2),champs(3),champs(4)))

# 2.4) créer une DataFrame via le sqlContext

In [185]:
val veloDF = sqlContext.createDataFrame(rowRDD, schema)

# 2.5) Enregistrer la DataFrame en tant que table

In [186]:
val table = "velos"
veloDF.registerTempTable(table)

# 2.6) Requêter la table avec du SQL

In [187]:
val requete = s"Select count(*) from ${table}"
sqlContext.sql(requete).show()

+--------+
|count(1)|
+--------+
|   64334|
+--------+



In [189]:
val requete = s"Select count( distinct nom) from ${table}"
sqlContext.sql(requete).show()

+-------------------+                                                           
|count(DISTINCT nom)|
+-------------------+
|                760|
+-------------------+



In [191]:
val requete = s"Select mean( performance) from ${table}"
sqlContext.sql(requete).show()

+--------------------------------+
|avg(CAST(performance AS DOUBLE))|
+--------------------------------+
|              0.7615443430900648|
+--------------------------------+



In [200]:
val requete = s"""
Select nom, 
       min ( nb_km_trajet), 
       mean( nb_km_trajet),  
       max (nb_km_trajet) ,
       count (distinct station)
from ${table}
group by nom
limit 100"""
sqlContext.sql(requete).show()

|       nom|min(nb_km_trajet)|avg(CAST(nb_km_trajet AS DOUBLE))|max(nb_km_trajet)|count(DISTINCT station)|
+----------+-----------------+---------------------------------+-----------------+-----------------------+
|velo_aeyi6|    106.124133158|                 250.712765889171|    98.2821292635|                     46|
|velo_azow8|    1000.72528089|                640.1191001270846|    994.915022987|                     60|
|velo_aeqx7|    49.6741505596|                    49.6741505596|    49.6741505596|                      1|
|velo_aeufh|     102.45730718|                 364.425467797552|    95.8711122327|                     48|
|velo_arod5|    101.702990964|               352.68417590080105|    90.3094170541|                     48|
|velo_artu7|    100.583333498|               374.04519581147616|    96.9156508692|                     60|
|velo_azdf6|    109.735413268|                    109.735413268|    109.735413268|                      1|
|velo_aeqf9|    1007.13554138|       

In [201]:
import org.joda.time.{DateTimeZone}
import org.joda.time.format.DateTimeFormat

In [206]:
val stri = new DateTimeZone.DateTime("1485439420.58").toDateTime.toString("yyyy/MM/dd")


Name: Unknown Error
Message: <console>:43: error: type DateTime is not a member of object org.joda.time.DateTimeZone
       val stri = new DateTimeZone.DateTime("1485439420.58").toDateTime.toString("yyyy/MM/dd")
                                   ^
StackTrace: 

In [230]:


val test = sqlContext.sql(s"""
Select  min(from_unixtime(time,'YYYY-MM-dd HH:mm:ss')) as premiere_sortie, 
        max(from_unixtime(time,'YYYY-MM-dd HH:mm:ss')) as derniere_sortie,
        max(from_unixtime(time,'YYYY-MM-dd HH:mm:ss')) - min(from_unixtime(time,'YYYY-MM-dd HH:mm:ss')) as duree
from ${table}
group by nom
""")

In [231]:
test.show()

+-------------------+-------------------+-----+
|    premiere_sortie|    derniere_sortie|duree|
+-------------------+-------------------+-----+
|2017-01-26 14:08:41|2017-01-26 14:28:02| null|
|2017-01-26 14:11:32|2017-01-26 14:38:02| null|
|2017-01-26 14:03:22|2017-01-26 14:03:22| null|
|2017-01-26 14:06:59|2017-01-26 14:25:08| null|
|2017-01-26 14:06:43|2017-01-26 14:22:34| null|
|2017-01-26 14:06:43|2017-01-26 14:17:00| null|
|2017-01-26 14:03:43|2017-01-26 14:03:43| null|
|2017-01-26 14:06:43|2017-01-26 14:29:22| null|
|2017-01-26 14:11:29|2017-01-26 14:21:55| null|
|2017-01-26 14:06:59|2017-01-26 14:14:36| null|
|2017-01-26 14:06:48|2017-01-26 14:20:03| null|
|2017-01-26 14:06:48|2017-01-26 14:25:03| null|
|2017-01-26 14:06:58|2017-01-26 14:10:09| null|
|2017-01-26 14:07:53|2017-01-26 14:20:23| null|
|2017-01-26 14:06:44|2017-01-26 14:18:25| null|
|2017-01-26 14:13:48|2017-01-26 14:26:40| null|
|2017-01-26 14:03:36|2017-01-26 14:03:36| null|
|2017-01-26 14:06:43|2017-01-26 14:14:49