<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import os
from pyspark.sql.functions import *
from pyspark.sql.types     import StructType, \
     StructField, FloatType, \
     IntegerType, StringType

In [2]:
# Display the active Spark session object as the cell output.
# NOTE(review): `spark` is never created in this notebook — presumably it is
# injected by the PySpark kernel/shell; confirm before running elsewhere.
spark

In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types     import StructType, \
     StructField, FloatType, \
     IntegerType, StringType

# Load the raw observations from ';'-separated CSV with a header row. The
# literal 'mq' is read as null, column types are inferred from the data, and
# the resulting DataFrame is cached.
meteoDataFrame = (
    spark.read
    .format('csv')
    .options(sep=';', header='true', nullValue='mq', inferSchema='true')
    .load('../donnees/meteo30')
    .cache()
)

# Explicit schema as (column name, Spark type) pairs, in declaration order.
# Every field is declared nullable.
schema = StructType([
    StructField(name, dataType, True)
    for name, dataType in (
        ('Id', StringType()),
        ('ville', StringType()),
        ('latitude', FloatType()),
        ('longitude', FloatType()),
        ('altitude', IntegerType()),
    )
])

# Load postesSynop.csv (';'-separated, with a header row) using the explicit
# `schema`, so no type inference is performed, and cache the result.
# The 'mergeSchema' option is intentionally not set: it is a Parquet/ORC
# option that the CSV reader does not use, and the schema is given explicitly.
villes = (
    spark.read
    .format('csv')
    .option('sep', ';')
    .option('header', 'true')
    .schema(schema)
    .load('../donnees/meteo/postesSynop.csv')
    .cache()
)

@udf("string")
def formatVille(ville):
    """Return a title-cased display name for a raw `ville` value.

    Names listed in ``keep_whole`` are title-cased in full
    (e.g. 'MONT-DE-MARSAN' -> 'Mont-De-Marsan'). For every other name only
    the part before the first '-' is kept and then title-cased; a name
    without '-' is title-cased in full.

    Args:
        ville: the raw name as a string, or None for a null cell.

    Returns:
        The formatted name, or None when ``ville`` is None.
    """
    # A null cell reaches a Python UDF as None; calling .find()/.title() on it
    # would raise AttributeError, so propagate the null instead.
    if ville is None:
        return None

    keep_whole = ('CLERMONT-FD', 'MONT-DE-MARSAN',
                  'ST-PIERRE', 'ST-BARTHELEMY METEO')
    if ville in keep_whole:
        return ville.title()

    # partition() returns the whole string as its first element when there is
    # no '-', so this single expression covers both the hyphenated and the
    # plain case.
    return ville.partition('-')[0].title()

# Projection of `villes`: `Id` is renamed to lower-case `id`, `ville` is passed
# through the formatVille UDF, and the remaining three columns are kept as-is.
villesT = villes.select(
    col('Id').alias('id'),
    formatVille(col('ville')).alias('ville'),
    col('latitude'),
    col('longitude'),
    col('altitude'),
)


# Project the raw observations onto the analysis columns and cache the result.
#
# The date parts are cut out of `date` with substr(start, length); the whole
# value is also parsed as a 'yyyyMMddHHmmss' timestamp.
# NOTE(review): the scalings below presumably convert Kelvin -> Celsius,
# percent -> fraction, and m / Pa -> km / kPa — confirm against the dataset's
# documentation.
# `precipitations` takes the first non-null of rr3, rr24/8, rr12/4, rr6/2,
# rr1*3 (presumably each rescaled to a 3-hour amount — confirm).
meteo = (
    meteoDataFrame
    .select(
        col('numer_sta').alias('id'),
        to_timestamp(col('date').cast('string'), 'yyyyMMddHHmmss').alias('date'),
        col('date').substr(0, 4).cast('int').alias('annee'),
        col('date').substr(5, 2).cast('int').alias('mois'),
        col('date').substr(7, 2).cast('int').alias('jour'),
        col('date').substr(5, 4).alias('mois_jour'),
        round(col('t') - 273.15, 2).alias('temperature'),
        (col('u') / 100).alias('humidite'),
        (col('vv') / 1000).alias('visibilite'),
        (col('pres') / 1000).alias('pression'),
        coalesce(
            col('rr3'),
            col('rr24') / 8,
            col('rr12') / 4,
            col('rr6') / 2,
            col('rr1') * 3,
        ).alias('precipitations'),
    )
    .cache()
)

# Preview the first three rows of the date parts and the rescaled measurements.
meteo.select(
    ['annee', 'mois', 'jour', 'temperature', 'humidite', 'visibilite', 'pression']
).show(3)

+-----+----+----+-----------+--------+----------+--------+
|annee|mois|jour|temperature|humidite|visibilite|pression|
+-----+----+----+-----------+--------+----------+--------+
| 2019|  12|   1|        3.7|    0.79|      20.0|  100.86|
| 2019|  12|   1|        2.8|    0.87|     12.23|  101.38|
| 2019|  12|   1|        8.7|    0.75|      10.0|  101.39|
+-----+----+----+-----------+--------+----------+--------+
only showing top 3 rows



In [5]:
# Persist `meteo` as Parquet partitioned by `annee`, overwriting any previous
# output at the target path.
(
    meteo.write
    .format('parquet')
    .mode('overwrite')
    .partitionBy('annee')
    .save('../donnees/meteo/meteoFrance')
)

In [7]:
# Query the Parquet directory directly by path (no table registration needed)
# and show the first five rows of the 2020 partition.
spark.sql(
    "select * from parquet.`../donnees/meteo/meteoFrance` where annee = 2020"
).show(5)

+----+-------------------+----+----+---------+-----------+--------+----------+--------+--------------+-----+
|  id|               date|mois|jour|mois_jour|temperature|humidite|visibilite|pression|precipitations|annee|
+----+-------------------+----+----+---------+-----------+--------+----------+--------+--------------+-----+
|7005|2020-08-01 00:00:00|   8|   1|     0801|       17.5|    0.95|     19.26|  100.69|           0.0| 2020|
|7015|2020-08-01 00:00:00|   8|   1|     0801|       20.7|    0.67|     53.38|  100.85|           0.0| 2020|
|7020|2020-08-01 00:00:00|   8|   1|     0801|       17.1|    0.97|       8.0|  101.49|           0.0| 2020|
|7027|2020-08-01 00:00:00|   8|   1|     0801|       19.3|    0.87|      60.0|  100.86|           0.0| 2020|
|7037|2020-08-01 00:00:00|   8|   1|     0801|       17.8|     0.9|     19.63|   99.81|           0.0| 2020|
+----+-------------------+----+----+---------+-----------+--------+----------+--------+--------------+-----+
only showing top 5 

In [8]:
# Load the partitioned Parquet dataset from ../donnees/meteo/meteoFrance.
meteoFrance = spark.read.format('parquet').load('../donnees/meteo/meteoFrance')
# Backward-compatible alias: the variable was originally spelled `meteoFance`.
meteoFance = meteoFrance

In [17]:
# Create the `cours` database if it does not exist yet. The statement returns
# an empty result, so show() prints an empty table.
spark.sql('CREATE DATABASE IF NOT EXISTS cours').show()

++
||
++
++



In [18]:
# List the databases known to the session's catalog.
spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|    cours|
|  default|
+---------+



In [19]:
# Materialise the Parquet dataset as an ORC-backed table named `meteo`
# (create-table-as-select; skipped when the table already exists).
# NOTE(review): no database is selected before this statement, so the table is
# created in the current database (`default`), not in `cours`.
spark.sql(
    'CREATE TABLE IF NOT EXISTS meteo STORED AS ORC AS '
    'SELECT * FROM parquet.`../donnees/meteo/meteoFrance`'
).show()

++
||
++
++



In [20]:
# List the tables of the current database.
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|    meteo|      false|
+---------+---------+-----------+



In [21]:
# Read every column of the `meteo` table for the year 2020.
# show() without an argument prints only the first rows (20 by default).
spark.sql("""SELECT *
            FROM meteo
            WHERE ANNEE = 2020""").show()

+----+-------------------+----+----+---------+-----------+--------+----------+--------+--------------+-----+
|  id|               date|mois|jour|mois_jour|temperature|humidite|visibilite|pression|precipitations|annee|
+----+-------------------+----+----+---------+-----------+--------+----------+--------+--------------+-----+
|7005|2020-08-01 00:00:00|   8|   1|     0801|       17.5|    0.95|     19.26|  100.69|           0.0| 2020|
|7015|2020-08-01 00:00:00|   8|   1|     0801|       20.7|    0.67|     53.38|  100.85|           0.0| 2020|
|7020|2020-08-01 00:00:00|   8|   1|     0801|       17.1|    0.97|       8.0|  101.49|           0.0| 2020|
|7027|2020-08-01 00:00:00|   8|   1|     0801|       19.3|    0.87|      60.0|  100.86|           0.0| 2020|
|7037|2020-08-01 00:00:00|   8|   1|     0801|       17.8|     0.9|     19.63|   99.81|           0.0| 2020|
|7072|2020-08-01 00:00:00|   8|   1|     0801|       22.4|    0.78|      20.0|  100.42|           2.6| 2020|
|7110|2020-08-01 00

In [22]:
# Per-year averages of temperature, humidity, visibility and pressure from the
# `meteo` table. There is no ORDER BY, so the row order of the output is not
# guaranteed.
spark.sql("""SELECT annee, 
                    avg(temperature) temperature, 
                    avg(humidite) humidite, 
                    avg(visibilite) visibilite, 
                    avg(pression) pression
            FROM meteo
            GROUP BY ANNEE""").show()

+-----+------------------+------------------+------------------+------------------+
|annee|       temperature|          humidite|        visibilite|          pression|
+-----+------------------+------------------+------------------+------------------+
| 2003|14.888266247864602| 0.741586757490121|19.793412606456336| 99.70438287582083|
| 2007|14.955878142458142|0.7754617347834829|20.099471214811043| 99.82876070624651|
| 2015|14.204845295660801|0.7475355315160201|26.591579830541605| 99.98739669100065|
| 2013|14.263538136576747|0.7621931988781137|24.263107410182634| 99.76092405796415|
| 1997| 14.89913250559042| 0.767484417911904|17.773680462295996| 99.63939407093666|
| 2019|15.479840961873744|0.7458669954184782| 26.52829253250908| 99.92134292857976|
| 1998| 14.69787684670146|0.7690312430702502| 18.07777156381201| 99.71812050353148|
| 2020|16.862478665118193|0.7412149554933901| 28.09425989647236| 99.90010034101897|
| 2012|  14.5598693053622|0.7528102868351927|24.725182310235276| 99.75265946

In [23]:
# Make `cours` the session's current database for the following statements.
spark.sql('USE cours').show()

++
||
++
++



In [24]:
# List the tables of the current database (now `cours`).
# NOTE(review): the `meteo` table was created before the switch to `cours`, so
# it lives in `default` and is not listed here.
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+

