# Initialisation des librairies

In [1]:
import os
from pyspark.sql.functions import *
from pyspark.sql.types     import StructType, \
     StructField, FloatType, \
     IntegerType, StringType

# L'objet SPARK Session

In [2]:
# Display the session object bound to `spark`. This notebook never creates it,
# so it is presumably injected by the kernel / PySpark launcher — TODO confirm.
spark

In [3]:
# Display the SparkContext that backs the session.
spark.sparkContext

In [4]:
# Display the Spark version string of the running session.
spark.version

'3.2.0'

# Le chargement des données

In [5]:
!mkdir /home/razvan/Documents/donnees

In [6]:
# Make the data directory the working directory of the Python process, so the
# shell commands that follow run there.
os.chdir("/home/razvan/Documents/donnees")

In [7]:
# Download the station reference file (postesSynop.csv) into the current directory.
# NOTE(review): re-running this cell presumably saves a second copy under a
# suffixed name instead of replacing the file — confirm wget's behavior here.
!wget https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/postesSynop.csv

--2021-12-07 11:52:24--  https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/postesSynop.csv
Résolution de donneespubliques.meteofrance.fr (donneespubliques.meteofrance.fr)... 137.129.43.49
Connexion vers donneespubliques.meteofrance.fr (donneespubliques.meteofrance.fr)|137.129.43.49|:443... connecté.
requête HTTP transmise, en attente de la réponse... 200 OK
Taille : 2618 (2,6K) [text/plain]
Enregistre : «postesSynop.csv»


2021-12-07 11:52:24 (405 MB/s) - «postesSynop.csv» enregistré [2618/2618]



In [8]:
# Print the current working directory to confirm the earlier os.chdir took effect.
!pwd

/home/razvan/Documents/donnees


In [9]:
# List the working directory to check that postesSynop.csv was downloaded.
!ls -al

total 12
drwxrwxr-x 2 razvan razvan 4096 déc.   7 11:52 .
drwxr-xr-x 4 razvan razvan 4096 déc.   7 11:52 ..
-rw-rw-r-- 1 razvan razvan 2618 déc.   7 10:10 postesSynop.csv


In [10]:
# Create the `meteo` sub-directory (relative to the current directory);
# `-p` makes the command a no-op when it already exists.
!mkdir -p meteo

In [11]:
# Switch the working directory to the `meteo` folder so the archive download
# below writes its files there.
os.chdir("/home/razvan/Documents/donnees/meteo")

In [12]:
# List the `meteo` directory before the download.
!ls -al

total 8
drwxrwxr-x 2 razvan razvan 4096 déc.   7 11:52 .
drwxrwxr-x 3 razvan razvan 4096 déc.   7 11:52 ..


In [13]:
# Print the remote chargeMeteo.sh script (curl in silent mode) so that it can be
# read before it is executed.
!curl -s https://raw.githubusercontent.com/rbizoi/AnalyserLesDonneesAvecSpark/main/traitementsMeteo/chargeMeteo.sh

#!/bin/bash
#
for annee in `seq 2019 2020`
do
    for mois in `seq 1 12`
    do
        fichier=`printf "https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archive/synop.%d%02d.csv.gz" $annee $mois`
        wget $fichier
        `printf "gunzip -d synop.%d%02d.csv.gz" $annee $mois`
    done
done


In [14]:
# Fetch chargeMeteo.sh again and run it with bash through process substitution.
# Per the script listing printed by the previous cell, it downloads and gunzips
# the monthly SYNOP archives for 2019-2020 into the current directory.
# NOTE(review): this executes a remote, unpinned script without verification —
# consider saving the reviewed copy locally and running that instead.
!bash <(curl -s https://raw.githubusercontent.com/rbizoi/AnalyserLesDonneesAvecSpark/main/traitementsMeteo/chargeMeteo.sh)

--2021-12-07 11:52:43--  https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archive/synop.201901.csv.gz
Résolution de donneespubliques.meteofrance.fr (donneespubliques.meteofrance.fr)... 137.129.43.49
Connexion vers donneespubliques.meteofrance.fr (donneespubliques.meteofrance.fr)|137.129.43.49|:443... connecté.
requête HTTP transmise, en attente de la réponse... 200 OK
Taille : 668601 (653K) [application/x-gzip]
Enregistre : «synop.201901.csv.gz»


2021-12-07 11:52:44 (1,03 MB/s) - «synop.201901.csv.gz» enregistré [668601/668601]

--2021-12-07 11:52:44--  https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archive/synop.201902.csv.gz
Résolution de donneespubliques.meteofrance.fr (donneespubliques.meteofrance.fr)... 137.129.43.49
Connexion vers donneespubliques.meteofrance.fr (donneespubliques.meteofrance.fr)|137.129.43.49|:443... connecté.
requête HTTP transmise, en attente de la réponse... 200 OK
Taille : 573872 (560K) [application/x-gzip]
Enregistre : «syn

In [15]:
# List the decompressed monthly SYNOP CSV files.
!ls

synop.201901.csv  synop.201907.csv  synop.202001.csv  synop.202007.csv
synop.201902.csv  synop.201908.csv  synop.202002.csv  synop.202008.csv
synop.201903.csv  synop.201909.csv  synop.202003.csv  synop.202009.csv
synop.201904.csv  synop.201910.csv  synop.202004.csv  synop.202010.csv
synop.201905.csv  synop.201911.csv  synop.202005.csv  synop.202011.csv
synop.201906.csv  synop.201912.csv  synop.202006.csv  synop.202012.csv


# Lecture d'un fichier csv

In [16]:
# Preview the first five stations with ID below 8000 as a pandas DataFrame.
#
# Changes from the original cell:
# - The file is addressed by the same absolute `file:` URI as the
#   `donneesStations` cell; the relative "donnees/postesSynop.csv" depended on
#   the process working directory, which the notebook has already changed.
# - `limit(5)` runs in Spark before `toPandas()`, so only five rows are sent to
#   the driver instead of collecting the whole frame and truncating it there.
#
# NOTE(review): `mergeSchema` is documented for Parquet/ORC sources; it is
# presumably ignored by the CSV reader — confirm before relying on it.
(
    spark.read.format("csv")
         .option("sep", ";")
         .option("mergeSchema", "true")
         .option("header", "true")
         .option("nullValue", "mq")   # "mq" cells are read as null
         .load("file:/home/razvan/Documents/donnees/postesSynop.csv")
         .filter("ID<8000")
         .toDF("Station", "Ville", "Latitude", "Longitude", "Altitude")
         .limit(5)
         .toPandas()
)

Unnamed: 0,Station,Ville,Latitude,Longitude,Altitude
0,7005,ABBEVILLE,50.136,1.834,69
1,7015,LILLE-LESQUIN,50.57,3.0975,47
2,7020,PTE DE LA HAGUE,49.725167,-1.939833,6
3,7027,CAEN-CARPIQUET,49.18,-0.456167,67
4,7037,ROUEN-BOOS,49.383,1.181667,151


In [17]:
# Load the station reference CSV (semicolon-separated, with a header row, "mq"
# read as null), keep the rows whose ID is below 8000 and rename the five
# columns positionally.
donneesStations = (
    spark.read
         .format("csv")
         .options(sep=";", mergeSchema="true", header="true", nullValue="mq")
         .load("file:/home/razvan/Documents/donnees/postesSynop.csv")
         .filter("ID<8000")
         .toDF("Station", "Ville", "Latitude", "Longitude", "Altitude")
)

In [18]:
# Print the column names and types; no schema was supplied or inferred for this
# read, and the output below shows every column as string.
donneesStations.printSchema()

root
 |-- Station: string (nullable = true)
 |-- Ville: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Altitude: string (nullable = true)



In [19]:
# Print the first five station rows.
donneesStations.show(5)

+-------+---------------+---------+---------+--------+
|Station|          Ville| Latitude|Longitude|Altitude|
+-------+---------------+---------+---------+--------+
|  07005|      ABBEVILLE|50.136000| 1.834000|      69|
|  07015|  LILLE-LESQUIN|50.570000| 3.097500|      47|
|  07020|PTE DE LA HAGUE|49.725167|-1.939833|       6|
|  07027| CAEN-CARPIQUET|49.180000|-0.456167|      67|
|  07037|     ROUEN-BOOS|49.383000| 1.181667|     151|
+-------+---------------+---------+---------+--------+
only showing top 5 rows



In [20]:
# Persist the station table as Parquet on the local filesystem, replacing any
# previous output at that location.
(
    donneesStations.write
                   .mode('overwrite')
                   .parquet('file:/home/razvan/Documents/donnees/postesSynop.parquet')
)

                                                                                

In [21]:
# Read the station table back from the Parquet output written by the previous cell.
donnees = spark.read.parquet(
    'file:/home/razvan/Documents/donnees/postesSynop.parquet'
)

In [22]:
# List the data directory; the output below shows postesSynop.parquet as a directory.
!ls -al /home/razvan/Documents/donnees

total 20
drwxrwxr-x 4 razvan razvan 4096 déc.   7 11:53 .
drwxr-xr-x 4 razvan razvan 4096 déc.   7 11:52 ..
drwxrwxr-x 2 razvan razvan 4096 déc.   7 11:52 meteo
-rw-rw-r-- 1 razvan razvan 2618 déc.   7 10:10 postesSynop.csv
drwxr-xr-x 2 razvan razvan 4096 déc.   7 11:53 postesSynop.parquet


In [23]:
# Print the first five rows read back from Parquet, to compare with the CSV preview.
donnees.show(5)

+-------+---------------+---------+---------+--------+
|Station|          Ville| Latitude|Longitude|Altitude|
+-------+---------------+---------+---------+--------+
|  07005|      ABBEVILLE|50.136000| 1.834000|      69|
|  07015|  LILLE-LESQUIN|50.570000| 3.097500|      47|
|  07020|PTE DE LA HAGUE|49.725167|-1.939833|       6|
|  07027| CAEN-CARPIQUET|49.180000|-0.456167|      67|
|  07037|     ROUEN-BOOS|49.383000| 1.181667|     151|
+-------+---------------+---------+---------+--------+
only showing top 5 rows



In [24]:
# Query the Parquet directory directly by path from SQL (no table registration)
# and print the row(s) for ABBEVILLE.
spark.sql(
    "select * from parquet."
    "`/home/razvan/Documents/donnees/postesSynop.parquet`  "
    "where Ville = 'ABBEVILLE'"
).show()

+-------+---------+---------+---------+--------+
|Station|    Ville| Latitude|Longitude|Altitude|
+-------+---------+---------+---------+--------+
|  07005|ABBEVILLE|50.136000| 1.834000|      69|
+-------+---------+---------+---------+--------+



## Chargement des Villes

In [25]:
# Explicit schema for the station file: string id and name, float coordinates,
# integer altitude; every field is nullable.
schema = (
    StructType()
    .add('Id',        StringType(),  True)
    .add('ville',     StringType(),  True)
    .add('latitude',  FloatType(),   True)
    .add('longitude', FloatType(),   True)
    .add('altitude',  IntegerType(), True)
)

# Read the station CSV with that schema and mark the result for caching.
villes = (
    spark.read
         .format('csv')
         .options(sep=';', mergeSchema='true', header='true')
         .schema(schema)
         .load("file:/home/razvan/Documents/donnees/postesSynop.csv")
         .cache()
)

In [26]:
@udf("string")
def formatVille(ville):
    """Return a title-cased display name for a raw station label.

    The function is wrapped by the ``@udf("string")`` decorator placed on the
    line just above it, so Spark invokes it once per row.

    Args:
        ville: Raw station label, e.g. ``'LILLE-LESQUIN'``. May be ``None``
            because the ``ville`` column is declared nullable.

    Returns:
        ``None`` when ``ville`` is ``None``. For the labels listed in
        ``conserves`` the whole label is title-cased (hyphen kept). For any
        other label, only the part before the first hyphen is kept and
        title-cased; a label without a hyphen is title-cased as a whole.
    """
    # A null cell reaches the UDF as None; without this guard `ville.find`
    # raises AttributeError and fails the Spark job.
    if ville is None:
        return None

    # Labels whose hyphenated form is kept in full.
    conserves = ('CLERMONT-FD', 'MONT-DE-MARSAN',
                 'ST-PIERRE', 'ST-BARTHELEMY METEO')
    if ville in conserves:
        return ville.title()

    # partition() yields the text before the first '-', or the whole string
    # when there is no '-': same result as the former find()/slice logic.
    return ville.partition('-')[0].title()

# Station table with a lower-case `id` key and the city name cleaned by the
# formatVille UDF; coordinates and altitude pass through unchanged.
villesT = villes.select(
    col('Id').alias('id'),
    formatVille(col('ville')).alias('ville'),
    col('latitude'),
    col('longitude'),
    col('altitude'),
)

## Chargement du répertoire meteo

In [27]:
# Read every CSV file of the `meteo` directory as one DataFrame: semicolon
# separator, header row, "mq" read as null, column types inferred from the data.
meteoDataFrame = (
    spark.read
         .options(sep=';', header='true', nullValue='mq', inferSchema='true')
         .csv('/home/razvan/Documents/donnees/meteo')
         .cache()
)

                                                                                

In [28]:
# Project the raw SYNOP columns onto named, rescaled measurement columns.
#
# The date parts are cut out with explicit `substr(start, length)` calls using
# 1-based positions. The original wrote them as slices (`col('date')[5:2]`),
# which PySpark translates to `substr(5, 2)`: it reads like an (empty) Python
# slice and, for `[0:4]`, relied on Spark treating position 0 like position 1.
# Each column is named with `.alias()` next to its expression rather than
# through a positional `toDF(...)`, so names cannot drift from expressions.
#
# `round` and `coalesce` are the pyspark.sql.functions versions brought in by
# the notebook's star import (this `round` is not the Python builtin).
meteo = meteoDataFrame.select(
    col('numer_sta').alias('id'),
    to_timestamp(col('date').cast('string'), 'yyyyMMddHHmmss').alias('date'),
    col('date').substr(1, 4).cast('int').alias('annee'),      # yyyy
    col('date').substr(5, 2).cast('int').alias('mois'),       # MM
    col('date').substr(7, 2).cast('int').alias('jour'),       # dd
    col('date').substr(5, 4).alias('mois_jour'),              # MMdd, kept as text
    # Shift by 273.15 — presumably Kelvin to degrees Celsius; confirm units.
    round(col('t') - 273.15, 2).alias('temperature'),
    # Presumably percent -> fraction; confirm against the SYNOP documentation.
    (col('u') / 100).alias('humidite'),
    # Presumably metres -> kilometres; confirm.
    (col('vv') / 1000).alias('visibilite'),
    # Presumably Pa -> kPa; confirm.
    (col('pres') / 1000).alias('pression'),
    # First non-null of rr3, then the other rr* columns rescaled by a constant
    # factor (presumably to a 3-hour equivalent — confirm).
    coalesce(col('rr3'),
             col('rr24') / 8,
             col('rr12') / 4,
             col('rr6') / 2,
             col('rr1') * 3).alias('precipitations'),
).cache()

In [29]:
# Print three rows of the derived date parts and rescaled measurements.
(
    meteo
    .select('annee', 'mois', 'jour', 'temperature',
            'humidite', 'visibilite', 'pression')
    .show(3)
)

[Stage 11:>                                                         (0 + 1) / 1]

+-----+----+----+-----------+--------+----------+--------+
|annee|mois|jour|temperature|humidite|visibilite|pression|
+-----+----+----+-----------+--------+----------+--------+
| 2019|  12|   1|        3.7|    0.79|      20.0|  100.86|
| 2019|  12|   1|        2.8|    0.87|     12.23|  101.38|
| 2019|  12|   1|        8.7|    0.75|      10.0|  101.39|
+-----+----+----+-----------+--------+----------+--------+
only showing top 3 rows



                                                                                

In [30]:
# Keep the observations whose station id is below 8000, attach the cleaned city
# name through a join on `id`, and replace remaining hyphens in the name by
# spaces before capitalising each word.
meteoFance = (
    meteo
    .filter('id < 8000')
    .join(villesT, on='id')
    .select(
        initcap(regexp_replace('ville', '-', ' ')).alias('ville'),
        'date', 'annee', 'mois', 'jour',
        'temperature', 'humidite', 'visibilite',
        'pression', 'precipitations',
    )
)

In [31]:
# Write the joined dataset as Parquet, one sub-directory per `annee` value,
# replacing any previous output at that path.
(
    meteoFance.write
              .partitionBy('annee')
              .mode('overwrite')
              .parquet('/home/razvan/Documents/donnees/meteoFrance')
)

                                                                                

In [32]:
# Print the schema of the in-memory joined DataFrame (before it is reloaded from disk).
meteoFance.printSchema()

root
 |-- ville: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- annee: integer (nullable = true)
 |-- mois: integer (nullable = true)
 |-- jour: integer (nullable = true)
 |-- temperature: double (nullable = true)
 |-- humidite: double (nullable = true)
 |-- visibilite: double (nullable = true)
 |-- pression: double (nullable = true)
 |-- precipitations: double (nullable = true)



In [33]:
# Query the partitioned Parquet directory by path and print rows of the 2020
# partition; `annee` is the partition column used when the data was written.
spark.sql(
    "select * from parquet."
    "`/home/razvan/Documents/donnees/meteoFrance` "
    "where annee = 2020"
).show()

+---------------+-------------------+----+----+-----------+--------+----------+--------+--------------+-----+
|          ville|               date|mois|jour|temperature|humidite|visibilite|pression|precipitations|annee|
+---------------+-------------------+----+----+-----------+--------+----------+--------+--------------+-----+
|      Abbeville|2020-10-01 00:00:00|  10|   1|       14.5|     0.9|     17.33|   99.52|           1.6| 2020|
|          Lille|2020-10-01 00:00:00|  10|   1|       15.4|    0.85|     46.41|    99.9|           0.0| 2020|
|Pte De La Hague|2020-10-01 00:00:00|  10|   1|       15.1|    0.75|      14.0|  100.05|           0.0| 2020|
|           Caen|2020-10-01 00:00:00|  10|   1|       14.4|    0.92|      3.83|   99.44|           1.0| 2020|
|          Rouen|2020-10-01 00:00:00|  10|   1|       13.8|    0.95|     41.72|   98.61|           1.0| 2020|
|          Reims|2020-10-01 00:00:00|  10|   1|       13.9|     0.9|      20.0|    99.6|           0.0| 2020|
|         

In [34]:
# Rebind `meteoFance` to the dataset as stored on disk (partitioned Parquet),
# replacing the in-memory join result under the same name.
meteoFance = spark.read.parquet(
    '/home/razvan/Documents/donnees/meteoFrance'
)

In [38]:
# Print the schema of the reloaded DataFrame; in the output below the partition
# column `annee` is listed last.
meteoFance.printSchema()

root
 |-- ville: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- mois: integer (nullable = true)
 |-- jour: integer (nullable = true)
 |-- temperature: double (nullable = true)
 |-- humidite: double (nullable = true)
 |-- visibilite: double (nullable = true)
 |-- pression: double (nullable = true)
 |-- precipitations: double (nullable = true)
 |-- annee: integer (nullable = true)



In [39]:
# List the databases (namespaces) known to the session catalog.
spark.sql("show databases").show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [40]:
# List the tables of the current database; the output below is empty at this point.
spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [None]:
# Create the table `meteo` (if it does not exist yet) from the contents of the
# partitioned Parquet directory, using a CREATE TABLE ... AS SELECT statement.
# NOTE(review): this cell carries no execution count, so it appears never to
# have been run in the saved notebook.
# NOTE(review): `STORED AS ORC` is Hive-style syntax and presumably needs a
# Hive-enabled session — confirm, or consider `USING ORC`.
# NOTE(review): `.show()` on a DDL statement presumably prints an empty result.
spark.sql('CREATE TABLE IF NOT EXISTS meteo STORED AS ORC AS SELECT * FROM parquet.`/home/razvan/Documents/donnees/meteoFrance`').show()