<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Initialise-données" data-toc-modified-id="Initialise-données-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Initialise données</a></span></li><li><span><a href="#partitionBy" data-toc-modified-id="partitionBy-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>partitionBy</a></span></li><li><span><a href="#orderBy" data-toc-modified-id="orderBy-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>orderBy</a></span></li><li><span><a href="#rowsBetween" data-toc-modified-id="rowsBetween-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>rowsBetween</a></span></li></ul></div>

In [1]:
import os
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
# Display the session object. No assignment to `spark` appears before this cell,
# so this presumably relies on a kernel that predefines it — TODO confirm.
spark

# Initialise données

In [3]:
from pyspark.sql.functions import *
from pyspark.sql.types     import StructType, \
     StructField, FloatType, \
     IntegerType, StringType

# `SparkSession` is not guaranteed to be provided by the star imports above;
# import it explicitly so this cell does not depend on a name injected by the kernel.
from pyspark.sql import SparkSession

# Build the session, or reuse the one that already exists (`getOrCreate`).
# NOTE(review): if a session is already running, these package/extension settings
# may not be applied to it — confirm whether Delta support is actually needed here.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.8.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .getOrCreate()
)

# Raw observation files: semicolon-separated CSV with a header row.
# The token "mq" is read as NULL and column types are inferred from the data.
meteoDataFrame = (
    spark.read
    .format('csv')
    .option('sep', ';')
    .option('header', 'true')
    .option('nullValue', 'mq')
    .option('inferSchema', 'true')
    .load('donnees/meteo')
    .cache()
)

# printSchema() prints every column with its inferred type; the former bare
# `meteoDataFrame.columns` expression was discarded mid-cell and showed nothing.
meteoDataFrame.printSchema()

# Explicit schema for the station file: (column name, Spark type), all nullable.
champsVilles = [
    ('Id',        StringType()),
    ('ville',     StringType()),
    ('latitude',  FloatType()),
    ('longitude', FloatType()),
    ('altitude',  IntegerType()),
]
schema = StructType([StructField(nom, genre, True) for nom, genre in champsVilles])

# Station reference data, read with the schema above instead of inference.
# NOTE(review): `mergeSchema` is passed through unchanged; confirm the CSV reader uses it.
villes = (
    spark.read
    .format('csv')
    .options(sep=';', mergeSchema='true', header='true')
    .schema(schema)
    .load('/user/spark/donnees/postesSynop.csv')
    .cache()
)

@udf("string")
def formatVille(ville):
    """Return a title-cased station name, truncated at the first hyphen.

    Names listed in ``nomsComplets`` keep their full text (only title-cased).
    Any other name containing ``-`` is cut just before the first hyphen and
    then title-cased; names without a hyphen are title-cased as a whole.

    Parameters
    ----------
    ville : str or None
        Raw station name.

    Returns
    -------
    str or None
        The formatted name, or None when the input is None.
    """
    # The `ville` column is declared nullable; a null arrives here as None and
    # would otherwise raise AttributeError on `.find` / `.title`.
    if ville is None:
        return None

    nomsComplets = ('CLERMONT-FD', 'MONT-DE-MARSAN',
                    'ST-PIERRE', 'ST-BARTHELEMY METEO')
    if ville in nomsComplets:
        return ville.title()

    # partition() yields the whole string when no hyphen is present, so a single
    # expression covers both the "hyphen" and "no hyphen" cases.
    return ville.partition('-')[0].title()

# Station table with a lower-case join key and the name passed through formatVille.
colonnesVilles = [
    col('Id').alias('id'),
    formatVille(col('ville')).alias('ville'),
]
villesT = villes.select(*colonnesVilles, 'latitude', 'longitude', 'altitude')


# Projection of the raw observations into the working columns.
# Date parts are cut out of the `date` value with substr(position, length).
dateBrute = col('date')

# First non-null precipitation reading among the accumulation columns
# (presumably each one is rescaled to a 3-hour equivalent — TODO confirm).
precipitations = coalesce(
    col('rr3'),
    col('rr24') / 8,
    col('rr12') / 4,
    col('rr6') / 2,
    col('rr1') * 3,
)

meteo = meteoDataFrame.select(
    col('numer_sta').alias('id'),
    to_timestamp(dateBrute.cast('string'), 'yyyyMMddHHmmss').alias('date'),
    dateBrute.substr(0, 4).cast('int').alias('annee'),
    dateBrute.substr(5, 2).cast('int').alias('mois'),
    dateBrute.substr(7, 2).cast('int').alias('jour'),
    dateBrute.substr(5, 4).alias('mois_jour'),
    # presumably Kelvin to Celsius — TODO confirm the unit of `t`
    round(col('t') - 273.15, 2).alias('temperature'),
    (col('u') / 100).alias('humidite'),
    (col('vv') / 1000).alias('visibilite'),
    (col('pres') / 1000).alias('pression'),
    precipitations.alias('precipitations'),
).cache()

# Quick look at the first rows of the derived columns.
apercu = ['annee', 'mois', 'jour', 'temperature', 'humidite', 'visibilite', 'pression']
meteo.select(*apercu).show(3)

# Projection shared by both exports: readable city name plus the measurements.
villeLisible = initcap(regexp_replace('ville', '-', ' ')).alias('ville')
colonnesMesures = ['annee', 'mois', 'jour', 'temperature',
                   'humidite', 'visibilite', 'pression', 'precipitations']

# Stations whose id is below 8000 (presumably metropolitan France — TODO confirm).
meteoFance = (
    meteo.where('id < 8000')
    .join(villesT, 'id')
    .select(villeLisible, *colonnesMesures)
)

# Persist the filtered data set as parquet, one directory per year.
(
    meteoFance.write
    .mode('overwrite')
    .format('parquet')
    .partitionBy('annee')
    .save('/user/spark/donnees/meteoFrance')
)

# Same export without the station filter.
(
    meteo.join(villesT, 'id')
    .select(villeLisible, *colonnesMesures)
    .write
    .mode('overwrite')
    .format('parquet')
    .partitionBy('annee')
    .save('/user/spark/donnees/meteoGlobal')
)

# Two small demonstration frames with overlapping city lists; the second column
# carries the name of the frame each row comes from.
villesA = ['Ajaccio', 'Angers', 'Angoulème', 'Besançon', 'Biarritz', 'Bordeaux',
           'Brest', 'Caen', 'Clermont-Fd', 'Dijon', 'Embrun', 'Grenoble',
           'Lille', 'Limoges', 'Lyon', 'Marseille', 'Montpellier', 'Nancy',
           'Nantes', 'Nice', 'Nîmes', 'Orléans', 'Paris']

data = [(nom, 'dfa') for nom in villesA]
dfa = spark.sparkContext.parallelize(data).toDF(['ville', 'valeur'])

villesB = ['Nancy', 'Nantes', 'Nice', 'Nîmes', 'Orléans', 'Paris',
           'Perpignan', 'Poitiers', 'Reims', 'Rennes', 'Rouen', 'St-Quentin',
           'Strasbourg', 'Toulon', 'Toulouse', 'Tours', 'Vichy']

data = [(nom, 'dfb') for nom in villesB]
dfb = spark.sparkContext.parallelize(data).toDF(['ville', 'valeur'])

root
 |-- numer_sta: integer (nullable = true)
 |-- date: long (nullable = true)
 |-- pmer: integer (nullable = true)
 |-- tend: integer (nullable = true)
 |-- cod_tend: integer (nullable = true)
 |-- dd: integer (nullable = true)
 |-- ff: double (nullable = true)
 |-- t: double (nullable = true)
 |-- td: double (nullable = true)
 |-- u: integer (nullable = true)
 |-- vv: integer (nullable = true)
 |-- ww: integer (nullable = true)
 |-- w1: integer (nullable = true)
 |-- w2: integer (nullable = true)
 |-- n: integer (nullable = true)
 |-- nbas: integer (nullable = true)
 |-- hbas: integer (nullable = true)
 |-- cl: integer (nullable = true)
 |-- cm: integer (nullable = true)
 |-- ch: integer (nullable = true)
 |-- pres: integer (nullable = true)
 |-- niv_bar: integer (nullable = true)
 |-- geop: integer (nullable = true)
 |-- tend24: integer (nullable = true)
 |-- tn12: double (nullable = true)
 |-- tn24: double (nullable = true)
 |-- tx12: double (nullable = true)
 |-- tx24: double (n

In [5]:
from pyspark.sql.functions import *
from pyspark.sql import Window

# Rebuild meteoFance from the raw station table (only the key is renamed here,
# the station name is not passed through formatVille).
stations = villes.withColumnRenamed('Id', 'id')
villeLisible = initcap(regexp_replace('ville', '-', ' ')).alias('ville')

meteoFance = (
    meteo.where('id < 8000')
    .join(stations, 'id')
    .select(villeLisible,
            'annee', 'mois', 'jour', 'temperature',
            'humidite', 'visibilite', 'pression', 'precipitations')
)

# Number of rows kept after the filter and join.
meteoFance.count()

2941041

In [6]:
# Show the data with one-letter column aliases so the table stays narrow.
abreviations = [('temperature', 't'), ('humidite', 'h'), ('visibilite', 'v'),
                ('pression', 'p'), ('precipitations', 'e')]
expressions = [nom + ' as ' + court for nom, court in abreviations]

meteoFance.selectExpr('ville', 'annee', 'mois', 'jour', *expressions).show()

+-------------------+-----+----+----+---+----+-----+------+----+
|              ville|annee|mois|jour|  t|   h|    v|     p|   e|
+-------------------+-----+----+----+---+----+-----+------+----+
|          Abbeville| 2019|  12|   1|3.7|0.79| 20.0|100.86| 0.0|
|      Lille Lesquin| 2019|  12|   1|2.8|0.87|12.23|101.38| 0.0|
|    Pte De La Hague| 2019|  12|   1|8.7|0.75| 10.0|101.39| 0.0|
|     Caen Carpiquet| 2019|  12|   1|4.9| 0.8|30.18|100.62| 0.0|
|         Rouen Boos| 2019|  12|   1|3.5|0.84|39.54| 99.68| 0.0|
|       Reims Prunay| 2019|  12|   1|1.7|0.89| 20.0|100.53| 0.0|
|     Brest Guipavas| 2019|  12|   1|7.1|0.91| 30.3|100.09| 0.0|
|        Ploumanac'h| 2019|  12|   1|8.0|0.95| null|100.61| 2.0|
|  Rennes St Jacques| 2019|  12|   1|6.2|0.92|18.06| 100.7| 2.0|
|            Alencon| 2019|  12|   1|4.3|0.89|13.52| 99.52|-0.1|
|               Orly| 2019|  12|   1|4.7|0.77|17.23| 100.4| 0.0|
|    Troyes Barberey| 2019|  12|   1|3.9|0.83| 20.0| 100.1| 0.0|
|        Nancy Ochey| 201

In [7]:
# Observations for Mont De Marsan during the first ten days of November 2019.
filtreMM = "ville = 'Mont De Marsan' and \
                                annee = 2019 and mois = 11 and jour < 11"
colonnesMM = ['jour', 'temperature', 'humidite', 'visibilite',
              'pression', 'precipitations']

meteoMM = meteoFance.where(filtreMM).select(*colonnesMM)
meteoMM.show()

+----+-----------+--------+----------+--------+--------------+
|jour|temperature|humidite|visibilite|pression|precipitations|
+----+-----------+--------+----------+--------+--------------+
|   1|        8.9|    0.97|      3.77|  101.26|           0.0|
|   1|       10.6|    0.99|     11.17|  101.07|           0.0|
|   1|       12.9|    0.99|      4.58|  100.93|           0.2|
|   1|       15.9|    0.99|     29.86|  100.87|           5.0|
|   1|       18.3|    0.96|     11.98|  100.66|           1.2|
|   1|       20.8|    0.82|      60.0|  100.41|          -0.1|
|   1|       17.6|    0.97|     11.57|  100.39|           7.9|
|   1|       16.9|    0.95|      60.0|  100.31|           0.4|
|   2|       16.4|    0.95|     49.88|   100.1|           1.4|
|   2|       16.4|    0.97|      5.17|   99.98|           4.6|
|   2|       15.8|    0.97|       1.7|   99.95|           4.0|
|   2|       16.2|    0.96|     19.65|   99.95|           2.5|
|   2|       17.3|    0.85|     50.13|   99.76|        

# partitionBy
<img src="https://raw.githubusercontent.com/rbizoi/AnalyserLesDonneesAvecSpark/main/DataFrameSpark/images/M06-10.png" width="400">   

In [8]:
# A window with no partition column: every row sees the whole data set,
# so the sum below is the grand total repeated on each line.
window = Window.partitionBy()
sommeTotale = sum('precipitations').over(window).alias("somme totale")

meteoMM.select('jour', 'precipitations', sommeTotale).show(5)

+----+--------------+------------+
|jour|precipitations|somme totale|
+----+--------------+------------+
|   1|           0.0|       215.2|
|   1|           0.0|       215.2|
|   1|           0.2|       215.2|
|   1|           5.0|       215.2|
|   1|           1.2|       215.2|
+----+--------------+------------+
only showing top 5 rows



In [9]:
# Same totals obtained with plain aggregation, for comparison with the window
# version: one line per day, then a final line (jour = null) with the grand total.
# The sort that used to sit between the two aggregations was dropped: the rollup
# re-groups the rows, so only the final orderBy determines the displayed order.
(
    meteoMM
    .groupBy('jour')
    .agg(round(sum('precipitations'), 2).alias('precipitations'))
    .rollup('jour')
    .agg(round(sum('precipitations'), 2).alias('precipitations'))
    .orderBy(col('jour').asc_nulls_last())   # keep the total line at the bottom
    .show()
)

+----+--------------+
|jour|precipitations|
+----+--------------+
|   1|          14.6|
|   2|          18.1|
|   3|          16.5|
|   4|          31.5|
|   5|          52.5|
|   6|          21.1|
|   7|          22.1|
|   8|          26.1|
|   9|           5.8|
|  10|           6.9|
|null|         215.2|
+----+--------------+



In [10]:
# Two windows over the same rows: the whole data set, and one partition per day.
total = Window.partitionBy()
jour  = Window.partitionBy('jour')

sommeJour   = round(sum('precipitations').over(jour), 2).alias("somme journalière")
sommeTotale = sum('precipitations').over(total).alias("somme totale")

meteoMM.select('jour', 'precipitations', sommeJour, sommeTotale).show()

+----+--------------+-----------------+------------+
|jour|precipitations|somme journalière|somme totale|
+----+--------------+-----------------+------------+
|   1|           0.0|             14.6|       215.2|
|   1|           0.0|             14.6|       215.2|
|   1|           0.2|             14.6|       215.2|
|   1|           5.0|             14.6|       215.2|
|   1|           1.2|             14.6|       215.2|
|   1|          -0.1|             14.6|       215.2|
|   1|           7.9|             14.6|       215.2|
|   1|           0.4|             14.6|       215.2|
|   2|           1.4|             18.1|       215.2|
|   2|           4.6|             18.1|       215.2|
|   2|           4.0|             18.1|       215.2|
|   2|           2.5|             18.1|       215.2|
|   2|           0.8|             18.1|       215.2|
|   2|           0.6|             18.1|       215.2|
|   2|           1.0|             18.1|       215.2|
|   2|           3.2|             18.1|       

In [11]:
# Daily figures for Mont De Marsan over every available year:
# mean temperature and total precipitation per (annee, mois, jour).
clesJour = ['ville', 'annee', 'mois', 'jour']

meteoMM = (
    meteoFance.where("ville = 'Mont De Marsan'")
    .select(*clesJour, 'temperature', 'precipitations')
    .groupBy(*clesJour)
    .agg(round(avg('temperature'), 2).alias('temperature'),
         round(sum('precipitations'), 2).alias('precipitations'))
    .select('annee', 'mois', 'jour', 'temperature', 'precipitations')
)
meteoMM.show(5)

+-----+----+----+-----------+--------------+
|annee|mois|jour|temperature|precipitations|
+-----+----+----+-----------+--------------+
| 2007|   7|  12|      19.58|           0.0|
| 2010|   5|  25|      22.31|           0.2|
| 2016|   1|   2|       10.1|          23.9|
| 2010|   2|   9|       3.05|           0.2|
| 1997|   6|  30|      14.66|          28.8|
+-----+----+----+-----------+--------------+
only showing top 5 rows



In [12]:
# One window per combination of date parts used as the partition key.
jourMoisAnnee = Window.partitionBy('jour', 'mois', 'annee')
jourMois      = Window.partitionBy('jour', 'mois')
moisAnnee     = Window.partitionBy('mois', 'annee')
mois          = Window.partitionBy('mois')
annee         = Window.partitionBy('annee')
jour          = Window.partitionBy('jour')

# (output column, window) pairs, in display order s1..s6.
fenetres = [('s1', jourMoisAnnee), ('s2', jourMois), ('s3', moisAnnee),
            ('s4', mois), ('s5', annee), ('s6', jour)]
sommes = [round(sum('precipitations').over(fenetre), 2).alias(nom)
          for nom, fenetre in fenetres]

meteoMM.select('jour', 'mois', 'annee',
               col('precipitations').alias('prec'),
               *sommes)\
       .show(28)

+----+----+-----+----+----+-----+------+-------+-------+------+
|jour|mois|annee|prec|  s1|   s2|    s3|     s4|     s5|    s6|
+----+----+-----+----+----+-----+------+-------+-------+------+
|  24|  12| 1996|2.38|2.38|17.33| 61.16|1775.66|  851.9|531.75|
|  24|  12| 1997| 0.0| 0.0|17.33| 95.53|1775.66| 956.12|531.75|
|  24|  12| 1998| 0.9| 0.9|17.33| 16.13|1775.66|1045.37|531.75|
|  24|  12| 1999|0.25|0.25|17.33|133.79|1775.66|1124.75|531.75|
|  24|  12| 2000|3.45|3.45|17.33| 86.75|1775.66|1262.65|531.75|
|  24|  12| 2001| 0.0| 0.0|17.33| 35.85|1775.66| 818.99|531.75|
|  24|  12| 2002|0.35|0.35|17.33| 94.74|1775.66| 768.37|531.75|
|  24|  12| 2003| 0.6| 0.6|17.33| 66.81|1775.66| 887.54|531.75|
|  24|  12| 2004| 2.2| 2.2|17.33|  86.6|1775.66| 784.89|531.75|
|  24|  12| 2005| 0.0| 0.0|17.33| 108.8|1775.66|  740.2|531.75|
|  24|  12| 2006| 0.0| 0.0|17.33|  59.2|1775.66|  762.2|531.75|
|  24|  12| 2007| 0.0| 0.0|17.33|  70.0|1775.66|  799.2|531.75|
|  24|  12| 2008| 0.0| 0.0|17.33|  86.6|

In [13]:
# Cross-check of the s1 window (jour, mois, annee) with a plain filtered sum.
(
    meteoMM
    .where('annee = 1997 and mois = 12 and jour = 31')
    .agg(sum('precipitations').alias('s1'))
    .show()
)

+---+
| s1|
+---+
|7.4|
+---+



In [14]:
# Cross-check of the s2 window (jour, mois): every 31 December, all years.
(
    meteoMM
    .where('mois = 12 and jour = 31')
    .agg(sum('precipitations').alias('s2'))
    .show()
)

+-----+
|   s2|
+-----+
|75.53|
+-----+



In [15]:
# Cross-check of the s3 window (mois, annee): December 1996.
# The result is not rounded here, unlike the windowed column.
(
    meteoMM
    .where('annee = 1996 and mois = 12')
    .agg(sum('precipitations').alias('s3'))
    .show()
)

+-----------------+
|               s3|
+-----------------+
|61.15999999999999|
+-----------------+



In [16]:
# Cross-check of the s4 window (mois): every December, all years.
(
    meteoMM
    .where('mois  = 12')
    .agg(sum('precipitations').alias('s4'))
    .show()
)

+-------+
|     s4|
+-------+
|1775.66|
+-------+



In [17]:
# Cross-check of the s5 window (annee): the whole of 1996.
(
    meteoMM
    .where('annee = 1996')
    .agg(sum('precipitations').alias('s5'))
    .show()
)

+-----------------+
|               s5|
+-----------------+
|851.9000000000001|
+-----------------+



In [18]:
# Cross-check of the s6 window (jour): every 31st of a month, all years.
(
    meteoMM
    .where('jour = 31')
    .agg(sum('precipitations').alias('s6'))
    .show()
)

+------------------+
|                s6|
+------------------+
|402.73999999999984|
+------------------+



In [19]:
# Grand total of precipitation over the whole meteoMM frame (no filter, no alias).
totalPrecipitations = sum('precipitations')
meteoMM.agg(totalPrecipitations).show()

+-------------------+
|sum(precipitations)|
+-------------------+
|  21804.02999999999|
+-------------------+



# orderBy

<img src="https://raw.githubusercontent.com/rbizoi/AnalyserLesDonneesAvecSpark/main/DataFrameSpark/images/M06-11.png" width="400">  

In [20]:
# Same partition (one per month), with and without an ordering on the day.
# Without orderBy the aggregate covers the whole month; with orderBy it becomes
# a running value that grows day by day.
jour    = Window.partitionBy('mois')
jourOby = Window.partitionBy('mois').orderBy('jour')

moyenneCumulee = round(avg('temperature').over(jourOby), 2).alias('s1')
moyenneMois    = round(avg('temperature').over(jour), 2).alias('s2')
sommeCumulee   = round(sum('precipitations').over(jourOby), 2).alias('s3')
sommeMois      = round(sum('precipitations').over(jour), 2).alias('s4')

meteo2019 = meteoMM.where("annee = 2019")
meteo2019.select('mois', 'jour',
                 col('temperature').alias('temp'), moyenneCumulee, moyenneMois,
                 col('precipitations').alias('prec'), sommeCumulee, sommeMois)\
         .show(32)

+----+----+-----+----+----+----+----+----+
|mois|jour| temp|  s1|  s2|prec|  s3|  s4|
+----+----+-----+----+----+----+----+----+
|  12|   1|  6.1| 6.1|8.89|-0.1|-0.1|98.7|
|  12|   2| 6.57|6.34|8.89|-0.1|-0.2|98.7|
|  12|   3| 1.81|4.83|8.89| 0.0|-0.2|98.7|
|  12|   4| 3.39|4.47|8.89| 0.0|-0.2|98.7|
|  12|   5| 6.05|4.78|8.89|-0.1|-0.3|98.7|
|  12|   6| 5.23|4.86|8.89| 0.0|-0.3|98.7|
|  12|   7| 8.05|5.31|8.89|-0.1|-0.4|98.7|
|  12|   8|  9.4|5.83|8.89| 5.2| 4.8|98.7|
|  12|   9|10.36|6.33|8.89| 5.4|10.2|98.7|
|  12|  10| 8.01| 6.5|8.89| 0.0|10.2|98.7|
|  12|  11| 8.51|6.68|8.89| 7.8|18.0|98.7|
|  12|  12| 9.66|6.93|8.89|25.0|43.0|98.7|
|  12|  13|13.55|7.44|8.89|25.0|68.0|98.7|
|  12|  14|13.74|7.89|8.89|-0.1|67.9|98.7|
|  12|  15|12.65|8.21|8.89| 0.0|67.9|98.7|
|  12|  16|10.35|8.34|8.89| 0.0|67.9|98.7|
|  12|  17|10.99| 8.5|8.89| 0.0|67.9|98.7|
|  12|  18|10.66|8.62|8.89| 0.1|68.0|98.7|
|  12|  19|11.63|8.77|8.89|-0.1|67.9|98.7|
|  12|  20|12.23|8.95|8.89| 6.3|74.2|98.7|
|  12|  21|

In [21]:
# Running sum restarted every month (jourPOby) versus a running sum carried
# across the whole year (jourOby, ordered by month then day, no partition).
jourPOby = Window.partitionBy('mois').orderBy('jour')
jourOby  = Window.orderBy('mois', 'jour')

cumulMensuel = round(sum('precipitations').over(jourPOby), 2).alias('s3')
cumulAnnuel  = round(sum('precipitations').over(jourOby), 2).alias('s4')

meteoMM.where("annee = 2019")\
       .select('mois', 'jour',
               col('precipitations').alias('prec'),
               cumulMensuel,
               cumulAnnuel)\
       .show(35)

+----+----+----+----+-----+
|mois|jour|prec|  s3|   s4|
+----+----+----+----+-----+
|   1|   1|-0.3|-0.3| -0.3|
|   1|   2|-0.1|-0.4| -0.4|
|   1|   3| 0.0|-0.4| -0.4|
|   1|   4| 0.0|-0.4| -0.4|
|   1|   5| 0.0|-0.4| -0.4|
|   1|   6| 0.0|-0.4| -0.4|
|   1|   7| 0.0|-0.4| -0.4|
|   1|   8| 0.3|-0.1| -0.1|
|   1|   9| 0.1| 0.0|  0.0|
|   1|  10| 0.0| 0.0|  0.0|
|   1|  11| 0.0| 0.0|  0.0|
|   1|  12|-0.3|-0.3| -0.3|
|   1|  13| 2.2| 1.9|  1.9|
|   1|  14| 0.8| 2.7|  2.7|
|   1|  15| 0.0| 2.7|  2.7|
|   1|  16| 0.0| 2.7|  2.7|
|   1|  17| 2.8| 5.5|  5.5|
|   1|  18|-0.3| 5.2|  5.2|
|   1|  19|-0.3| 4.9|  4.9|
|   1|  20| 0.6| 5.5|  5.5|
|   1|  21|-0.1| 5.4|  5.4|
|   1|  22| 7.1|12.5| 12.5|
|   1|  23| 9.4|21.9| 21.9|
|   1|  24| 0.0|21.9| 21.9|
|   1|  25|-0.2|21.7| 21.7|
|   1|  26|-0.2|21.5| 21.5|
|   1|  27| 5.6|27.1| 27.1|
|   1|  28|13.7|40.8| 40.8|
|   1|  29| 4.2|45.0| 45.0|
|   1|  30| 8.8|53.8| 53.8|
|   1|  31|44.2|98.0| 98.0|
|   2|   1|15.2|15.2|113.2|
|   2|   2| 6.9|22.1

# rowsBetween

<img src="https://raw.githubusercontent.com/rbizoi/AnalyserLesDonneesAvecSpark/main/DataFrameSpark/images/M06-12.png" width="400">  

In [22]:
# Explicit frame: from the first row of the month up to the current row,
# counted in rows after ordering by day.
jourOby = (
    Window
    .partitionBy('mois')
    .orderBy('jour')
    .rowsBetween(start=Window.unboundedPreceding, end=Window.currentRow)
)

In [23]:
# Month-to-date precipitation for 2019, using the `jourOby` window
# defined in the previous cell.
cumulMois = round(sum('precipitations').over(jourOby), 2).alias('s3')

meteoMM.where("annee = 2019")\
       .select('mois', 'jour', col('precipitations').alias('prec'), cumulMois)\
       .show(32)

+----+----+----+----+
|mois|jour|prec|  s3|
+----+----+----+----+
|  12|   1|-0.1|-0.1|
|  12|   2|-0.1|-0.2|
|  12|   3| 0.0|-0.2|
|  12|   4| 0.0|-0.2|
|  12|   5|-0.1|-0.3|
|  12|   6| 0.0|-0.3|
|  12|   7|-0.1|-0.4|
|  12|   8| 5.2| 4.8|
|  12|   9| 5.4|10.2|
|  12|  10| 0.0|10.2|
|  12|  11| 7.8|18.0|
|  12|  12|25.0|43.0|
|  12|  13|25.0|68.0|
|  12|  14|-0.1|67.9|
|  12|  15| 0.0|67.9|
|  12|  16| 0.0|67.9|
|  12|  17| 0.0|67.9|
|  12|  18| 0.1|68.0|
|  12|  19|-0.1|67.9|
|  12|  20| 6.3|74.2|
|  12|  21|10.5|84.7|
|  12|  22|13.8|98.5|
|  12|  23| 0.4|98.9|
|  12|  24|-0.1|98.8|
|  12|  25| 0.0|98.8|
|  12|  26| 0.0|98.8|
|  12|  27| 0.0|98.8|
|  12|  28|-0.1|98.7|
|  12|  29| 0.0|98.7|
|  12|  30| 0.0|98.7|
|  12|  31| 0.0|98.7|
|   1|   1|-0.3|-0.3|
+----+----+----+----+
only showing top 32 rows



In [24]:
# Sliding frame of three rows (previous, current and next month), ordered by month.
jour = Window.orderBy('mois').rowsBetween(-1, 1)

# Monthly figures for 2019: mean temperature and total precipitation.
mensuel2019 = (
    meteoMM.where('annee = 2019')
    .groupBy('mois')
    .agg(round(avg('temperature'), 2).alias('temp'),
         round(sum('precipitations'), 2).alias('prec'))
)

# s1 / s2: mean temperature and summed precipitation over the sliding frame.
mensuel2019.select('mois', 'temp',
                   round(avg('temp').over(jour), 2).alias('s1'),
                   'prec',
                   round(sum('prec').over(jour), 2).alias('s2'))\
           .show(32)

+----+-----+-----+-----+-----+
|mois| temp|   s1| prec|   s2|
+----+-----+-----+-----+-----+
|   1| 4.93| 6.66| 98.0|128.9|
|   2| 8.39| 7.96| 30.9|167.0|
|   3|10.56|10.38| 38.1|224.4|
|   4|12.19|12.32|155.4|286.1|
|   5|14.21|15.23| 92.6|353.5|
|   6|19.28|18.73|105.5|259.4|
|   7|22.69|21.16| 61.3|223.7|
|   8|21.52| 21.1| 56.9|149.6|
|   9|19.09|18.72| 31.4|182.7|
|  10|15.56|14.67| 94.4|491.1|
|  11| 9.37|11.27|365.3|558.4|
|  12| 8.89| 9.13| 98.7|464.0|
+----+-----+-----+-----+-----+

