<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#groupBy" data-toc-modified-id="groupBy-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>groupBy</a></span><ul class="toc-item"><li><span><a href="#rollup" data-toc-modified-id="rollup-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>rollup</a></span></li><li><span><a href="#cube" data-toc-modified-id="cube-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>cube</a></span></li></ul></li><li><span><a href="#pivot" data-toc-modified-id="pivot-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>pivot</a></span></li></ul></div>

In [1]:
import os
from distutils.sysconfig import get_python_lib,get_python_inc,get_python_version

In [2]:
os.environ['SPARK_HOME']=f'{get_python_lib()}{os.sep}pyspark'
os.environ['PYSPARK_PYTHON']=f'{get_python_lib()}{os.sep}pyspark'.split('Lib')[0]+'python.exe'

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types     import StructType, \
     StructField, FloatType, \
     IntegerType, StringType

In [4]:
referentiel = os.path.join(os.getcwd(),'warehouse')
os.makedirs(referentiel, exist_ok=True)

In [5]:
referentiel

'F:\\PythonFormation\\Spark-DataFrames\\warehouse'

In [6]:
spark = SparkSession \
    .builder \
    .appName("PresentationSpark") \
    .config("spark.sql.warehouse.dir", referentiel) \
    .config("spark.executor.cores", 8) \
    .config("spark.executor.memory", '24g') \
    .enableHiveSupport() \
    .getOrCreate()

In [7]:
spark

In [8]:
print(f"spark.executor.cores = {spark.conf.get('spark.executor.cores')}\nspark.executor.memory = {spark.conf.get('spark.executor.memory')}")

spark.executor.cores = 8
spark.executor.memory = 24g


In [9]:
from pyspark.sql.functions import *

meteo = spark.read.format('csv')\
    .option('sep',';')\
    .option('header','true')\
    .option('nullValue','mq')\
    .option('inferSchema', 'true')\
    .load('../donnees/meteo/*/') #    .cache()

meteo = meteo.select(
                 col('numer_sta'),
                 col('date')[0:4].cast('int') ,
                 col('date')[5:2].cast('int'),
                 col('date')[7:2].cast('int'),
                 col('date')[5:4],
                 round(col('t') - 273.15,2),
                 col('u') / 100 ,
                 col('vv') / 1000 ,
                 col('pres') / 1000,
                 coalesce( col('rr3'),
                           col('rr24')/8,
                           col('rr12')/4,
                           col('rr6')/2,
                           col('rr1')*3  ) )\
             .toDF('id','annee','mois','jour','mois_jour','temperature',
                   'humidite','visibilite','pression','precipitations') #             .cache()


from pyspark.sql.types     import StructType, \
     StructField, FloatType, \
     IntegerType, StringType

schema = StructType([
        StructField('Id'           , StringType() , True),
        StructField('ville'        , StringType() , True),
        StructField('latitude'     , FloatType() , True),
        StructField('longitude'    , FloatType() , True),
        StructField('altitude'     , IntegerType() , True)])

villes  = spark.read.format('csv')   \
      .option('sep',';')                \
      .option('mergeSchema', 'true')    \
      .option('header','true')          \
      .schema(schema)                   \
      .load('../donnees/postesSynop.csv')  \
      .cache()

meteo.count(), villes.count()

(4703265, 62)

# groupBy

<img src="https://raw.githubusercontent.com/rbizoi/AnalyserLesDonneesAvecSpark/main/DataFrameSpark/images/M06-03.png" width="400">    

In [10]:
meteo.where('id < 8000')\
     .select('annee','mois_jour','temperature','precipitations')\
     .describe().show()

+-------+------------------+-----------------+------------------+--------------------+
|summary|             annee|        mois_jour|       temperature|      precipitations|
+-------+------------------+-----------------+------------------+--------------------+
|  count|           3345920|          3345920|           3331578|             3316612|
|   mean|2009.5241057765877|666.2460330790933|12.730304780497685|  0.2587658467436155|
| stddev| 8.006457123929179|344.4202142570863| 7.336417149748348|  1.3217897238227296|
|    min|              1996|             0101|             -21.3|-0.30000000000000004|
|    max|              2023|             1231|              96.0|               443.0|
+-------+------------------+-----------------+------------------+--------------------+



In [11]:
meteo.where('id < 8000')\
     .select('annee','mois_jour','temperature')\
     .describe().show()

+-------+------------------+-----------------+------------------+
|summary|             annee|        mois_jour|       temperature|
+-------+------------------+-----------------+------------------+
|  count|           3345920|          3345920|           3331578|
|   mean|2009.5241057765877|666.2460330790933|12.730304780497685|
| stddev| 8.006457123929179|344.4202142570863| 7.336417149748348|
|    min|              1996|             0101|             -21.3|
|    max|              2023|             1231|              96.0|
+-------+------------------+-----------------+------------------+



In [12]:
meteo.where('id < 8000').count()

3345920

In [13]:
meteo.where('id < 8000')\
     .select('humidite','visibilite','pression')\
     .describe().show()

+-------+-------------------+------------------+------------------+
|summary|           humidite|        visibilite|          pression|
+-------+-------------------+------------------+------------------+
|  count|            3327879|           2903875|           3325109|
|   mean| 0.7544562076926593|22.576944668761783| 99.56771475762103|
| stddev|0.17774905844413927|15.256162059897415|2.5315149532100767|
|    min|               0.01|               0.0|              43.4|
|    max|                1.0|             80.01|            105.09|
+-------+-------------------+------------------+------------------+



In [14]:
meteo.where('id < 8000')\
     .groupBy('annee')\
     .avg('temperature','visibilite','pression').show(5)

+-----+------------------+------------------+-----------------+
|annee|  avg(temperature)|   avg(visibilite)|    avg(pression)|
+-----+------------------+------------------+-----------------+
| 2023|14.335714549927022| 25.97829452920015|99.53528064548905|
| 2022|13.956919192345685|25.500372519056686|99.68878205182264|
| 2019|13.185722746378657|25.786389970746523|99.53540068658667|
| 2020|13.526039191405296| 26.92226900837133| 99.6296426331237|
| 2018| 13.37747716669952|24.960805049257885|99.49398817480282|
+-----+------------------+------------------+-----------------+
only showing top 5 rows



In [15]:
meteo.where('id < 8000')\
     .groupBy('id','annee')\
     .max('temperature','visibilite','pression').show(5)

+----+-----+----------------+---------------+-------------+
|  id|annee|max(temperature)|max(visibilite)|max(pression)|
+----+-----+----------------+---------------+-------------+
|7627| 2022|            37.7|           20.0|        98.74|
|7747| 2020|            36.2|           60.0|       103.02|
|7481| 2022|            38.5|           80.0|       100.93|
|7627| 2023|            38.4|           20.0|        98.94|
|7558| 2019|            36.3|           20.0|        94.96|
+----+-----+----------------+---------------+-------------+
only showing top 5 rows



In [16]:
meteo.where('id < 8000')\
     .groupBy('id','annee')\
     .agg(
            count('id').alias('nb_villes'),
            round(avg('temperature'),2).alias('temperature'),
            round(avg('humidite'),2).alias('humidite'),
            round(avg('visibilite'),2).alias('visibilite'),
            round(avg('pression'),2).alias('pression'),
            round(sum('pression')).alias('precipitations'))\
     .orderBy("id","annee")\
     .show(28)

+----+-----+---------+-----------+--------+----------+--------+--------------+
|  id|annee|nb_villes|temperature|humidite|visibilite|pression|precipitations|
+----+-----+---------+-----------+--------+----------+--------+--------------+
|7005| 1996|     2913|       9.06|    0.81|     12.77|  100.67|      293245.0|
|7005| 1997|     2904|      10.54|    0.83|     12.91|   100.8|      292736.0|
|7005| 1998|     2912|      10.55|    0.83|     14.52|   100.7|      293250.0|
|7005| 1999|     2902|      11.14|    0.82|     13.85|   100.6|      291927.0|
|7005| 2000|     2903|      10.94|    0.84|     14.01|  100.57|      291960.0|
|7005| 2001|     2882|       10.6|    0.84|     14.87|  100.66|      290096.0|
|7005| 2002|     2894|      10.95|    0.83|     16.48|  100.61|      290059.0|
|7005| 2003|     2905|      10.93|    0.79|     16.98|  100.83|      292904.0|
|7005| 2004|     2911|      10.71|    0.83|     17.87|  100.75|      293285.0|
|7005| 2005|     2879|      10.79|    0.84|     17.3

In [17]:
meteo.where('id < 8000')\
     .groupBy('annee','mois')\
     .agg(round(sum('pression') / 1000).alias('precipitations'))\
     .orderBy('annee','mois')\
     .show(5)

+-----+----+--------------+
|annee|mois|precipitations|
+-----+----+--------------+
| 1996|   1|         963.0|
| 1996|   2|         908.0|
| 1996|   3|         966.0|
| 1996|   4|         941.0|
| 1996|   5|         974.0|
+-----+----+--------------+
only showing top 5 rows



## rollup
<img src="https://raw.githubusercontent.com/rbizoi/AnalyserLesDonneesAvecSpark/main/DataFrameSpark/images/M06-05.png" width="400">

In [18]:
meteo.where('id < 8000')\
     .groupBy('annee','mois')\
     .agg(round(sum('pression') / 1000).alias('precipitations'))\
     .rollup('annee','mois')\
     .agg( round(sum('precipitations')).alias('precipitations'))\
     .orderBy(col('annee').asc_nulls_last(),
              col('mois').asc_nulls_last())\
     .show(14)

+-----+----+--------------+
|annee|mois|precipitations|
+-----+----+--------------+
| 1996|   1|         963.0|
| 1996|   2|         908.0|
| 1996|   3|         966.0|
| 1996|   4|         941.0|
| 1996|   5|         974.0|
| 1996|   6|         952.0|
| 1996|   7|         978.0|
| 1996|   8|         974.0|
| 1996|   9|         943.0|
| 1996|  10|         959.0|
| 1996|  11|         943.0|
| 1996|  12|         976.0|
| 1996|null|       11477.0|
| 1997|   1|         982.0|
+-----+----+--------------+
only showing top 14 rows



In [19]:
meteo.where('id < 8000')\
     .groupBy('annee','mois')\
     .agg(round(sum('pression') / 1000).alias('precipitations'))\
     .orderBy('annee','mois')\
     .show()

+-----+----+--------------+
|annee|mois|precipitations|
+-----+----+--------------+
| 1996|   1|         963.0|
| 1996|   2|         908.0|
| 1996|   3|         966.0|
| 1996|   4|         941.0|
| 1996|   5|         974.0|
| 1996|   6|         952.0|
| 1996|   7|         978.0|
| 1996|   8|         974.0|
| 1996|   9|         943.0|
| 1996|  10|         959.0|
| 1996|  11|         943.0|
| 1996|  12|         976.0|
| 1997|   1|         982.0|
| 1997|   2|         890.0|
| 1997|   3|         985.0|
| 1997|   4|         951.0|
| 1997|   5|        1003.0|
| 1997|   6|         963.0|
| 1997|   7|         993.0|
| 1997|   8|         989.0|
+-----+----+--------------+
only showing top 20 rows



In [20]:
meteo.where('id < 8000')\
     .groupBy('annee','mois')\
     .agg(round(sum('pression') / 1000).alias('precipitations'))\
     .rollup('annee','mois')\
     .agg( round(sum('precipitations')).alias('precipitations'))\
     .orderBy('annee','mois')\
     .toPandas().head(10) #.show()

+-----+----+--------------+
|annee|mois|precipitations|
+-----+----+--------------+
| null|null|      331077.0|
| 1996|null|       11477.0|
| 1996|   1|         963.0|
| 1996|   2|         908.0|
| 1996|   3|         966.0|
| 1996|   4|         941.0|
| 1996|   5|         974.0|
| 1996|   6|         952.0|
| 1996|   7|         978.0|
| 1996|   8|         974.0|
| 1996|   9|         943.0|
| 1996|  10|         959.0|
| 1996|  11|         943.0|
| 1996|  12|         976.0|
| 1997|null|       11668.0|
| 1997|   1|         982.0|
| 1997|   2|         890.0|
| 1997|   3|         985.0|
| 1997|   4|         951.0|
| 1997|   5|        1003.0|
+-----+----+--------------+
only showing top 20 rows



In [21]:
meteo.where('id < 8000')\
     .groupBy('annee','mois')\
     .agg(round(sum('pression') / 1000).alias('precipitations'))\
     .rollup('annee','mois')\
     .agg(round(sum('precipitations')).alias('precipitations'))\
     .orderBy('annee','mois')\
     .toPandas().head(20) #.show(20)

+-----+----+--------------+
|annee|mois|precipitations|
+-----+----+--------------+
| null|null|      331077.0|
| 1996|null|       11477.0|
| 1996|   1|         963.0|
| 1996|   2|         908.0|
| 1996|   3|         966.0|
| 1996|   4|         941.0|
| 1996|   5|         974.0|
| 1996|   6|         952.0|
| 1996|   7|         978.0|
| 1996|   8|         974.0|
| 1996|   9|         943.0|
| 1996|  10|         959.0|
| 1996|  11|         943.0|
| 1996|  12|         976.0|
| 1997|null|       11668.0|
| 1997|   1|         982.0|
| 1997|   2|         890.0|
| 1997|   3|         985.0|
| 1997|   4|         951.0|
| 1997|   5|        1003.0|
+-----+----+--------------+
only showing top 20 rows



In [22]:
meteo.where('id < 8000')\
     .rollup('annee','mois')\
     .agg(
            round(sum('precipitations')).alias('precipitations'))\
     .orderBy(col('annee').asc_nulls_last(),
              col('mois').asc_nulls_last())\
     .toPandas().head(16) # .show(16)

+-----+----+--------------+
|annee|mois|precipitations|
+-----+----+--------------+
| 1996|   1|        3260.0|
| 1996|   2|        3056.0|
| 1996|   3|        1482.0|
| 1996|   4|        1682.0|
| 1996|   5|        3025.0|
| 1996|   6|        1444.0|
| 1996|   7|        2184.0|
| 1996|   8|        2608.0|
| 1996|   9|        2454.0|
| 1996|  10|        2580.0|
| 1996|  11|        6165.0|
| 1996|  12|        3121.0|
| 1996|null|       33061.0|
| 1997|   1|        1988.0|
| 1997|   2|        2242.0|
| 1997|   3|         421.0|
+-----+----+--------------+
only showing top 16 rows



In [23]:
meteo.where('id < 8000')\
     .rollup('annee','mois')\
     .agg(
            round(avg('temperature'),2).alias('temperature'),
            round(avg('humidite'),2).alias('humidite'),
            round(avg('visibilite'),2).alias('visibilite'),
            round(avg('pression'),2).alias('pression'),
            round(avg('precipitations'),2).alias('precipitations'))\
     .orderBy(col('annee').asc_nulls_last(),
              col('mois').asc_nulls_last())\
     .toPandas().head(16) #.show(16)

+-----+----+-----------+--------+----------+--------+--------------+
|annee|mois|temperature|humidite|visibilite|pression|precipitations|
+-----+----+-----------+--------+----------+--------+--------------+
| 1996|   1|       6.34|    0.86|     12.82|    98.8|          0.33|
| 1996|   2|       3.97|    0.79|     14.79|    99.2|          0.33|
| 1996|   3|       7.03|    0.73|      13.5|   99.31|          0.15|
| 1996|   4|       10.9|     0.7|     15.82|   99.41|          0.18|
| 1996|   5|      13.16|    0.76|      18.9|    99.2|          0.31|
| 1996|   6|      18.38|    0.69|     19.03|   99.87|          0.16|
| 1996|   7|      19.48|    0.69|     20.93|   99.68|          0.25|
| 1996|   8|      18.89|    0.73|      20.1|   99.47|           0.3|
| 1996|   9|      14.74|    0.73|     19.17|   99.37|           0.3|
| 1996|  10|      12.56|    0.81|     17.03|   99.67|           0.3|
| 1996|  11|       7.91|    0.84|     15.66|   99.24|           0.7|
| 1996|  12|       4.57|    0.85| 

## cube
<img src="https://raw.githubusercontent.com/rbizoi/AnalyserLesDonneesAvecSpark/main/DataFrameSpark/images/M06-06.png" width="400">

In [24]:
meteo.where('id < 8000')\
     .groupBy('annee','mois')\
     .agg(round(sum('pression') / 1000).alias('precipitations'))\
     .cube('annee','mois')\
     .agg( round(sum('precipitations')).alias('precipitations'))\
     .orderBy(col('annee'),col('mois'))\
     .toPandas().head(24) #.show(24)

+-----+----+--------------+
|annee|mois|precipitations|
+-----+----+--------------+
| null|null|      331077.0|
| null|   1|       28327.0|
| null|   2|       25789.0|
| null|   3|       28192.0|
| null|   4|       27229.0|
| null|   5|       28115.0|
| null|   6|       27264.0|
| null|   7|       28130.0|
| null|   8|       28156.0|
| null|   9|       27226.0|
| null|  10|       28123.0|
| null|  11|       27273.0|
| null|  12|       27253.0|
| 1996|null|       11477.0|
| 1996|   1|         963.0|
| 1996|   2|         908.0|
| 1996|   3|         966.0|
| 1996|   4|         941.0|
| 1996|   5|         974.0|
| 1996|   6|         952.0|
| 1996|   7|         978.0|
| 1996|   8|         974.0|
| 1996|   9|         943.0|
| 1996|  10|         959.0|
+-----+----+--------------+
only showing top 24 rows



In [25]:
meteo.where('id < 8000')\
     .groupBy('id','annee')\
     .agg(
            {'id':'count',
            'temperature':'avg',
            'humidite':'avg'}
     ).toPandas().head(10) #.show(10)

+----+-----+------------------+------------------+---------+
|  id|annee|     avg(humidite)|  avg(temperature)|count(id)|
+----+-----+------------------+------------------+---------+
|7627| 2022|0.7473913043478269|13.957147826086953|     2875|
|7747| 2020| 0.671087105624142|16.450120564932835|     5850|
|7481| 2022| 0.684143494679025|14.446206659800893|     2913|
|7627| 2023|0.7715090771558247|13.975416036308623|     2644|
|7558| 2019|0.7018670886075954|11.308966244725742|     2844|
|7481| 2019|0.6953703703703706| 13.50380658436214|     2916|
|7607| 2020|0.7780644056183653|14.408667351832822|     5850|
|7255| 2022|0.7192455418381346|13.646433470507539|     2916|
|7139| 2020|0.7633850931677043|12.455797101449276|     5798|
|7335| 2022|0.7429304407713506|13.738326446280992|     2904|
+----+-----+------------------+------------------+---------+
only showing top 10 rows



In [26]:
meteo.where('id < 8000')\
     .groupBy('id','annee')\
     .agg(
            {'id':'count',
            'temperature':'avg',
            'humidite':'avg'}
     ).toDF('id','annee','humidite','temperature','nb_villes').toPandas().head(10) #.show(10)

+----+-----+------------------+------------------+---------+
|  id|annee|          humidite|       temperature|nb_villes|
+----+-----+------------------+------------------+---------+
|7627| 2022|0.7473913043478269|13.957147826086953|     2875|
|7747| 2020| 0.671087105624142|16.450120564932835|     5850|
|7481| 2022| 0.684143494679025|14.446206659800893|     2913|
|7627| 2023|0.7715090771558247|13.975416036308623|     2644|
|7558| 2019|0.7018670886075954|11.308966244725742|     2844|
|7481| 2019|0.6953703703703706| 13.50380658436214|     2916|
|7607| 2020|0.7780644056183653|14.408667351832822|     5850|
|7255| 2022|0.7192455418381346|13.646433470507539|     2916|
|7139| 2020|0.7633850931677043|12.455797101449276|     5798|
|7335| 2022|0.7429304407713506|13.738326446280992|     2904|
+----+-----+------------------+------------------+---------+
only showing top 10 rows



In [27]:
meteo.where('id < 8000')\
     .groupBy('id')\
     .agg(
        round(skewness  ('temperature'),3).alias('skewness'  ),
        round(kurtosis  ('temperature'),3).alias('kurtosis'  ),
        round(variance  ('temperature'),3).alias('variance'  ),
        round(var_pop   ('temperature'),3).alias('var_pop'   ),
        round(stddev    ('temperature'),3).alias('stddev'    ),
        round(stddev_pop('temperature'),3).alias('stddev_pop'))\
     .orderBy('id')\
     .toPandas().head(15) #.show(15)

+----+--------+--------+--------+-------+------+----------+
|  id|skewness|kurtosis|variance|var_pop|stddev|stddev_pop|
+----+--------+--------+--------+-------+------+----------+
|7005|   0.101|   0.004|  40.411|  40.41| 6.357|     6.357|
|7015|   0.149|  -0.155|  47.047| 47.047| 6.859|     6.859|
|7020|  -0.124|  -0.399|   17.14| 17.139|  4.14|      4.14|
|7027|   0.125|  -0.008|  36.452| 36.452| 6.038|     6.038|
|7037|   0.187|  -0.065|  44.414| 44.414| 6.664|     6.664|
|7072|   0.265|   -0.22|  56.351| 56.349| 7.507|     7.507|
|7110|   0.068|   0.183|  24.959| 24.959| 4.996|     4.996|
|7117|   0.073|     1.6|  18.734| 18.734| 4.328|     4.328|
|7130|   0.168|   0.009|  42.106| 42.105| 6.489|     6.489|
|7139|   0.207|   -0.08|   47.15|  47.15| 6.867|     6.867|
|7149|    0.19|  -0.248|  53.089| 53.089| 7.286|     7.286|
|7168|   0.207|  -0.185|  59.848| 59.847| 7.736|     7.736|
|7181|   0.164|  -0.305|  61.405| 61.404| 7.836|     7.836|
|7190|   0.128|  -0.451|  67.874| 67.873

In [28]:
meteo.where('id < 8000 and annee > 2014')\
     .groupBy('id','annee')\
     .agg( round(avg('temperature'),2).alias('temperature'))\
     .orderBy("id","annee")\
     .toPandas().head(10) #.show(10)

+----+-----+-----------+
|  id|annee|temperature|
+----+-----+-----------+
|7005| 2015|      11.27|
|7005| 2016|      10.92|
|7005| 2017|      11.42|
|7005| 2018|      11.56|
|7005| 2019|      11.51|
|7005| 2020|      12.13|
|7005| 2022|      12.04|
|7005| 2023|      12.49|
|7015| 2015|      11.38|
|7015| 2016|      11.16|
+----+-----+-----------+
only showing top 10 rows



# pivot
<img src="https://raw.githubusercontent.com/rbizoi/AnalyserLesDonneesAvecSpark/main/DataFrameSpark/images/M06-04.png" width="400"> 

In [35]:
meteo.where('id < 8000 and annee > 2014')\
      .groupBy('id')\
      .pivot('annee')\
      .agg( round(avg('temperature'),2))\
      .sort('id')\
      .toPandas().head(10) #.show(10)

Unnamed: 0,id,2015,2016,2017,2018,2019,2020,2022,2023
0,7005,11.27,10.92,11.42,11.56,11.51,12.13,12.04,12.49
1,7015,11.38,11.16,11.61,11.95,11.79,12.45,12.45,12.91
2,7020,12.43,12.22,12.74,12.32,12.98,12.75,13.2,13.27
3,7027,11.57,11.15,11.68,11.64,11.58,12.19,12.35,12.82
4,7037,11.2,10.7,11.18,11.57,11.37,11.98,12.0,12.39
5,7072,11.36,10.67,11.06,11.87,11.42,12.16,12.17,12.72
6,7110,11.96,11.64,11.69,11.81,11.79,12.25,12.55,12.82
7,7117,12.44,12.14,12.41,12.25,12.34,12.72,12.76,13.28
8,7130,12.36,11.82,12.15,12.65,12.43,13.09,13.41,13.79
9,7139,11.49,10.94,11.47,11.91,11.71,12.46,12.5,12.84


In [30]:
villes.select('ville',
               round('altitude',-2).alias('altitude'))\
      .groupBy('altitude')\
      .agg(collect_list('ville').alias('ville par altitude')).toPandas().head()
# .show(truncate=False)

Unnamed: 0,altitude,ville par altitude
0,300,"[NANCY-OCHEY, BALE-MULHOUSE, CLERMONT-FD, GOUR..."
1,900,[EMBRUN]
2,800,[LE PUY-LOUDES]
3,100,"[ABBEVILLE, CAEN-CARPIQUET, REIMS-PRUNAY, BRES..."
4,400,"[LIMOGES-BELLEGARDE, TARBES-OSSUN, ST GIRONS]"


In [34]:
meteo.where('id < 8000')\
     .groupBy('annee','mois')\
     .agg(round(sum('pression') / 1000).alias('pression'))\
     .orderBy('annee','mois')\
     .toPandas().head()

Unnamed: 0,annee,mois,pression
0,1996,1,963.0
1,1996,2,908.0
2,1996,3,966.0
3,1996,4,941.0
4,1996,5,974.0
