<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Jointures" data-toc-modified-id="Jointures-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Jointures</a></span><ul class="toc-item"><li><span><a href="#inner" data-toc-modified-id="inner-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>inner</a></span></li><li><span><a href="#outer" data-toc-modified-id="outer-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>outer</a></span></li></ul></li></ul></div>

In [23]:
import os
from pyspark.sql.functions import *
from pyspark.sql.types     import StructType, \
     StructField, FloatType, \
     IntegerType, StringType

In [24]:
# Display the active SparkSession.
# NOTE(review): `spark` is not created in any visible cell — presumably it is
# injected by the kernel / pyspark shell; confirm before running elsewhere.
spark

In [25]:
from pyspark.sql.functions import *
from pyspark.sql.types     import StructType, \
     StructField, FloatType, \
     IntegerType, StringType

# Raw weather observations: ';'-separated CSV with a header row, the token
# 'mq' read as NULL, column types inferred by Spark, result cached.
meteoDataFrame = (
    spark.read
         .format('csv')
         .options(sep=';',
                  header='true',
                  nullValue='mq',
                  inferSchema='true')
         .load('../donnees/meteo30')
         .cache()
)

# Explicit schema for the station file: every field is nullable.
schema = StructType([
    StructField(nom, type_, True)
    for nom, type_ in (
        ('Id',        StringType()),
        ('ville',     StringType()),
        ('latitude',  FloatType()),
        ('longitude', FloatType()),
        ('altitude',  IntegerType()),
    )
])

# Station reference table, read with the explicit `schema` defined above
# (';'-separated CSV with a header row), then cached.
# The former `.option('mergeSchema', 'true')` was dropped: `mergeSchema` is a
# Parquet/ORC reader option and is not read by the CSV source, so it only
# suggested a behaviour that never happened.
villes = (
    spark.read
         .format('csv')
         .option('sep', ';')
         .option('header', 'true')
         .schema(schema)
         .load('../donnees/meteo/postesSynop.csv')
         .cache()
)

@udf("string")
def formatVille(ville):
    """Turn a raw station name into a title-cased display name.

    Names listed in the exception set are title-cased in full. For every
    other name, everything from the first '-' onward is dropped before
    title-casing (e.g. 'LILLE-LESQUIN' -> 'Lille').

    Args:
        ville: raw station name, or None for a NULL cell.

    Returns:
        The formatted name, or None when the input is None.
    """
    # The column is nullable in the schema; a NULL used to reach
    # `ville.find` and fail the whole job with AttributeError.
    if ville is None:
        return None

    # Hyphenated names that must be kept whole.
    if ville in {'CLERMONT-FD', 'MONT-DE-MARSAN',
                 'ST-PIERRE', 'ST-BARTHELEMY METEO'}:
        return ville.title()

    # split() yields the text before the first '-' or, when there is no '-',
    # the whole string — the same result as the previous find()/slice branches.
    return ville.split('-', 1)[0].title()

# Station table with a lower-case `id` key and the display name produced by
# formatVille; coordinates and altitude are passed through unchanged.
villesT = villes.select(
    col('Id').alias('id'),
    formatVille(col('ville')).alias('ville'),
    col('latitude'),
    col('longitude'),
    col('altitude'),
)


# Cleaned observation table, cached.
# `.substr(pos, length)` is what the slice notation `col[a:b]` expands to in
# PySpark: the second number is a LENGTH, not an end index.
# NOTE(review): unit conversions below are inferred from the constants and the
# target column names (presumably K -> °C, % -> fraction, m -> km) — confirm
# against the data dictionary.
meteo = (
    meteoDataFrame
    .select(
        col('numer_sta').alias('id'),
        to_timestamp(col('date').cast('string'), 'yyyyMMddHHmmss').alias('date'),
        col('date').substr(0, 4).cast('int').alias('annee'),
        col('date').substr(5, 2).cast('int').alias('mois'),
        col('date').substr(7, 2).cast('int').alias('jour'),
        col('date').substr(5, 4).alias('mois_jour'),
        round(col('t') - 273.15, 2).alias('temperature'),
        (col('u') / 100).alias('humidite'),
        (col('vv') / 1000).alias('visibilite'),
        (col('pres') / 1000).alias('pression'),
        # First non-null candidate wins; presumably rrN is the rainfall over
        # the last N hours, rescaled here to a 3-hour equivalent.
        coalesce(col('rr3'),
                 col('rr24') / 8,
                 col('rr12') / 4,
                 col('rr6') / 2,
                 col('rr1') * 3).alias('precipitations'),
    )
    .cache()
)

# Quick look at the derived columns (first 3 rows).
meteo.select(['annee', 'mois', 'jour',
              'temperature', 'humidite', 'visibilite', 'pression']).show(n=3)

+-----+----+----+-----------+--------+----------+--------+
|annee|mois|jour|temperature|humidite|visibilite|pression|
+-----+----+----+-----------+--------+----------+--------+
| 2019|  12|   1|        3.7|    0.79|      20.0|  100.86|
| 2019|  12|   1|        2.8|    0.87|     12.23|  101.38|
| 2019|  12|   1|        8.7|    0.75|      10.0|  101.39|
+-----+----+----+-----------+--------+----------+--------+
only showing top 3 rows



In [26]:
# First 3 formatted stations.
villesT.show(n=3)

+-----+---------------+---------+---------+--------+
|   id|          ville| latitude|longitude|altitude|
+-----+---------------+---------+---------+--------+
|07005|      Abbeville|   50.136|    1.834|      69|
|07015|          Lille|    50.57|   3.0975|      47|
|07020|Pte De La Hague|49.725166|-1.939833|       6|
+-----+---------------+---------+---------+--------+
only showing top 3 rows



In [27]:
# Rows of the left-hand demo table: (city, 'dfa').
data = [(ville, 'dfa') for ville in (
    'Ajaccio', 'Angers', 'Angoulème', 'Besançon', 'Biarritz', 'Bordeaux',
    'Brest', 'Caen', 'Clermont-Fd', 'Dijon', 'Embrun', 'Grenoble',
    'Lille', 'Limoges', 'Lyon', 'Marseille', 'Montpellier', 'Nancy',
    'Nantes', 'Nice', 'Nîmes', 'Orléans', 'Paris',
)]

# Left-hand demo DataFrame built from the current `data` rows.
dfa = spark.sparkContext.parallelize(data).toDF(schema=['ville', 'valeur'])

# Rows of the right-hand demo table: (city, 'dfb').
data = [(ville, 'dfb') for ville in (
    'Nancy', 'Nantes', 'Nice', 'Nîmes', 'Orléans', 'Paris',
    'Perpignan', 'Poitiers', 'Reims', 'Rennes', 'Rouen', 'St-Quentin',
    'Strasbourg', 'Toulon', 'Toulouse', 'Tours', 'Vichy',
)]

# Right-hand demo DataFrame built from the current `data` rows.
dfb = spark.sparkContext.parallelize(data).toDF(schema=['ville', 'valeur'])

# Jointures

<img src="https://raw.githubusercontent.com/rbizoi/AnalyserLesDonneesAvecSpark/main/DataFrameSpark/images/M06-07.png" width="400">   

## inner

In [28]:
# Inner join (the default join type) of observations with formatted stations
# on an explicit equality between the two `id` columns.
(
    meteo
    .join(villesT, on=meteo['id'] == villesT['id'])
    .select('ville', 'annee', 'mois_jour', 'temperature', 'precipitations')
    .show(n=10)
)

+---------------+-----+---------+-----------+--------------+
|          ville|annee|mois_jour|temperature|precipitations|
+---------------+-----+---------+-----------+--------------+
|      Abbeville| 2019|     1201|        3.7|           0.0|
|          Lille| 2019|     1201|        2.8|           0.0|
|Pte De La Hague| 2019|     1201|        8.7|           0.0|
|           Caen| 2019|     1201|        4.9|           0.0|
|          Rouen| 2019|     1201|        3.5|           0.0|
|          Reims| 2019|     1201|        1.7|           0.0|
|          Brest| 2019|     1201|        7.1|           0.0|
|    Ploumanac'H| 2019|     1201|        8.0|           2.0|
|         Rennes| 2019|     1201|        6.2|           2.0|
|        Alencon| 2019|     1201|        4.3|          -0.1|
+---------------+-----+---------+-----------+--------------+
only showing top 10 rows



In [29]:
# Same join against the raw station table, using null-safe equality
# (`eqNullSafe`) between meteo.id and villes.Id.
(
    meteo
    .join(villes, on=meteo['id'].eqNullSafe(villes['Id']))
    .select('ville', 'annee', 'mois_jour', 'temperature', 'precipitations')
    .show(n=10)
)

+-----------------+-----+---------+-----------+--------------+
|            ville|annee|mois_jour|temperature|precipitations|
+-----------------+-----+---------+-----------+--------------+
|        ABBEVILLE| 2019|     1201|        3.7|           0.0|
|    LILLE-LESQUIN| 2019|     1201|        2.8|           0.0|
|  PTE DE LA HAGUE| 2019|     1201|        8.7|           0.0|
|   CAEN-CARPIQUET| 2019|     1201|        4.9|           0.0|
|       ROUEN-BOOS| 2019|     1201|        3.5|           0.0|
|     REIMS-PRUNAY| 2019|     1201|        1.7|           0.0|
|   BREST-GUIPAVAS| 2019|     1201|        7.1|           0.0|
|      PLOUMANAC'H| 2019|     1201|        8.0|           2.0|
|RENNES-ST JACQUES| 2019|     1201|        6.2|           2.0|
|          ALENCON| 2019|     1201|        4.3|          -0.1|
+-----------------+-----+---------+-----------+--------------+
only showing top 10 rows



In [30]:
# Join on a shared column NAME: the station key is first renamed to `id` so
# both sides expose the same column.
(
    meteo
    .join(villes.withColumnRenamed('Id', 'id'), on='id')
    .select('ville', 'annee', 'mois_jour', 'temperature', 'precipitations')
    .show(n=10)
)

+-----------------+-----+---------+-----------+--------------+
|            ville|annee|mois_jour|temperature|precipitations|
+-----------------+-----+---------+-----------+--------------+
|        ABBEVILLE| 2019|     1201|        3.7|           0.0|
|    LILLE-LESQUIN| 2019|     1201|        2.8|           0.0|
|  PTE DE LA HAGUE| 2019|     1201|        8.7|           0.0|
|   CAEN-CARPIQUET| 2019|     1201|        4.9|           0.0|
|       ROUEN-BOOS| 2019|     1201|        3.5|           0.0|
|     REIMS-PRUNAY| 2019|     1201|        1.7|           0.0|
|   BREST-GUIPAVAS| 2019|     1201|        7.1|           0.0|
|      PLOUMANAC'H| 2019|     1201|        8.0|           2.0|
|RENNES-ST JACQUES| 2019|     1201|        6.2|           2.0|
|          ALENCON| 2019|     1201|        4.3|          -0.1|
+-----------------+-----+---------+-----------+--------------+
only showing top 10 rows



In [31]:
# Joining on the column name yields a single `ville` column; both `valeur`
# columns are kept under the same name.
dfa.join(dfb, on='ville').show()

+-------+------+------+
|  ville|valeur|valeur|
+-------+------+------+
|  Nancy|   dfa|   dfb|
| Nantes|   dfa|   dfb|
|   Nice|   dfa|   dfb|
|  Nîmes|   dfa|   dfb|
|Orléans|   dfa|   dfb|
|  Paris|   dfa|   dfb|
+-------+------+------+



In [32]:
# Joining on an expression keeps the `ville` column of each side.
dfa.join(dfb, on=dfa['ville'] == dfb['ville'], how='inner').show()

+-------+------+-------+------+
|  ville|valeur|  ville|valeur|
+-------+------+-------+------+
|  Nancy|   dfa|  Nancy|   dfb|
| Nantes|   dfa| Nantes|   dfb|
|   Nice|   dfa|   Nice|   dfb|
|  Nîmes|   dfa|  Nîmes|   dfb|
|Orléans|   dfa|Orléans|   dfb|
|  Paris|   dfa|  Paris|   dfb|
+-------+------+-------+------+



## outer 

In [33]:
# 'outer' requests a full outer join: unmatched rows from either side are
# kept, padded with nulls.
dfa.join(dfb, on=dfa['ville'] == dfb['ville'], how='outer').show(n=50)

+-----------+------+----------+------+
|      ville|valeur|     ville|valeur|
+-----------+------+----------+------+
|    Ajaccio|   dfa|      null|  null|
|     Angers|   dfa|      null|  null|
|  Angoulème|   dfa|      null|  null|
|   Besançon|   dfa|      null|  null|
|   Biarritz|   dfa|      null|  null|
|   Bordeaux|   dfa|      null|  null|
|      Brest|   dfa|      null|  null|
|       Caen|   dfa|      null|  null|
|Clermont-Fd|   dfa|      null|  null|
|      Dijon|   dfa|      null|  null|
|     Embrun|   dfa|      null|  null|
|   Grenoble|   dfa|      null|  null|
|      Lille|   dfa|      null|  null|
|    Limoges|   dfa|      null|  null|
|       Lyon|   dfa|      null|  null|
|  Marseille|   dfa|      null|  null|
|Montpellier|   dfa|      null|  null|
|      Nancy|   dfa|     Nancy|   dfb|
|     Nantes|   dfa|    Nantes|   dfb|
|       Nice|   dfa|      Nice|   dfb|
|      Nîmes|   dfa|     Nîmes|   dfb|
|    Orléans|   dfa|   Orléans|   dfb|
|      Paris|   dfa|     

In [34]:
# 'full' is another accepted spelling of the full outer join.
dfa.join(dfb, on=dfa['ville'] == dfb['ville'], how='full').show(n=50)

+-----------+------+----------+------+
|      ville|valeur|     ville|valeur|
+-----------+------+----------+------+
|    Ajaccio|   dfa|      null|  null|
|     Angers|   dfa|      null|  null|
|  Angoulème|   dfa|      null|  null|
|   Besançon|   dfa|      null|  null|
|   Biarritz|   dfa|      null|  null|
|   Bordeaux|   dfa|      null|  null|
|      Brest|   dfa|      null|  null|
|       Caen|   dfa|      null|  null|
|Clermont-Fd|   dfa|      null|  null|
|      Dijon|   dfa|      null|  null|
|     Embrun|   dfa|      null|  null|
|   Grenoble|   dfa|      null|  null|
|      Lille|   dfa|      null|  null|
|    Limoges|   dfa|      null|  null|
|       Lyon|   dfa|      null|  null|
|  Marseille|   dfa|      null|  null|
|Montpellier|   dfa|      null|  null|
|      Nancy|   dfa|     Nancy|   dfb|
|     Nantes|   dfa|    Nantes|   dfb|
|       Nice|   dfa|      Nice|   dfb|
|      Nîmes|   dfa|     Nîmes|   dfb|
|    Orléans|   dfa|   Orléans|   dfb|
|      Paris|   dfa|     

In [35]:
# Left outer join: every dfa row is kept; dfb columns are null when unmatched.
dfa.join(dfb, on=dfa['ville'] == dfb['ville'], how='left').show(n=50)

+-----------+------+-------+------+
|      ville|valeur|  ville|valeur|
+-----------+------+-------+------+
|    Ajaccio|   dfa|   null|  null|
|     Angers|   dfa|   null|  null|
|  Angoulème|   dfa|   null|  null|
|   Besançon|   dfa|   null|  null|
|   Biarritz|   dfa|   null|  null|
|   Bordeaux|   dfa|   null|  null|
|      Brest|   dfa|   null|  null|
|       Caen|   dfa|   null|  null|
|Clermont-Fd|   dfa|   null|  null|
|      Dijon|   dfa|   null|  null|
|     Embrun|   dfa|   null|  null|
|   Grenoble|   dfa|   null|  null|
|      Lille|   dfa|   null|  null|
|    Limoges|   dfa|   null|  null|
|       Lyon|   dfa|   null|  null|
|  Marseille|   dfa|   null|  null|
|Montpellier|   dfa|   null|  null|
|      Nancy|   dfa|  Nancy|   dfb|
|     Nantes|   dfa| Nantes|   dfb|
|       Nice|   dfa|   Nice|   dfb|
|      Nîmes|   dfa|  Nîmes|   dfb|
|    Orléans|   dfa|Orléans|   dfb|
|      Paris|   dfa|  Paris|   dfb|
+-----------+------+-------+------+



In [36]:
# Right outer join: every dfb row is kept; dfa columns are null when unmatched.
dfa.join(dfb, on=dfa['ville'] == dfb['ville'], how='right').show(n=50)

+-------+------+----------+------+
|  ville|valeur|     ville|valeur|
+-------+------+----------+------+
|  Nancy|   dfa|     Nancy|   dfb|
| Nantes|   dfa|    Nantes|   dfb|
|   Nice|   dfa|      Nice|   dfb|
|  Nîmes|   dfa|     Nîmes|   dfb|
|Orléans|   dfa|   Orléans|   dfb|
|  Paris|   dfa|     Paris|   dfb|
|   null|  null| Perpignan|   dfb|
|   null|  null|  Poitiers|   dfb|
|   null|  null|     Reims|   dfb|
|   null|  null|    Rennes|   dfb|
|   null|  null|     Rouen|   dfb|
|   null|  null|St-Quentin|   dfb|
|   null|  null|Strasbourg|   dfb|
|   null|  null|    Toulon|   dfb|
|   null|  null|  Toulouse|   dfb|
|   null|  null|     Tours|   dfb|
|   null|  null|     Vichy|   dfb|
+-------+------+----------+------+

