<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import os
from distutils.sysconfig import get_python_lib,get_python_inc,get_python_version

In [2]:
os.environ['SPARK_HOME']=f'{get_python_lib()}{os.sep}pyspark'
os.environ['PYSPARK_PYTHON']=f'{get_python_lib()}{os.sep}pyspark'.split('Lib')[0]+'python.exe'

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types     import StructType, \
     StructField, FloatType, \
     IntegerType, StringType

In [4]:
spark = SparkSession \
    .builder \
    .appName("demo") \
    .config("spark.sql.warehouse.dir", 'F:\\FormationPython\\spark\\warehouse') \
    .enableHiveSupport() \
    .getOrCreate()

In [5]:
spark

In [6]:
from pyspark.sql.functions import *
from pyspark.sql.types     import StructType, \
     StructField, FloatType, \
     IntegerType, StringType

meteoDataFrame  = spark.read.format('csv')\
    .option('sep',';')\
    .option('header','true')\
    .option('nullValue','mq')\
    .option('inferSchema', 'true')\
    .load('./donnees/meteo')\
    .cache()

schema = StructType([
        StructField('Id'           , StringType() , True),
        StructField('ville'        , StringType() , True),
        StructField('latitude'     , FloatType() , True),
        StructField('longitude'    , FloatType() , True),
        StructField('altitude'     , IntegerType() , True)])

villes  = spark.read.format('csv')   \
      .option('sep',';')                \
      .option('mergeSchema', 'true')    \
      .option('header','true')          \
      .schema(schema)                   \
      .load('./donnees/postesSynop.csv')  \
      .cache()

@udf("string")
def formatVille(ville):
    if ville in ['CLERMONT-FD','MONT-DE-MARSAN',
                                   'ST-PIERRE','ST-BARTHELEMY METEO'] :
        return ville.title()
    else :
        if ville.find('-') != -1 :
            return ville[0:ville.find('-')].title()
        else:
            return ville.title()

villesT  = villes.select(
                col('Id').alias('id'),
                formatVille('ville').alias('ville'),
               'latitude',
               'longitude',
               'altitude')


meteo = meteoDataFrame.select(
                 col('numer_sta'),
                 col('date')[0:4].cast('int') ,
                 col('date')[5:2].cast('int'),
                 col('date')[7:2].cast('int'),
                 col('date')[5:4],
                 round(col('t') - 273.15,2),
                 col('u') / 100 ,
                 col('vv') / 1000 ,
                 col('pres') / 1000,
                 coalesce( col('rr3'),
                           col('rr24')/8,
                           col('rr12')/4,
                           col('rr6')/2,
                           col('rr1')*3  ) )\
             .toDF('id','annee','mois','jour','mois_jour','temperature',
                   'humidite','visibilite','pression','precipitations')\
             .cache()

meteo.select('annee','mois','jour','temperature','humidite',
             'visibilite','pression').toPandas().head(5)

Unnamed: 0,annee,mois,jour,temperature,humidite,visibilite,pression
0,2023,8,1,15.4,0.97,20.0,99.46
1,2023,8,1,15.6,0.95,51.23,99.63
2,2023,8,1,16.3,0.92,12.0,100.43
3,2023,8,1,15.9,0.93,49.78,99.81
4,2023,8,1,15.6,0.98,3.18,98.61


In [7]:
meteo.write\
       .mode('overwrite')\
       .format('parquet')\
       .partitionBy('annee')\
       .option('path', './donnees/meteoFrance')\
       .save()

In [8]:
spark.sql("select * from parquet."+
          "`./donnees/meteoFrance` "+
          "where annee = 2020").toPandas().head(5)

Unnamed: 0,id,mois,jour,mois_jour,temperature,humidite,visibilite,pression,precipitations,annee
0,7005,10,1,1001,14.5,0.9,17.33,99.52,1.6,2020
1,7015,10,1,1001,15.4,0.85,46.41,99.9,0.0,2020
2,7020,10,1,1001,15.1,0.75,14.0,100.05,0.0,2020
3,7027,10,1,1001,14.4,0.92,3.83,99.44,1.0,2020
4,7037,10,1,1001,13.8,0.95,41.72,98.61,1.0,2020


In [9]:
meteoFance = spark.read.format('parquet').load('./donnees/meteoFrance')

In [16]:
spark.sql('CREATE DATABASE cours').toPandas().head()

In [17]:
spark.sql('show databases').toPandas()

Unnamed: 0,namespace
0,cours
1,default


In [22]:
meteo=spark.sql("""
                SELECT * 
                FROM parquet.`./donnees/meteoFrance`""") 

In [23]:
meteo.toPandas().head()

Unnamed: 0,id,mois,jour,mois_jour,temperature,humidite,visibilite,pression,precipitations,annee
0,7005,9,1,901,12.4,0.94,20.0,101.16,0.0,2017
1,7015,9,1,901,10.7,0.96,13.44,101.51,0.2,2017
2,7020,9,1,901,14.0,0.85,16.0,101.87,0.4,2017
3,7027,9,1,901,12.4,0.94,24.19,101.2,0.0,2017
4,7037,9,1,901,11.8,0.93,42.51,100.23,0.0,2017


In [24]:
meteo.write.saveAsTable(name="meteo", mode="overwrite")

In [26]:
spark.sql('show tables').toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,default,meteo,False


In [28]:
spark.sql("""SELECT *
            FROM meteo
            WHERE ANNEE = 2020""").toPandas().head(10)

Unnamed: 0,id,mois,jour,mois_jour,temperature,humidite,visibilite,pression,precipitations,annee
0,7005,10,1,1001,14.5,0.9,17.33,99.52,1.6,2020
1,7015,10,1,1001,15.4,0.85,46.41,99.9,0.0,2020
2,7020,10,1,1001,15.1,0.75,14.0,100.05,0.0,2020
3,7027,10,1,1001,14.4,0.92,3.83,99.44,1.0,2020
4,7037,10,1,1001,13.8,0.95,41.72,98.61,1.0,2020
5,7072,10,1,1001,13.9,0.9,20.0,99.6,0.0,2020
6,7110,10,1,1001,10.8,0.95,17.63,99.34,1.0,2020
7,7117,10,1,1001,13.1,0.81,,99.7,0.0,2020
8,7130,10,1,1001,13.4,0.96,59.81,99.95,6.7,2020
9,7139,10,1,1001,14.3,0.92,20.0,98.7,1.2,2020


In [29]:
spark.sql("""SELECT annee, 
                    avg(temperature) temperature, 
                    avg(humidite) humidite, 
                    avg(visibilite) visibilite, 
                    avg(pression) pression
            FROM meteo
            GROUP BY ANNEE""").toPandas().head(20)

Unnamed: 0,annee,temperature,humidite,visibilite,pression
0,2018,15.173137,0.755002,25.491561,99.831094
1,2023,16.813339,0.746461,26.553942,100.009556
2,2020,15.652178,0.750186,27.456131,99.983949
3,2017,14.882964,0.751057,26.415576,100.065136
4,2022,15.933596,0.746009,26.35093,100.055303
5,2019,15.479841,0.745867,26.528293,99.921343


In [30]:
spark.sql('USE cours').toPandas()

In [31]:
spark.sql('show tables').toPandas()

Unnamed: 0,namespace,tableName,isTemporary
