In [1]:
sc

# Loading (remote) E-OBS observations and summer days (SU) calculation

Git repo: https://github.com/rmsryu/spark-climate-data.git
Data set: https://www.ecad.eu/download/ensembles/download.php

E-OBS data is loaded from the remote OpenDAP repository at KNMI. Please, note that when new versions become available the link is no longer valid since the URL is updated with the version number. Therefore, if the code below doesn't work please check the current E-OBS version and update the URL (https://www.ecad.eu/download/ensembles/download.php). 

# Study of daily precipitation from Station Aalsmeer Netherlands
Data source: https://climexp.knmi.nl/data/rrrr458.dat
Data provide by: Royal Netherlands Meteorological Institute (KNMI)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from pyspark import SparkFiles
from pyspark.sql.types import *
from pyspark.sql.functions import *



In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
data_files = 'hdfs://hadoop-vm.internal.cloudapp.net:9000/precipitation/data/*/*.parquet'
# Obtain dataset
data = spark.read.parquet(data_files)
data = data.withColumn("precipitation", col("precipitation").cast("float"))
data = data.withColumn("date", col("date").cast("date"))
data.createOrReplaceTempView("climate")
data.cache()

                                                                                

DataFrame[station: string, date: date, precipitation: float]

In [5]:
data.summary().show()

[Stage 4:>                                                          (0 + 1) / 1]

+-------+------------------+-----------------+
|summary|           station|    precipitation|
+-------+------------------+-----------------+
|  count|          13120148|         13120148|
|   mean|483.02698064076714| 2.13989743152896|
| stddev|287.72368836944077|4.339730993138887|
|    min|               001|              0.0|
|    25%|             228.0|              0.0|
|    50%|             466.0|              0.1|
|    75%|             737.0|              2.4|
|    max|               983|            208.0|
+-------+------------------+-----------------+



                                                                                

In [122]:
spark.sql("SELECT station,\
              year(min(date)) min_year, \
              year(max(date)) - year(min(date)) num_years, \
              count(precipitation) count,\
              mean(precipitation) mean, \
              stddev(precipitation) stddev, \
              min(precipitation) min, \
              percentile(precipitation,0.25) 25pct, \
              percentile(precipitation,0.50) 50pct, \
              percentile(precipitation,0.75) 75pct, \
              max(precipitation) max \
            FROM climate GROUP BY station ORDER BY station").show()



+-------+--------+---------+-----+------------------+------------------+---+-----+-------------------+------------------+-----+
|station|min_year|num_years|count|              mean|            stddev|min|25pct|              50pct|             75pct|  max|
+-------+--------+---------+-----+------------------+------------------+---+-----+-------------------+------------------+-----+
|    001|    1940|       83|29556| 2.174519556030149| 4.370148656161713|0.0|  0.0|0.10000000149011612|               2.5| 76.6|
|    003|    1902|       51|18482|2.0838329184571815| 4.277456332945257|0.0|  0.0|0.10000000149011612| 2.299999952316284| 78.4|
|    004|    1904|        1|  419| 1.617899758558996|3.4800441747162214|0.0|  0.0|0.10000000149011612| 1.399999976158142| 26.9|
|    006|    1905|       38|13970|1.7968360776213679| 3.740703703355201|0.0|  0.0|0.10000000149011612| 1.899999976158142| 82.1|
|    007|    1906|       44|15101|1.9131845566100272|3.8756137261021313|0.0|  0.0|                0.0| 2

                                                                                

Finding stations with no missing values


In [153]:
stations_years_complete  = spark.sql("SELECT station, year(date) year FROM climate GROUP BY station, year(date) having count(precipitation) = 365 ")

In [154]:
stations_years_complete = stations_years_complete.filter(col('year') > 1900)

In [155]:
stations_years_complete_g = stations_years_complete.groupby("station").count()

In [162]:
stations_years_complete_f = stations_years_complete_g.filter(col("count") > 90)
stations_years_complete_f.summary().show()



+-------+------------------+------------------+
|summary|           station|             count|
+-------+------------------+------------------+
|  count|                26|                26|
|   mean|432.38461538461536|  91.6923076923077|
| stddev| 276.8216865670863|0.5491251783869141|
|    min|               011|                91|
|    25%|             145.0|                91|
|    50%|             438.0|                92|
|    75%|             663.0|                92|
|    max|               961|                93|
+-------+------------------+------------------+



                                                                                

In [172]:
# Stations with more than 90 years of data
stations_for_analysis = stations_years_complete_f.select("station")

In [173]:
 stations_filter = [item.station for item in stations_for_analysis.collect()]

                                                                                

In [174]:
data.filter(data.station.isin(stations_filter)).summary().show()



+-------+-----------------+------------------+
|summary|          station|     precipitation|
+-------+-----------------+------------------+
|  count|          1345761|           1345761|
|   mean|434.9810397239926|2.1177081224171546|
| stddev|271.9042656388644| 4.274620138428894|
|    min|              011|               0.0|
|    25%|            145.0|               0.0|
|    50%|            438.0|               0.1|
|    75%|            663.0|               2.4|
|    max|              961|             118.5|
+-------+-----------------+------------------+



                                                                                

In [175]:
len(stations_filter)

26