In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType, StructType, StructField

In [2]:
# Reuse the already-running Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Explicit schema for the daily weather CSV files, so Spark does not have to infer types.
# YEARMODA is kept as a string here and parsed into a date by the cleaning query.
# MAX, MIN and PRCP are read as strings because the raw values can carry a trailing
# flag character (e.g. '96.1*', '0.00G'); FRSHTT is a string of six indicator digits.
schema = (
    StructType()
    .add('STN---', IntegerType(), False)
    .add('WBAN', IntegerType(), True)
    .add('YEARMODA', StringType(), True)
    .add('TEMP', FloatType(), True)
    .add('DEWP', FloatType(), True)
    .add('SLP', FloatType(), True)
    .add('STP', FloatType(), True)
    .add('VISIB', FloatType(), True)
    .add('WDSP', FloatType(), True)
    .add('MXSPD', FloatType(), True)
    .add('GUST', FloatType(), True)
    .add('MAX', StringType(), True)
    .add('MIN', StringType(), True)
    .add('PRCP', StringType(), True)
    .add('SNDP', FloatType(), True)
    .add('FRSHTT', StringType(), True)
)

In [4]:
# Read every CSV file under data/2019/ with the explicit schema; the first line of
# each file is treated as a header row.
weather = (
    spark.read
    .format("csv")
    .option("header", "true")
    .schema(schema)
    .load("data/2019/*.csv")
)

# Expose the raw rows to Spark SQL under the name 'weather'.
weather.createOrReplaceTempView('weather')

In [5]:
# Preview the first 20 raw rows (long cell values are truncated for display).
weather.show(n=20, truncate=True)

+------+-----+--------+----+----+------+------+-----+----+-----+-----+------+-----+-----+-----+------+
|STN---| WBAN|YEARMODA|TEMP|DEWP|   SLP|   STP|VISIB|WDSP|MXSPD| GUST|   MAX|  MIN| PRCP| SNDP|FRSHTT|
+------+-----+--------+----+----+------+------+-----+----+-----+-----+------+-----+-----+-----+------+
|958360|99999|20190101|78.8|54.9|9999.9|9999.9|999.9| 8.8| 13.0|999.9| 96.1*| 61.9|0.00G|999.9|000000|
|958360|99999|20190102|73.1|53.7|9999.9|9999.9|999.9| 9.5| 14.0|999.9| 89.2*|57.4*|0.00G|999.9|000000|
|958360|99999|20190103|79.5|47.4|9999.9|9999.9|999.9| 3.2|  8.0|999.9| 96.6*| 57.2|0.00G|999.9|000000|
|958360|99999|20190104|82.7|52.0|9999.9|9999.9|999.9|13.0| 19.0|999.9|109.8*| 60.6|0.02G|999.9|000000|
|958360|99999|20190105|61.9|47.7|9999.9|9999.9|999.9| 8.5| 15.9|999.9| 70.5*|52.3*|0.02G|999.9|010000|
|958360|99999|20190106|68.6|48.1|9999.9|9999.9|999.9| 9.2| 13.0|999.9| 79.9*| 52.0|0.00G|999.9|000000|
|958360|99999|20190107|75.3|53.3|9999.9|9999.9|999.9| 5.9|  9.9|999.9| 87

In [6]:
# Summary statistics (count, mean, stddev, min, max) for the WBAN column only.
weather.describe('WBAN').show()

+-------+------------------+
|summary|              WBAN|
+-------+------------------+
|  count|           4158416|
|   mean|  86601.7904581937|
| stddev|30631.749499926314|
|    min|               102|
|    max|             99999|
+-------+------------------+



In [8]:
# Clean the raw weather rows:
#   * rename `STN---` to STN_NO and parse YEARMODA (yyyyMMdd) into the date DT_REF;
#   * turn the "all nines" placeholder values into NULL;
#   * strip the trailing flag character from MAX / MIN ('*') and PRCP ('G') and cast
#     the remainder to REAL;
#   * split the six-character FRSHTT string into one integer indicator column per
#     position (FOG, RAIN, SNOW, HAIL, THUNDER, TORNADO).
#
# The float columns are compared as `col * 10 = <integer>` rather than against a
# decimal literal such as 9999.9, so the placeholder test is done on a whole number.
#
# NOTE(review): PRCP = '0.00G' is mapped to NULL together with the '99.99'
# placeholder - confirm that treating it as missing (rather than zero) is intended.
# NOTE(review): only the 'G' flag is stripped from PRCP; a value carrying any other
# trailing letter would not parse as a number - confirm no other flags occur.

# Register the raw frame under its own view name and query that. The cleaned result
# replaces the 'weather' view at the end of this cell, so reading FROM weather here
# would make the cell fail on a second run (the raw columns would no longer exist).
weather.createOrReplaceTempView('weather_raw')

df = spark.sql("""

SELECT `STN---` AS STN_NO
     , CASE WHEN WBAN = 99999 THEN NULL ELSE WBAN END AS WBAN
     , TO_DATE(YEARMODA, 'yyyyMMdd') as DT_REF
     , CASE WHEN TEMP * 10 = 99999 THEN NULL ELSE TEMP END AS TEMP
     , CASE WHEN DEWP * 10 = 99999 THEN NULL ELSE DEWP END AS DEWP
     , CASE WHEN SLP * 10 = 99999 THEN NULL ELSE SLP END AS SLP
     , CASE WHEN STP * 10 = 99999 THEN NULL ELSE STP END AS STP
     , CASE WHEN VISIB * 10 = 9999 THEN NULL ELSE VISIB END AS VISIB
     , CASE WHEN WDSP * 10 = 9999 THEN NULL ELSE WDSP END AS WDSP
     , CASE WHEN MXSPD * 10 = 9999 THEN NULL ELSE MXSPD END AS MXSPD
     , CASE WHEN GUST * 10 = 9999 THEN NULL ELSE GUST END AS GUST
     , CASE WHEN MAX = '9999.9' THEN NULL ELSE CAST( REPLACE(MAX, '*', '') AS REAL ) END AS MAX
     , CASE WHEN MIN = '9999.9' THEN NULL ELSE CAST( REPLACE(MIN, '*', '') AS REAL ) END AS MIN
     , CASE WHEN PRCP = '99.99' OR PRCP = '0.00G' THEN NULL ELSE CAST( REPLACE(PRCP, 'G', '') AS REAL ) END AS PRCP
     , CASE WHEN SNDP * 10 = 9999 THEN NULL ELSE SNDP END AS SNDP
     , CAST(SUBSTR(FRSHTT, 1, 1) AS INTEGER) AS FOG
     , CAST(SUBSTR(FRSHTT, 2, 1) AS INTEGER) AS RAIN
     , CAST(SUBSTR(FRSHTT, 3, 1) AS INTEGER) AS SNOW
     , CAST(SUBSTR(FRSHTT, 4, 1) AS INTEGER) AS HAIL
     , CAST(SUBSTR(FRSHTT, 5, 1) AS INTEGER) AS THUNDER
     , CAST(SUBSTR(FRSHTT, 6, 1) AS INTEGER) AS TORNADO
FROM weather_raw
""")
df.show()

# From here on the 'weather' view refers to the cleaned rows.
df.createOrReplaceTempView('weather')

+------+----+----------+----+----+----+----+-----+----+-----+----+-----+----+----+----+---+----+----+----+-------+-------+
|STN_NO|WBAN|    DT_REF|TEMP|DEWP| SLP| STP|VISIB|WDSP|MXSPD|GUST|  MAX| MIN|PRCP|SNDP|FOG|RAIN|SNOW|HAIL|THUNDER|TORNADO|
+------+----+----------+----+----+----+----+-----+----+-----+----+-----+----+----+----+---+----+----+----+-------+-------+
|958360|null|2019-01-01|78.8|54.9|null|null| null| 8.8| 13.0|null| 96.1|61.9|null|null|  0|   0|   0|   0|      0|      0|
|958360|null|2019-01-02|73.1|53.7|null|null| null| 9.5| 14.0|null| 89.2|57.4|null|null|  0|   0|   0|   0|      0|      0|
|958360|null|2019-01-03|79.5|47.4|null|null| null| 3.2|  8.0|null| 96.6|57.2|null|null|  0|   0|   0|   0|      0|      0|
|958360|null|2019-01-04|82.7|52.0|null|null| null|13.0| 19.0|null|109.8|60.6|0.02|null|  0|   0|   0|   0|      0|      0|
|958360|null|2019-01-05|61.9|47.7|null|null| null| 8.5| 15.9|null| 70.5|52.3|0.02|null|  0|   1|   0|   0|      0|      0|
|958360|null|201

In [10]:
# Row count of the cleaned frame; the query above only projects columns, so this
# should equal the number of raw rows.
df.count()

4158416