In [1]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pipelines.util.platform import start_spark
from pipelines.util.storage import read_csv,read_csvfolder,write_partby_parquet,write_parquet,read_parquet_filter
from pipelines.extraction.extract_wtdata import get_latestextract_wtdata
from pipelines.validation.validate_wtdata import get_lastvalidated_wtdata,validate_missing_days,validate_wind_speed,validate_wind_direction,validate_power_output
from pipelines.metadata import MetaData
import datetime

In [3]:
# Bootstrap the job through the project helper. The three returned handles are
# presumably the Spark session, a logger and the job configuration (the helper
# is defined in pipelines.util.platform -- confirm there). `sps` is passed to
# every pipeline call in the later cells.
sps, logger, conf = start_spark(
    app_name="process_wtdata",
)

local


In [3]:
# Pipeline metadata handle; it supplies the watermark read here and is also
# passed to the extract / missing-day helpers below.
md = MetaData()
watermark_date = md.get_watermark_date(sps)

# Explicit schema for the turbine readings, handed to the readers together with
# inferSchema=false so column types are not guessed from the data.
sch = T.StructType([
    T.StructField("timestamp", T.TimestampType(), True),
    T.StructField("turbine_id", T.IntegerType(), True),
    T.StructField("wind_speed", T.DoubleType(), True),
    T.StructField("wind_direction", T.IntegerType(), True),
    T.StructField("power_output", T.DoubleType(), True),
    T.StructField("outputdate", T.DateType(), True)
])
pqoptions = {
    "schema": sch,
    "inferSchema": "false"
}

wtdata_latest = get_latestextract_wtdata(sps, md, pqoptions)

# Rows coming straight from the extract are flagged is_missing=False in BOTH
# branches, so the result has the same columns whether or not a back-fill runs.
# Previously the flag was only added in the first-run branch, and the
# incremental branch unioned the un-flagged extract positionally.
wtdata_validated = wtdata_latest.withColumn("is_missing", F.lit(False))

# NOTE(review): "1900-01-01" looks like the "no previous run" sentinel, and the
# comparison assumes get_watermark_date returns a string -- confirm in MetaData.
if watermark_date != "1900-01-01":
    wtdata_missing = validate_missing_days(sps, md, wtdata_latest, pqoptions)
    # unionByName matches columns by name rather than by position, so a
    # different column order between the two frames cannot silently mix values.
    # Assumes validate_missing_days returns the extract's columns plus
    # is_missing -- TODO confirm; a mismatch now raises instead of misaligning.
    wtdata_validated = wtdata_missing.unionByName(wtdata_validated)

In [4]:
# Sanity check: print the column layout produced by the load / missing-day cell
# (the is_missing flag should be listed).
# Debug aid, left disabled: row count of the combined frame.
#wtdata_validated.count()
wtdata_validated.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- wind_speed: double (nullable = true)
 |-- wind_direction: integer (nullable = true)
 |-- power_output: double (nullable = true)
 |-- outputdate: date (nullable = true)
 |-- turbine_id: integer (nullable = true)
 |-- is_missing: boolean (nullable = false)



In [5]:
# Run the three validators in a fixed order: wind speed, then power output,
# then wind direction (innermost call runs first). Judging by the schema printed
# in the next cell, each one appends a matching `<column>_cleaned` column.
# NOTE: the cell rebinds wtdata_validated, so re-running it feeds already
# validated data back through the validators.
wtdata_validated = validate_wind_direction(
    validate_power_output(
        validate_wind_speed(wtdata_validated)
    )
)

In [6]:
# Sanity check: the schema should now list wind_speed_cleaned,
# power_output_cleaned and wind_direction_cleaned after the original columns.
wtdata_validated.printSchema()
# Debug aid, left disabled: dump the rows without truncating cell values.
#wtdata_validated.show(truncate=False)

root
 |-- timestamp: timestamp (nullable = true)
 |-- wind_speed: double (nullable = true)
 |-- wind_direction: integer (nullable = true)
 |-- power_output: double (nullable = true)
 |-- outputdate: date (nullable = true)
 |-- turbine_id: integer (nullable = true)
 |-- is_missing: boolean (nullable = false)
 |-- wind_speed_cleaned: double (nullable = true)
 |-- power_output_cleaned: double (nullable = true)
 |-- wind_direction_cleaned: integer (nullable = true)



In [7]:
# Persist the validated readings through the project storage helper. The target
# location is read from the Spark runtime config key 'storage.validated'; the
# value is already a string, so it is passed as-is (the former
# "{storage_path}".format(...) wrapper only re-rendered the same string).
write_parquet(sps, wtdata_validated, sps.conf.get("storage.validated"))

In [8]:
# Record the new watermark from the frame that was just written. This is the
# last cell, so under "Run All" it only executes when the write cell succeeded.
# NOTE(review): how the date is derived from wtdata_validated is defined in
# MetaData.set_watermark_date -- confirm there.
md.set_watermark_date(sps, wtdata_validated)