In [1]:
import pandas as pd
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import monotonically_increasing_id, udf
from pyspark.sql.types import *
import warnings
warnings.filterwarnings("ignore")

import schemas
import tools

In [2]:
def load_parquet(path, schema:StructType = None):
    """
    load apache parquet file
    """
    return sqlContext.read.schema(schema).parquet(path) if schema is not None else sqlContext.read.parquet(path)

def load_parquet_from_weather_toPandas(path, schema:StructType = None):
    """
    load apache parquet file
    """
    weather = load_parquet(path) \
                    .withColumnRenamed("__index_level_0__", "TIME") \
                    .withColumn("id", monotonically_increasing_id())
    weather.createOrReplaceTempView("weather_temp")
    weather_dic = spark.sql("select * from weather_temp where id in (select max(id) as id from weather_temp group by TIME)").toPandas()
    return weather_dic

def load_csv(path, schema:StructType = None):
    """
    load csv file
    """
    # return sqlContext.read.schema(schema).csv(path, sep=";", header=True, schema=schema) if schema is not None else sqlContext.read.schema(schema).csv(path, sep=";", header=True)
    return sqlContext.read.csv(path, sep=";", header=True, schema=schema)


In [3]:
def regist_udf_grid(name:str, df:pd.DataFrame, type = DoubleType()):
    _ = spark.udf.register(name, lambda g, t: list(df[df['TIME'] == t][g])[0], type)


In [4]:
def udf_by_grid(df:pd.DataFrame, type = DoubleType()):
    return udf(lambda g, t: list(df[df['TIME'] == t][g])[0], type)
def udf_by_ws():
    schema = StructType([
        StructField("u_interp", DoubleType(), True),
        StructField("v_interp", DoubleType(), True)
    ])
    return udf(lambda s1, d1, s2, d2, z: tools.wind_interp(s1, d1, s2, d2, z), schema)

In [5]:
def udf_regist():
    udf_ws10  = udf_by_grid(ws10_dic, DoubleType())
    udf_ws100  = udf_by_grid(ws100_dic, DoubleType())
    udf_wd10  = udf_by_grid(wd10_dic, IntegerType())
    udf_wd100  = udf_by_grid(wd100_dic, IntegerType())
    udf_ws_interp  = udf_by_ws()
    return udf_ws10, udf_ws100, udf_wd10, udf_wd100, udf_ws_interp

In [6]:
# initialise sparkContext\
spark = SparkSession.builder \
    .master("local") \
    .appName("WindTurbine") \
    .config("spark.executor.memory", "8gb") \
    .config("spark.cores.max", "4") \
    .getOrCreate()

sc = spark.sparkContext

# using SQLContext to read parquet file
sqlContext = SQLContext(sc)

In [7]:
 # to read parquet file
settlement = load_parquet("data/ITU_DATA/settlement/2018.parquet" ,schemas.settlement_schema) \
                .fillna({"VAERDI":0})
settlement = settlement.withColumn("VAERDI", settlement["VAERDI"].cast(DoubleType())).where("TIME_CET like '%:00:%'").select("*")
# settlement.createOrReplaceTempView("settlement")

In [8]:
windmills = load_csv("data/windmill_cleaned.csv", schemas.windmills_schema)
# windmills.createOrReplaceTempView("windmills")

In [9]:
ws10_dic = load_parquet_from_weather_toPandas("data/ITU_DATA/prognosis/ENetNEA/wind_speed_10m.parquet")
ws100_dic = load_parquet_from_weather_toPandas("data/ITU_DATA/prognosis/ENetNEA/wind_speed_100m.parquet")
wd10_dic = load_parquet_from_weather_toPandas("data/ITU_DATA/prognosis/ENetNEA/wind_direction_10m.parquet")
wd100_dic = load_parquet_from_weather_toPandas("data/ITU_DATA/prognosis/ENetNEA/wind_direction_100m.parquet")


In [10]:
udf_ws10, udf_ws100, udf_wd10, udf_wd100, udf_ws_interp = udf_regist()

In [11]:
basicDF = settlement.join(windmills, on="GSRN") \
                    .where(windmills.grid != 0) \
                    .withColumn("ws10", udf_ws10("grid", "TIME_CET")) \
                    .withColumn("ws100", udf_ws100("grid", "TIME_CET")) \
                    .withColumn("wd10", udf_wd10("grid", "TIME_CET")) \
                    .withColumn("wd100", udf_wd100("grid", "TIME_CET")) \
                    .withColumn("wsCol", udf_ws_interp("ws10","wd10","ws100","wd10","Navhub_height")) \
                    .select("GSRN", "TIME_CET","Navhub_height", "VAERDI", "wsCol.u_interp", "wsCol.v_interp")
# basicDF.show(1)

In [40]:
basicDF.select("GSRN", "TIME_CET","Navhub_height").where("Navhub_height == 0").show(1)

+----+--------+-------------+
|GSRN|TIME_CET|Navhub_height|
+----+--------+-------------+
+----+--------+-------------+



In [12]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans

In [13]:
feat_cols = ["VAERDI", "u_interp", "v_interp"]
vec_assembler = VectorAssembler(inputCols=feat_cols, outputCol='features')
final_data = vec_assembler.transform(basicDF)

In [34]:
final_data.show(1)

+------------------+-------------------+------+--------------------+------------------+--------------------+
|              GSRN|           TIME_CET|VAERDI|            u_interp|          v_interp|            features|
+------------------+-------------------+------+--------------------+------------------+--------------------+
|570715000000062988|2018-07-02 00:00:00|   0.0|-0.21473597519158166|-4.097413372855558|[0.0,-0.214735975...|
+------------------+-------------------+------+--------------------+------------------+--------------------+
only showing top 1 row



In [44]:
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=False)
scalerModel = scaler.fit(final_data)
cluster_final_data = scalerModel.transform(final_data)

KeyboardInterrupt: 

In [14]:
kmean3 = KMeans(featuresCol='features', k=3)

In [16]:
model_k3 = kmean3.fit(final_data)

KeyboardInterrupt: 