In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import udf
import schemas
import pandas as pd

In [2]:
# initialise sparkContext
spark = SparkSession.builder \
    .master('local') \
    .appName('WindTurbine') \
    .config('spark.executor.memory', '8gb') \
    .config("spark.cores.max", "4") \
    .getOrCreate()

sc = spark.sparkContext

# using SQLContext to read parquet file
sqlContext = SQLContext(sc)


In [79]:
# to read parquet file
settlement_schema = StructType([
    StructField("GSRN", StringType(), False),
    # StructField("TS_ID", StringType(), True),
    StructField("VAERDI", StringType(), False),
    StructField("TIME_CET", StringType(), False),
])

settlement = sqlContext.read.schema(settlement_schema) \
            .parquet('data/ITU_DATA/settlement/2018.parquet')

settlement.fillna({'VAERDI':0})
# windmills = sqlContext.read.csv('data/windmill_cleaned.csv', sep=';', header=True)
# wind_speed_10m = sqlContext.read.parquet('data/ITU_DATA/prognosis/ENetNEA/wind_speed_10m.parquet')

DataFrame[GSRN: string, VAERDI: string, TIME_CET: string]

In [3]:
windmills = sqlContext.read.csv('data/windmill_cleaned.csv', sep=';', header=True)

In [6]:
windmills.select("*").where("grid == 61.2").show(1)

+----+------------+-----------+-------------+---------+-----+-----+-----------+--------------+-------------+----+-------------+
|GSRN|Turbine_type|Parent_GSRN|BBR_municipal|Placement|UTM_x|UTM_y|Capacity_kw|Rotor_diameter|Navhub_height|grid|grid_in_range|
+----+------------+-----------+-------------+---------+-----+-----+-----------+--------------+-------------+----+-------------+
+----+------------+-----------+-------------+---------+-----+-----+-----------+--------------+-------------+----+-------------+



In [80]:
settlement = settlement.withColumn('VAERDI', settlement["VAERDI"].cast(DoubleType())).where("TIME_CET like '%:00:%'").select("*")
settlement.createOrReplaceTempView("settlement")

In [82]:
spark.sql("select * from settlement s").show()

+------------------+------+-------------------+
|              GSRN|VAERDI|           TIME_CET|
+------------------+------+-------------------+
|570715000000062988|   0.0|2018-07-02 00:00:00|
|570715000000062988|   0.0|2018-07-02 01:00:00|
|570715000000062988|   0.0|2018-07-02 02:00:00|
|570715000000062988|   0.0|2018-07-02 03:00:00|
|570715000000062988|   0.0|2018-07-02 04:00:00|
|570715000000062988|   0.0|2018-07-02 05:00:00|
|570715000000062988|   0.0|2018-07-02 06:00:00|
|570715000000062988|   0.0|2018-07-02 07:00:00|
|570715000000062988|   0.0|2018-07-02 08:00:00|
|570715000000062988|   0.0|2018-07-02 09:00:00|
|570715000000062988|   0.0|2018-07-02 10:00:00|
|570715000000062988|   0.0|2018-07-02 11:00:00|
|570715000000062988|   0.0|2018-07-02 12:00:00|
|570715000000062988|   0.0|2018-07-02 13:00:00|
|570715000000062988|   0.0|2018-07-02 14:00:00|
|570715000000062988|   0.0|2018-07-02 15:00:00|
|570715000000062988|   0.0|2018-07-02 16:00:00|
|570715000000062988|   0.0|2018-07-02 17

In [4]:
wind_speed_10m = wind_speed_10m.withColumnRenamed('__index_level_0__', 'TIME')
wind_speed_10m = wind_speed_10m.withColumn("id", monotonically_increasing_id())

In [5]:
settlement.createOrReplaceTempView("settlement")
windmills.createOrReplaceTempView("windmills")
wind_speed_10m.createOrReplaceTempView("wind_speed_10m")

In [6]:
settlement = spark.sql("select s.GSRN, s.VAERDI, s.TIME_CET from settlement s where s.TIME_CET like '%:00:%'")
settlement.createOrReplaceTempView("settlement")
settlement.show(1)

+------------------+------+-------------------+
|              GSRN|VAERDI|           TIME_CET|
+------------------+------+-------------------+
|570715000000062988|     0|2018-07-02 00:00:00|
+------------------+------+-------------------+
only showing top 1 row



In [59]:
windmills = spark.sql("select * from windmills where grid != 0")
windmills.createOrReplaceTempView("windmills")

In [64]:
wind_speed_10m = spark.sql("select * from wind_speed_10m where id in (select max(id) as id from wind_speed_10m group by TIME)")
wind_speed_10m.createOrReplaceTempView("wind_speed_10m")

+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+

In [9]:
test = wind_speed_10m.toPandas()

In [10]:
test.head(2)

Unnamed: 0,190,191,231,232,233,273,274,275,314,315,...,1445,1446,1447,1448,1449,1450,1451,predicted_ahead,TIME,id
0,5.3,4.5,5.9,2.7,4.3,5.5,2.3,2.8,3.5,4.9,...,6.0,5.9,6.8,1.1,2.9,3.8,3.9,1,2018-02-22 01:00:00,0
1,5.3,3.3,6.0,2.4,4.2,5.6,2.3,2.5,4.5,6.2,...,5.8,6.3,4.5,4.2,3.7,3.4,2.1,2,2018-02-22 02:00:00,1


In [92]:
a = 190
"`" + str(list(test[test['TIME'] == '2018-07-02 00:00:00']['650'])[0]) +  "`"

'`2.5`'

In [90]:
list(test[test['TIME'] == '2018-07-02 00:00:00']['650'])[0]

2.5

In [69]:
basicDF = spark.sql("select * from settlement s join windmills w on s.GSRN == w.GSRN")
basicDF.createOrReplaceTempView("basicDF")
basicDF.show(1)

+------------------+------+-------------------+------------------+------------+-----------+-------------+---------+--------+---------+-----------+--------------+-------------+----+--------------------+
|              GSRN|VAERDI|           TIME_CET|              GSRN|Turbine_type|Parent_GSRN|BBR_municipal|Placement|   UTM_x|    UTM_y|Capacity_kw|Rotor_diameter|Navhub_height|grid|       grid_in_range|
+------------------+------+-------------------+------------------+------------+-----------+-------------+---------+--------+---------+-----------+--------------+-------------+----+--------------------+
|570715000000062988|     0|2018-07-02 00:00:00|570715000000062988|           W|       null|          741|      HAV|599547.0|6181039.0|     2300.0|          84.6|         61.2| 945|985,1028,986,1030...|
+------------------+------+-------------------+------------------+------------+-----------+-------------+---------+--------+---------+-----------+--------------+-------------+----+------------

In [93]:
grid = udf(lambda g, t: "`" + str(list(test[test['TIME'] == t][g])[0]) + "`", StringType())


In [94]:
a = spark.udf.register("extractInfo", grid)

In [95]:
spark.sql("select extractInfo(b.`grid`, w.`TIME`) as aa from basicDF b join wind_speed_10m w on b.`TIME_CET` == w.`TIME`").show(1)

Py4JJavaError: An error occurred while calling o699.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 72.0 failed 1 times, most recent failure: Lost task 0.0 in stage 72.0 (TID 2865, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 267, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage10.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor88.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:497)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 267, in main
    ("%d.%d" % sys.version_info[:2], version))
Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:81)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:64)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage10.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
