In [22]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Cluster connection and resource settings for this notebook's Spark application.
# Each SparkConf setter returns the conf itself, so the calls can be chained.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Preprocessor pipeline 2")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; getOrCreate()
# reuses a session that is already running in this kernel, if there is one.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Read the raw dataset. Only the header option is set (no schema is supplied),
# so the columns are typed in a later cell of the notebook.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("/home/jovyan/data/complete_dataset.csv")
)

In [23]:
# Peek at the first record of the raw frame; head(1) returns a list holding one Row.
df.head(1)

[Row(date='2015-01-01', demand='99635.03', RRP='25.63369643387471', demand_pos_RRP='97319.24000000002', RRP_positive='26.415952619440922', demand_neg_RRP='2315.79', RRP_negative='-7.239999999999997', frac_at_neg_RRP='0.020833334', min_temperature='13.3', max_temperature='26.9', solar_exposure='23.6', rainfall='0.0', school_day='N', holiday='Y')]

In [24]:
import pyspark.sql.functions as F

# Give the all-string CSV columns their proper types:
#   * date                      -> timestamp
#   * measurements / prices     -> float
#   * school_day, holiday       -> boolean (the raw values are 'Y' / 'N')
#
# RRP_negative and max_temperature are numeric in the data as well, but were
# previously left out of this chain and therefore stayed strings while their
# sibling columns (RRP_positive, min_temperature) became floats. They are now
# cast alongside the others so the schema is consistent.
#
# NOTE(review): under Spark's default (non-ANSI) cast semantics a value that
# cannot be parsed becomes null rather than raising - confirm this is desired.
df2 = (
    df.withColumn("date", df["date"].cast('timestamp'))
    .withColumn("demand", df["demand"].cast('float'))
    .withColumn("RRP", df["RRP"].cast('float'))
    .withColumn("demand_pos_RRP", df["demand_pos_RRP"].cast('float'))
    .withColumn("RRP_positive", df["RRP_positive"].cast('float'))
    .withColumn("demand_neg_RRP", df["demand_neg_RRP"].cast('float'))
    .withColumn("RRP_negative", df["RRP_negative"].cast('float'))
    .withColumn("frac_at_neg_RRP", df["frac_at_neg_RRP"].cast('float'))
    .withColumn("min_temperature", df["min_temperature"].cast('float'))
    .withColumn("max_temperature", df["max_temperature"].cast('float'))
    .withColumn("solar_exposure", df["solar_exposure"].cast('float'))
    .withColumn("rainfall", df["rainfall"].cast('float'))
    .withColumn("school_day", df["school_day"].cast('boolean'))
    .withColumn("holiday", df["holiday"].cast('boolean'))
)

In [25]:
# Side-by-side inspection of the raw frame and the typed frame: for each one,
# print its schema, the id of its underlying RDD, and its first row.
for _frame in (df, df2):
    _frame.printSchema()
    print(_frame.rdd.id())
    print(_frame.head(1))

root
 |-- date: string (nullable = true)
 |-- demand: string (nullable = true)
 |-- RRP: string (nullable = true)
 |-- demand_pos_RRP: string (nullable = true)
 |-- RRP_positive: string (nullable = true)
 |-- demand_neg_RRP: string (nullable = true)
 |-- RRP_negative: string (nullable = true)
 |-- frac_at_neg_RRP: string (nullable = true)
 |-- min_temperature: string (nullable = true)
 |-- max_temperature: string (nullable = true)
 |-- solar_exposure: string (nullable = true)
 |-- rainfall: string (nullable = true)
 |-- school_day: string (nullable = true)
 |-- holiday: string (nullable = true)

60
[Row(date='2015-01-01', demand='99635.03', RRP='25.63369643387471', demand_pos_RRP='97319.24000000002', RRP_positive='26.415952619440922', demand_neg_RRP='2315.79', RRP_negative='-7.239999999999997', frac_at_neg_RRP='0.020833334', min_temperature='13.3', max_temperature='26.9', solar_exposure='23.6', rainfall='0.0', school_day='N', holiday='Y')]
root
 |-- date: timestamp (nullable = true)
 |

In [26]:
# Add `ts`: the `date` timestamp rendered as a string with a 12-hour clock and
# an AM/PM marker, e.g. "2015-01-01 12:00:00AM".
#
# Fixes relative to the previous version of this cell:
#   * The pattern was "yyyy-MM-dd hh:ss:SSa", which has no minutes field - it
#     printed seconds where minutes belong and fraction-of-second where seconds
#     belong. It only looked right because every timestamp in this dataset is
#     midnight. The pattern is now hh:mm:ss; output for midnight values is
#     unchanged.
#   * `date` is already a timestamp in df2, so it is formatted directly instead
#     of being passed through to_timestamp() again.
#   * The remaining withColumn(...).cast(...) calls re-applied the types df2
#     already has and were no-ops; they have been dropped. Column order and
#     schema are unchanged.
df3 = df2.withColumn(
    "ts",
    F.date_format(df2["date"], "yyyy-MM-dd hh:mm:ssa"),
)

In [27]:
# Derive calendar features from the `date` timestamp column:
#   * day   - day of the week as returned by dayofweek (1 = Sunday ... 7 = Saturday)
#   * month - month of the year (1-12)
#   * year  - four-digit year
#
# Previously these were computed from `ts`, a formatted 12-hour string, which
# only worked because Spark implicitly (and leniently) casts the leading
# "yyyy-MM-dd" part of that string back to a date. Reading from the real
# timestamp column yields the same values without depending on that cast.
#
# dayofweek/month/year already return integers, and the other columns already
# carry their final types, so the former re-casts were no-ops and are removed.
# The three new columns are appended in the same order as before.
df4 = (
    df3.withColumn("day", F.dayofweek(df3["date"]))
    .withColumn("month", F.month(df3["date"]))
    .withColumn("year", F.year(df3["date"]))
)

In [28]:
# Remove every row that contains at least one null value, then print
# (row count, column count) for the frame before and after the drop so the
# number of discarded rows is visible.
df5 = df4.dropna()
for _frame in (df4, df5):
    print((_frame.count(), len(_frame.columns)))

(2106, 18)
(2102, 18)


In [29]:
# Collect the cleaned frame (df5) to the driver as a pandas DataFrame and write
# it to a CSV file; the relative path is resolved against the kernel's current
# working directory.
# NOTE: to_csv is called with its defaults, so the pandas index is written as
# an extra, unnamed first column.
_cleaned_pdf = df5.toPandas()
_cleaned_pdf.to_csv('complete_dataset_preprocessed.csv')

  series = series.astype(t, copy=False)
