In [57]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Spark session & context
spark = SparkSession.builder.master('local[2]').getOrCreate()
sc = spark.sparkContext

Import libraries for training the ML models

In [58]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression


When we read in Time Series data, we need to "bin" it appropriately.  

We need to decide what to do if we have several values in a bin.  Take the average?  Just pick one?

In [79]:
# units are seconds
ts_bin_size = 60 * 60 * 24  # Round to nearest day

Read the OHLC data

In [80]:
bitcoin_price_ohlc_DF = spark.read.csv("data/btc/price-ohlc.csv", inferSchema=True, header=True)
bitcoin_price_ohlc_DF = bitcoin_price_ohlc_DF.withColumn(
  'ts_bin',
  F.round(F.col('time') / ts_bin_size)
)
bitcoin_price_ohlc_DF.printSchema()


root
 |-- time: integer (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- ts_bin: double (nullable = true)



Next, read in price average data

In [81]:
bitcoin_price_DF = spark.read.csv("data/btc/price.csv", inferSchema=True, header=True)
bitcoin_price_DF = bitcoin_price_DF.withColumn(
  'ts_bin',
  F.round(F.col('time') / ts_bin_size)
)
bitcoin_price_DF.show()


+----------+--------------------+-------+
|      time|               price| ts_bin|
+----------+--------------------+-------+
|1279324800|0.049510000000000005|14807.0|
|1279411200|             0.08584|14808.0|
|1279497600|              0.0808|14809.0|
|1279584000| 0.07473333333333332|14810.0|
|1279670400|             0.07921|14811.0|
|1279756800|            0.055945|14812.0|
|1279843200|0.062283333333333336|14813.0|
|1279929600|             0.05454|14814.0|
|1280016000|              0.0505|14815.0|
|1280102400|               0.056|14816.0|
|1280188800|0.059844444444444436|14817.0|
|1280275200|              0.0589|14818.0|
|1280361600| 0.06920000000000001|14819.0|
|1280448000| 0.06428333333333333|14820.0|
|1280534400|             0.06785|14821.0|
|1280620800|              0.0611|14822.0|
|1280707200|                0.06|14823.0|
|1280793600|                0.06|14824.0|
|1280880000|             0.05795|14825.0|
|1280966400|               0.061|14826.0|
+----------+--------------------+-

We can do a filtered binned join to join these together, which is necessary because our timestamps don't match perfectly

In [82]:
combined = bitcoin_price_ohlc_DF.join(bitcoin_price_DF, bitcoin_price_DF.ts_bin == bitcoin_price_ohlc_DF.ts_bin, 'outer') \
  .select(
    bitcoin_price_ohlc_DF.ts_bin, 
    bitcoin_price_ohlc_DF.time, 
    bitcoin_price_ohlc_DF.open, 
    bitcoin_price_ohlc_DF.close, 
    bitcoin_price_DF.price
    ) \
  .sort(F.desc("ts_bin"))
combined.show()


In [None]:
feature_assembler = VectorAssembler(inputCols=["time", "open", "high", "low", "close"], handleInvalid='skip')
output = feature_assembler.transform(bitcoin_price_ohlc_DF)
output.limit(2).show()

+----------+--------------------+--------------------+--------------------+--------------------+--------+------------------------------------+
|      time|                open|                high|                 low|               close|  ts_bin|VectorAssembler_749876957ca6__output|
+----------+--------------------+--------------------+--------------------+--------------------+--------+------------------------------------+
|1279407600|0.049510000000000005|0.049510000000000005|0.049510000000000005|0.049510000000000005|355391.0|                [1.2794076E9,0.04...|
|1279422000|             0.05941|             0.05941|             0.05941|             0.05941|355395.0|                [1.279422E9,0.059...|
+----------+--------------------+--------------------+--------------------+--------------------+--------+------------------------------------+



In [None]:
regressor = LinearRegression(featuresCol='', labelCol='')