In [None]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark import SQLContext
spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()
# Print config
# print(spark.sparkContext.getConf().getAll())
path_to_csv = "./Datasets/Orlen.csv"

In [None]:
df = spark.read.csv(path_to_csv, header=True, inferSchema=True)
# print(df.show())

In [None]:
df_cleared = df.drop("Adj Close")
# print(df_cleared.show())

In [None]:
from pyspark.sql.window import Window
import pyspark.sql.functions as func
df_lag = df_cleared.withColumn('prev_day_price',
                               func.lag(df_cleared['Close']).over(
                                   Window.orderBy("Date")))
# df_lag.show()

df_with_prev_day_price = df_lag.withColumn(
    'daily_return',
    (df_lag['Close'] - df_lag['prev_day_price']) / df_lag['Close'])
# df_with_prev_day_price.show()

In [None]:
from pyspark.sql import functions as F
labeled = df_with_prev_day_price.withColumn(
    'profit',
    (F.when(df_with_prev_day_price["daily_return"] < 0, 0).otherwise(1)))
# labeled.show()

In [None]:
final_df = labeled.where(labeled["daily_return"].isNotNull())
result_df = final_df
# final_df.show()

In [None]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
final_df = final_df.drop("prev_day_price").withColumnRenamed(
    existing="daily_return", new="Daily return")

converted_df = final_df.select(
    final_df["Open"].cast("float"), final_df["High"].cast("float"),
    final_df["Low"].cast("float"), final_df["Close"].cast("float"),
    final_df["Daily return"].cast("float"), final_df["profit"].cast("int"))

train, test = converted_df.randomSplit([0.1, 0.9])
print("We have %d training examples and %d test examples." % (train.count(),
                                                              test.count()))

In [None]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
featuresCols = converted_df.columns
featuresCols.remove('profit')
print(featuresCols)

# This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
vectorAssembler_dt = VectorAssembler(
    inputCols=featuresCols, outputCol="rawFeatures")
# This identifies categorical features and indexes them.
vectorIndexer_dt = VectorIndexer(
    inputCol="rawFeatures", outputCol="features", maxCategories=4)

dt = DecisionTreeRegressor(labelCol="Close")

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
# Define a grid of hyperparameters to test:
#  - maxDepth: max depth of each decision tree in the GBT ensemble
# In this example notebook, we keep these values small.
# In practice, to get the highest accuracy, you would likely want to try deeper trees (10 or higher)
# and more trees in the ensemble (>100).

max_Depth_Range = list(range(1, 10))
min_InstancesPerNode = list(range(1, 8))
max_Bins = list(range(16, 48))


paramGrid_dt = ParamGridBuilder()\
    .addGrid(dt.maxDepth, max_Depth_Range)\
    .addGrid(dt.maxBins,max_Bins )\
    .addGrid(dt.minInstancesPerNode,min_InstancesPerNode )\
    .addGrid(dt.maxMemoryInMB, [1500]).build()

# We define an evaluation metric.  This tells CrossValidator how well we are doing by comparing the true
# labels with predictions.
evaluator_dt = RegressionEvaluator(
    metricName="rmse",
    labelCol=dt.getLabelCol(),
    predictionCol=dt.getPredictionCol())

# Declare the CrossValidator, which runs model tuning for us.
cv_dt = CrossValidator(
    estimator=dt, evaluator=evaluator_dt, estimatorParamMaps=paramGrid_dt)

pipeline_dt = Pipeline(stages=[vectorAssembler_dt, vectorIndexer_dt, cv_dt])

pipelineModel_dt = pipeline_dt.fit(train)

predictions_dt = pipelineModel_dt.transform(test)

rmse_dt = evaluator_dt.evaluate(predictions_dt)

print("RMSE on our test set: %g" % rmse_dt)

df_to_plot_dt = predictions_dt.drop("Open", "High", "Low", "Daily return",
                                    "profit", "rawFeatures", 'features')
df_to_plot_dt.show()

import matplotlib.pyplot as plt_dt
df_to_plot_dt = df_to_plot_dt.toPandas()
plt_dt.figure(figsize=(14, 14))
plt_dt.plot(df_to_plot_dt)
plt_dt.show()

In [None]:
# predictions.show()
# print(predictions.schema)
# predictions.select('features').show(20,False)

In [None]:
# Train a RandomForest model.
# Random Forest
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
# Define a grid of hyperparameters to test:
#  - maxDepth: max depth of each decision tree in the GBT ensemble
# In this example notebook, we keep these values small.
# In practice, to get the highest accuracy, you would likely want to try deeper trees (10 or higher)
# and more trees in the ensemble (>100).

featuresCols = converted_df.columns
featuresCols.remove('profit')
# print(featuresCols)

# This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
vectorAssembler_rf = VectorAssembler(
    inputCols=featuresCols, outputCol="rawFeatures")
# This identifies categorical features and indexes them.
vectorIndexer_rf = VectorIndexer(
    inputCol="rawFeatures", outputCol="features", maxCategories=4)

rf = RandomForestRegressor(labelCol='Close', featuresCol="features")

paramGrid_rf = ParamGridBuilder()\
    .addGrid(rf.maxDepth, [2,10,20])\
    .addGrid(rf.maxBins, [16, 32, 64 ,128])\
    .addGrid(rf.minInfoGain, [0.1])\
    .build()

# We define an evaluation metric.  This tells CrossValidator how well we are doing by comparing the true
# labels with predictions.
evaluator_rf = RegressionEvaluator(
    metricName="rmse",
    labelCol=rf.getLabelCol(),
    predictionCol=rf.getPredictionCol())

# Declare the CrossValidator, which runs model tuning for us.
cv_rf = CrossValidator(
    estimator=rf, evaluator=evaluator_rf, estimatorParamMaps=paramGrid_rf)

pipeline_rf = Pipeline(stages=[vectorAssembler_rf, vectorIndexer_rf, cv_rf])

pipelineModel_rf = pipeline_rf.fit(train)

predictions_rf = pipelineModel_rf.transform(test)

rmse_rf = evaluator_rf.evaluate(predictions_rf)

print("RMSE on our test set: %g" % rmse_rf)

df_to_plot_rf = predictions_rf.drop("Open", "High", "Low", "Daily return",
                                    "profit", "rawFeatures", 'features')

import matplotlib.pyplot as plt_rf
df_to_plot_rf = df_to_plot_rf.toPandas()
plt_rf.figure(figsize=(14, 14))
plt_rf.plot(df_to_plot_rf)
plt_rf.show()

In [None]:
# Train a GBTrees model.
# Random Forest
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
# Define a grid of hyperparameters to test:
#  - maxDepth: max depth of each decision tree in the GBT ensemble
# In this example notebook, we keep these values small.
# In practice, to get the highest accuracy, you would likely want to try deeper trees (10 or higher)
# and more trees in the ensemble (>100).

featuresCols = converted_df.columns
featuresCols.remove('profit')
# print(featuresCols)

# This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
vectorAssembler_gbt = VectorAssembler(
    inputCols=featuresCols, outputCol="rawFeatures")
# This identifies categorical features and indexes them.
vectorIndexer_gbt = VectorIndexer(
    inputCol="rawFeatures", outputCol="features", maxCategories=4)

gbt = GBTRegressor(labelCol='Close', featuresCol="features")

paramGrid_gbt = ParamGridBuilder()\
    .addGrid(gbt.maxDepth, [2, 5])\
    .addGrid(gbt.maxIter, [10, 20])\
    .build()

# We define an evaluation metric.  This tells CrossValidator how well we are doing by comparing the true
# labels with predictions.
evaluator_gbt = RegressionEvaluator(
    metricName="rmse",
    labelCol=gbt.getLabelCol(),
    predictionCol=gbt.getPredictionCol())

# Declare the CrossValidator, which runs model tuning for us.
cv_gbt = CrossValidator(
    estimator=gbt, evaluator=evaluator_gbt, estimatorParamMaps=paramGrid_gbt)

pipeline_gbt = Pipeline(
    stages=[vectorAssembler_gbt, vectorIndexer_gbt, cv_gbt])

pipelineModel_gbt = pipeline_gbt.fit(train)

predictions_gbt = pipelineModel_gbt.transform(test)

rmse_gbt = evaluator_gbt.evaluate(predictions_gbt)

print("RMSE on our test set: %g" % rmse_gbt)

df_to_plot_gbt = predictions_gbt.drop("Open", "High", "Low", "Daily return",
                                      "profit", "rawFeatures", 'features')

import matplotlib.pyplot as plt_gbt
df_to_plot_gbt = df_to_plot_gbt.toPandas()
plt_gbt.figure(figsize=(14, 14))
plt_gbt.plot(df_to_plot_gbt)
plt_gbt.show()

In [46]:
import pandas as pd
from pandas_datareader import data, wb
import datetime
from pyspark import SQLContext
import matplotlib.pyplot as plt


def RSI(dataframe, column, window_length, avg_type):
    data = dataframe.toPandas()
    # Get just the close
    close = data[column]
    # Get the difference in price from previous step
    delta = close.diff()
    # Get rid of the first row, which is NaN since it did not have a previous
    # row to calculate the differences
    # Make the positive gains (up) and negative gains (down) Series
    up, down = delta.copy(), delta.copy()
    up[up < 0] = 0
    down[down > 0] = 0
    if avg_type == "EWMA":
        roll_up = up.ewm(com=14).mean()
        roll_down = down.abs().ewm(com=14).mean()
        RS = roll_up / roll_down
        RSI = 100.0 - (100.0 / (1.0 + RS))
    elif avg_type == "SMA":
        roll_up = pd.rolling_mean(up, window_length)
        roll_down = pd.rolling_mean(down.abs(), window_length)
        RS = roll_up / roll_down
        RSI = 100.0 - (100.0 / (1.0 + RS))
    from pyspark.sql.types import FloatType
    temp_df = spark.createDataFrame(data=RSI, schema=FloatType()).fillna(0)
    from pyspark.sql.functions import monotonically_increasing_id
    df1 = temp_df.select("*").withColumn("id", monotonically_increasing_id())
    df1 = df1.withColumnRenamed('value', 'RSI')
    df2 = converted_df.select("*").withColumn("id",
                                              monotonically_increasing_id())
    joined_df = df1.join(df2, df1.id == df2.id)
    return joined_df


test = RSI(converted_df, "Close", 14, 'SMA')

test.show()

+---------+---+-----+-----+-----+-----+-------------+------+---+
|      RSI| id| Open| High|  Low|Close| Daily return|profit| id|
+---------+---+-----+-----+-----+-----+-------------+------+---+
|      0.0|  0|7.802|7.809|7.802|7.809|  0.013189909|     1|  0|
|      0.0|  1| 7.95| 7.95|7.782|7.782|-0.0034695452|     0|  1|
|      0.0|  2|7.688|7.688|7.688|7.688| -0.012226847|     0|  2|
|      0.0|  3|7.526|7.526|7.459|7.459| -0.030701166|     0|  3|
|      0.0|  4|7.328|7.328|7.328|7.328| -0.017876638|     0|  4|
|      0.0|  5|7.514|7.514|7.514|7.514|  0.024753792|     1|  5|
|      0.0|  6|7.317|7.317|7.317|7.317| -0.026923602|     0|  6|
|      0.0|  7|7.502|7.569|7.502|7.569|  0.033293698|     1|  7|
|      0.0|  8|7.701|7.701|7.701|7.701|   0.01714063|     1|  8|
|      0.0|  9|7.671|7.671|7.671|7.671| -0.003910833|     0|  9|
|      0.0| 10|7.935|7.935|7.935|7.935|   0.03327032|     1| 10|
|      0.0| 11|7.978|7.978|7.978|7.978|  0.005389822|     1| 11|
|      0.0| 12|8.089|8.08

	Series.rolling(window=14,center=False).mean()
	Series.rolling(window=14,center=False).mean()
