In [1]:
import findspark
findspark.init()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkConf

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, VectorIndexer, StringIndexer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Algorithms
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier

# Others
from collections import OrderedDict

# Graphs libs
import matplotlib.pyplot as plt

# Suppress noisy library warnings (FutureWarning first, then UserWarning)
import warnings
for _ignored_category in (FutureWarning, UserWarning):
    warnings.simplefilter(action='ignore', category=_ignored_category)

# Spark session configured from a default SparkConf
spark_conf = SparkConf()
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

# Last expression of the cell: shows the Spark version as the cell output
spark.version

'2.2.0'

In [2]:
%load_ext autoreload
%autoreload 2

from Helpers.technical_indicators import calc_ti
from Helpers.generated_features import features_from_OHLC
from Helpers.CustomTS import TrainValidationSplitSorted
from Helpers.best_model_params import *

from ProcessingData.processing import initial_processing, calc_profit, transform_date, train_test_split, complete_processing
from Stats.measures import calc_metrics

In [3]:
# --- Data splitting: these three are passed to train_test_split ---
CHUNKS = 5            # also handed to TrainValidationSplitSorted as `chunks`
SORT = True
ManualSplit = True

# --- Tuning strategy: True -> CrossValidator, False -> TrainValidationSplitSorted ---
CV = False

# --- Miscellaneous ---
RANDOM_SEED = 1       # seed argument given to train_test_split
DEBUG = False         # when True, the prediction plot cell is skipped

In [4]:
# Input CSV for this run.
# (alternative source kept for reference: "s3://stocksets100/Orlen.csv")
path = "./Datasets/KGHA.csv"

# Build the working DataFrame with the project's complete_processing helper.
df = complete_processing(spark, path)

# Split into train and test sets using the settings from the config cell.
train, test = train_test_split(
    spark,
    df,
    CHUNKS,
    SORT,
    ManualSplit,
    RANDOM_SEED,
)

We have 1046 training examples and 261 test examples. 



In [5]:
# Feature columns: every column of df except the label column 'Profit'
# and the 'id' column. A ValueError is raised if either one is missing.
featuresCols = list(df.columns)
for _non_feature in ('Profit', 'id'):
    featuresCols.remove(_non_feature)

print(featuresCols)

# VectorAssembler
# Combines all columns listed in featuresCols into one vector column named
# "features", which is the column the classifier is configured to read.
vectorAssembler_rt = VectorAssembler(inputCols=featuresCols, outputCol="features")

['Open', 'High', 'Volume', 'Low', 'Close', 'avg_price_5', 'avg_price_30', 'avg_price_365', 'ratio_avg_price_5_30', 'ratio_avg_price_5_365', 'ratio_avg_price_30_365', 'avg_volume_5', 'avg_volume_30', 'avg_volume_365', 'ratio_avg_volume_5_30', 'ratio_avg_volume_5_365', 'ratio_avg_volume_30_365', 'std_price_5', 'std_price_30', 'std_price_365', 'ratio_std_price_5_30', 'ratio_std_price_5_365', 'ratio_std_price_30_365', 'std_volume_5', 'std_volume_30', 'std_volume_365', 'ratio_std_volume_5_30', 'ratio_std_volume_5_365', 'ratio_std_volume_30_365', 'return_1', 'return_5', 'return_30', 'return_365', 'moving_avg_5', 'moving_avg_30', 'moving_avg_365', 'MACD', 'CCI', 'OBV', 'RSI']


In [6]:
# Parameter grid testing
# Alternative estimator kept for reference:
# rt = DecisionTreeClassifier(
#     labelCol='Profit', featuresCol="features", minInfoGain=0.01,  maxBins=200)

# The forest is seeded explicitly with RANDOM_SEED so that repeated runs of the
# notebook train the same trees; without it the seed falls back to a default
# that is not tied to the notebook's configuration.
rt = RandomForestClassifier(
    labelCol='Profit', featuresCol="features", numTrees=10, maxBins=300,
    seed=RANDOM_SEED)

max_Depth_Range = [3]
# NOTE(review): min_InstancesPerNode is defined but is not added to the grid
# below -- confirm whether it should be tuned or can be dropped.
min_InstancesPerNode = list(range(5, 15))

# Grid searched by the validator: one maxDepth value and a fixed maxMemoryInMB.
paramGrid = ParamGridBuilder() \
    .addGrid(rt.maxDepth, max_Depth_Range) \
    .addGrid(rt.maxMemoryInMB, [1000] ).build()

# We define an evaluation metric. This tells the validator how well we are doing
# by comparing the true labels with the predictions (accuracy here).

evaluator = MulticlassClassificationEvaluator(
    labelCol=rt.getLabelCol(),
    metricName='accuracy',
    predictionCol=rt.getPredictionCol())

In [None]:
# Choose the model-tuning stage:
#   CV is True  -> Spark's CrossValidator with 2 folds
#   CV is False -> the project's TrainValidationSplitSorted
# Both use the same estimator, parameter grid and accuracy evaluator.
if CV:
    val = CrossValidator(
        estimator=rt,
        # Fixed: this previously referenced the undefined name `multi`, which
        # raised NameError as soon as CV was switched on.
        evaluator=evaluator,
        estimatorParamMaps=paramGrid,
        numFolds=2,
        # Seed the fold assignment so cross-validation is repeatable.
        seed=RANDOM_SEED)
else:
    val = TrainValidationSplitSorted(
        chunks = CHUNKS,
        spark = spark,
        estimator=rt,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator)

In [None]:
# Two-stage pipeline: assemble the feature vector, then run the tuning stage.
pipeline_stages = [vectorAssembler_rt, val]
pipeline_rt = Pipeline(stages=pipeline_stages)

# Fit the whole pipeline on the training split.
pipelineModel_rt = pipeline_rt.fit(train)

# The last fitted stage is the tuning model; its bestModel is the selected classifier.
fitted_tuning_stage = pipelineModel_rt.stages[-1]
best_classifier = fitted_tuning_stage.bestModel

In [None]:
# Feature importance
final_features = best_classifier.featureImportances

# Pair every feature name with its importance (featuresCols is the same list
# that was given to the VectorAssembler as inputCols), then order the pairs
# from the most to the least important.
feature_dict = OrderedDict(
    sorted(
        dict(zip(featuresCols, final_features)).items(),
        key=lambda pair: pair[1],
        reverse=True))

# Print one "rank ; feature ; importance" line per feature. enumerate supplies
# the 1-based rank, so no hand-maintained counter leaks into the notebook
# namespace.
for rank, (feature, importance) in enumerate(feature_dict.items(), start=1):
    print("{} ; {} ; {}".format(rank, feature, round(importance, 3)))

In [None]:
# Run the fitted pipeline over the test split to obtain the predictions
# that the following cells evaluate and plot.
predictions = pipelineModel_rt.transform(test)

In [None]:
# Evaluate the test-set predictions with the project's calc_metrics helper;
# as the last expression of the cell, its return value is displayed.
calc_metrics(predictions)

In [None]:
# Plot predicted vs. actual labels for the predictions DataFrame.
# The plot is skipped when DEBUG is switched on.
if not DEBUG:
    # Only the two plotted columns are collected into a pandas DataFrame.
    df_to_plot_rt = predictions.select('prediction', 'Profit').toPandas()

    # Explicit figure/axes instead of the pyplot state machine.
    fig, ax = plt.subplots(figsize=(24, 3))
    ax.plot(df_to_plot_rt)
    ax.legend(df_to_plot_rt.columns)
    # Title and axis labels so the figure can be read on its own.
    ax.set_title('Predicted vs. actual Profit')
    ax.set_xlabel('Row')
    ax.set_ylabel('Label')
    plt.show()

In [None]:
import pandas as pd

# Repeated evaluation: score the already-fitted pipeline on N_RUNS different
# train/test splits and collect the accuracy of each run in `results`.
N_RUNS = 30

# path_to_csv = "s3://stocksets100/Orlen.csv"
path = "./Datasets/KGHA.csv"
df = complete_processing(spark, path)

# NOTE(review): pipelineModel_rt was fitted earlier on the original train
# split and is not refitted here; the `train` part of each new split is unused.
# The test rows drawn below may therefore overlap with rows the model was
# trained on -- confirm this is the intended evaluation protocol.
results = []
for run in range(N_RUNS):
    # Each run uses its own seed (RANDOM_SEED + run). The manual-split flag is
    # passed as a literal False so the notebook-wide ManualSplit setting is not
    # overwritten, which would silently change the earlier split cell on re-run.
    train, test = train_test_split(spark, df, CHUNKS, SORT, False, RANDOM_SEED + run)
    predictions = pipelineModel_rt.transform(test)
    results.append(calc_metrics(predictions)['accuracy'])
    print("#########################################################################")

In [None]:
from scipy import stats

# Gather the per-run accuracies into a one-column DataFrame and print it.
accuracy_column = {'Accuracy': results}
stats_df = pd.DataFrame(accuracy_column)
print(stats_df)

In [None]:
# Box plot of the accuracies collected over the repeated runs,
# labelled with the DataFrame's column name.
accuracy_values = stats_df['Accuracy']
plt.boxplot(accuracy_values, labels=stats_df.columns)
plt.show()