In [1]:
import findspark
findspark.init()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkConf

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, VectorIndexer, StringIndexer, Normalizer, StandardScaler, MaxAbsScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator


# Algorithms
from pyspark.ml.classification import DecisionTreeClassifier, LinearSVC, RandomForestClassifier


# Others
from collections import OrderedDict

# Graphs libs
import matplotlib.pyplot as plt

# Supress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

# Spark context simple configuration
spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()

spark.version


'2.2.0'

In [2]:
%load_ext autoreload
%autoreload 2

from Helpers.technical_indicators import calc_ti
from Helpers.generated_features import features_from_OHLC
from Helpers.CustomTS import TrainValidationSplitSorted
from Helpers.best_model_params import *

from ProcessingData.processing import initial_processing, calc_profit, transform_date, train_test_split, complete_processing
from Stats.measures import calc_metrics

In [3]:
ManualSplit = True
SORT = True
CHUNKS = 10

CV = False

DEBUG = False
RANDOM_SEED = 1

In [4]:
# path_to_csv = "s3://stocksets100/Orlen.csv"
path = "./Datasets/Orlen.csv"
df = complete_processing(spark, path)

train, test = train_test_split(spark, df, CHUNKS, SORT, ManualSplit, RANDOM_SEED)

We have 1211 training examples and 134 test examples. 



In [5]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
featuresCols = df.columns
featuresCols.remove('Profit')
featuresCols.remove('id')
print(featuresCols)

['Volume', 'MACD', 'CCI', 'OBV', 'RSI']


In [6]:
# Patrameters grid testing
# classifier = DecisionTreeClassifier(
#     labelCol='Profit', featuresCol="features", minInfoGain=0.01,  maxBins=200)

classifier = RandomForestClassifier(
    labelCol='Profit', featuresCol="features", numTrees=50, maxBins=100)
max_Depth_Range = [2]
min_InstancesPerNode = list(range(5, 15))
paramGrid = ParamGridBuilder() \
    .addGrid(classifier.maxDepth, max_Depth_Range).build() \

# classifier = LinearSVC(regParam=0.5, labelCol='Profit', featuresCol="features")
# max_Iter= [10]
# paramGrid = ParamGridBuilder() \
#     .addGrid(classifier.maxIter, max_Iter).build()

# We define an evaluation metric. This tells Validator how well we are doing by comparing the true
# labels with predictions.

evaluator = MulticlassClassificationEvaluator(
    labelCol=classifier.getLabelCol(),
    metricName='accuracy',
    predictionCol=classifier.getPredictionCol())

In [7]:
# Declare the CrossValidator, which runs model tuning for us.
if CV:
    validator = CrossValidator(
        estimator=classifier,
        evaluator=multi,
        estimatorParamMaps=paramGrid,
        numFolds=2)
else:
    validator = TrainValidationSplitSorted(
        chunks=CHUNKS,
        spark=spark,
        estimator=classifier,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator)

In [8]:
scaler = None
# scaler = Normalizer(inputCol="features", outputCol="features", p=1.0)
# scaler = StandardScaler(inputCol="rawFeatures", outputCol='features', withMean=True, withStd=True)
# scaler = MaxAbsScaler(inputCol="rawFeatures", outputCol="features")

In [9]:
# Vector Assembler
# This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
# Used for assembling features into a vector.
# We will pass all the columns that we are going to use for the prediction to the VectorAssembler and
# it will create a new vector column.
# Creating Final pipeline object
if scaler is not None:
    vectorAssembler_rt = VectorAssembler(
    inputCols=featuresCols, outputCol="rawfeatures")
    pipeline = Pipeline(stages=[vectorAssembler_rt,scaler, validator])
else:
    vectorAssembler_rt = VectorAssembler(
    inputCols=featuresCols, outputCol="features")
    pipeline = Pipeline(stages=[vectorAssembler_rt, validator])

In [10]:
# FITTING!
pipelineModel = pipeline.fit(train)

# Getting the Best Model
best_classifier = pipelineModel.stages[-1].bestModel

In [11]:
# Feature importance
final_features = best_classifier.featureImportances

feature_dict = {}
for feature, importance in zip(featuresCols, final_features):
    feature_dict[feature] = importance

feature_dict = OrderedDict(sorted(feature_dict.items(), key=lambda t: t[1], reverse=True)) 

i = 1
for feature, importance in feature_dict.items():
    print("{} ; {} ; {}".format(i, feature, round(importance, 3)))
    i+=1

1 ; Volume ; 0.642
2 ; MACD ; 0.15
3 ; OBV ; 0.09
4 ; RSI ; 0.072
5 ; CCI ; 0.046


In [12]:
# Making Predictions!
predictions = pipelineModel.transform(test)

In [13]:
#evaluate results
calc_metrics(predictions)
predictions.show(100)

Summary Stats
DenseMatrix([[ 23.,  47.],
             [  4.,  60.]])
Accuracy = 0.6194
Recall = 0.6194
F1 Score = 0.6194
+------+------+------+------+--------+-------+------+--------------------+--------------------+--------------------+----------+
|Volume|    id|Profit|  MACD|     CCI|    OBV|   RSI|            features|       rawPrediction|         probability|prediction|
+------+------+------+------+--------+-------+------+--------------------+--------------------+--------------------+----------+
|   5.0|1228.0|   1.0| 0.785|  97.708|24057.0|63.352|[5.0,0.785,97.708...|[26.5225222145962...|[0.53045044429192...|       0.0|
|   0.0|1229.0|   1.0| 0.801|   88.15|24057.0|68.631|[0.0,0.801,88.15,...|[21.5383210929688...|[0.43076642185937...|       1.0|
| 100.0|1230.0|   0.0| 1.051| 166.881|24157.0|97.778|[100.0,1.051,166....|[37.9994576702553...|[0.75998915340510...|       0.0|
|   0.0|1231.0|   1.0| 1.112| 114.067|24157.0|50.842|[0.0,1.112,114.06...|[21.6635782539120...|[0.4332715650782

In [None]:
if DEBUG != True:
    df_to_plot_rt = predictions.select('prediction', 'Profit')
    df_to_plot_rt = df_to_plot_rt.toPandas()
    plt.figure(figsize=(24, 3))
    plt.plot(df_to_plot_rt)
    plt.legend(df_to_plot_rt.columns)
    plt.show()

In [None]:
import pandas as pd
i = 0
# path_to_csv = "s3://stocksets100/Orlen.csv"
path = "./Datasets/Orlen.csv"
df = complete_processing(spark, path)
ManualSplit = False
results = []
pipeline_test = Pipeline(stages=[vectorAssembler_rt,scaler2, best_classifier])

for i in range(30):
    train, test = train_test_split(spark, df, CHUNKS, SORT, ManualSplit, RANDOM_SEED + i)
    predictions = pipeline_test.fit(test).transform(test)
    
    results.append(calc_metrics(predictions)['accuracy'])
    i+=1
    print("#########################################################################")

In [None]:
from scipy import stats
stats_df = pd.DataFrame( {'Accuracy' : results})
print(stats_df)

In [None]:
plt.boxplot(stats_df['Accuracy'], labels=stats_df.columns)
plt.show()