In [None]:
import findspark
findspark.init()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkConf

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, VectorIndexer, StringIndexer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Algorithms
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier

# Others
import pandas as pd
import numpy as np
import datetime
import time

# Graphs libs
import matplotlib.pyplot as plt

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

# Build a Spark session from a default SparkConf, or reuse the one already running.
spark = (
    SparkSession.builder
    .config(conf=SparkConf())
    .getOrCreate()
)

# Cell output: version of the Spark runtime behind the session.
spark.version

In [None]:
%load_ext autoreload
%autoreload 2

from Helpers.technical_indicators import calc_ti
from Helpers.generated_features import features_from_OHLC
from Helpers.CustomTS import TrainValidationSplitSorted
from Helpers.best_model_params import *

from ProcessingData.processing import initial_processing, calc_profit, transform_date, train_test_split, complete_processing
from Stats.measures import calc_metrics

In [None]:
# Run configuration read by the cells below.

# ManualSplit, SORT and CHUNKS are forwarded to train_test_split(); CHUNKS is also
# passed to TrainValidationSplitSorted. Their exact effect is defined in those helpers.
ManualSplit = True
SORT = True
CHUNKS = 10

# True  -> tune with CrossValidator; False -> tune with TrainValidationSplitSorted.
CV = False

# The plotting cell runs only while DEBUG is not True.
DEBUG = False
# Seed forwarded to train_test_split().
RANDOM_SEED = 1

In [None]:
# Input dataset for this run. An alternative S3 source is kept here for reference:
# path_to_csv = "s3://stocksets100/Orlen.csv"
path = "./Datasets/KGHA.csv"
# complete_processing comes from ProcessingData.processing; presumably it reads the CSV at
# `path` and prepares it (TODO confirm). Later cells rely on the result having 'Profit'
# and 'id' columns.
df = complete_processing(spark, path)

In [None]:
# Split the processed data into train and test sets. CHUNKS, SORT, ManualSplit and
# RANDOM_SEED come from the configuration cell; how they drive the split is defined in
# ProcessingData.processing.train_test_split.
train, test = train_test_split(spark, df, CHUNKS, SORT, ManualSplit, RANDOM_SEED)

In [None]:
# Feature columns: every column of df except the label column 'Profit' and the 'id' column.
# list.remove raises ValueError if either column is missing, which surfaces schema problems early.
featuresCols = df.columns
for excluded in ('Profit', 'id'):
    featuresCols.remove(excluded)

print(featuresCols)

# Vector Assembler
# Concatenates all feature columns into a single vector column named "features",
# which is the column the classifier reads its inputs from.
vectorAssembler_rt = VectorAssembler(
    outputCol="features",
    inputCols=featuresCols,
)

In [None]:
# Parameter grid testing
# rt = DecisionTreeClassifier(
#     labelCol='Profit', featuresCol="features", minInfoGain=0.01,  maxBins=200)

# Random forest over the assembled "features" vector, predicting the 'Profit' label.
# The seed is pinned to RANDOM_SEED so that repeated runs (and fresh kernels) grow the
# same trees; without it the forest differs from run to run.
rt = RandomForestClassifier(
    labelCol='Profit', featuresCol="features", numTrees=10, maxBins=300,
    seed=RANDOM_SEED)

max_Depth_Range = [3]
# NOTE(review): min_InstancesPerNode is defined but never added to the grid below.
min_InstancesPerNode = list(range(5, 15))

# Grid of hyper-parameter combinations handed to the validator.
# With one value per parameter this currently yields a single combination.
paramGrid = ParamGridBuilder() \
    .addGrid(rt.maxDepth, max_Depth_Range) \
    .addGrid(rt.maxMemoryInMB, [1000] ).build()

# We define an evaluation metric. This tells the validator how well we are doing by
# comparing the true labels with predictions (plain accuracy here).

evaluator = MulticlassClassificationEvaluator(
    labelCol=rt.getLabelCol(),
    metricName='accuracy',
    predictionCol=rt.getPredictionCol())

In [None]:
# Declare the model-selection stage that runs the tuning for us:
#   CV = True  -> k-fold CrossValidator (2 folds, seeded for reproducible fold assignment)
#   CV = False -> the project's TrainValidationSplitSorted helper
# Both score candidates with `evaluator`, the accuracy evaluator built in the previous
# cell. (The original referenced `multi`, which no cell in this notebook defines.)
if CV:
    val = CrossValidator(
        estimator=rt,
        evaluator=evaluator,
        estimatorParamMaps=paramGrid,
        numFolds=2,
        seed=RANDOM_SEED)
else:
    val = TrainValidationSplitSorted(
        chunks=CHUNKS,
        spark=spark,
        estimator=rt,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator)

In [None]:
# Two-stage pipeline: assemble the feature vector, then run the tuning stage `val`.
pipeline_rt = Pipeline(stages=[vectorAssembler_rt, val])

# Fit both stages on the training split.
pipelineModel_rt = pipeline_rt.fit(train)

# The fitted tuning stage sits last in the pipeline and exposes the winning model.
fitted_stages = pipelineModel_rt.stages
best_classifier = fitted_stages[-1].bestModel

In [None]:
# Feature importances of the selected model: first the raw vector, then one line per
# feature column (same order as the assembler inputs), rounded to three decimals.
final_features = best_classifier.featureImportances
print('Features importances' + str(final_features))

for name, weight in zip(featuresCols, final_features):
    rounded = round(weight, 3)
    print("{} - {}".format(name, rounded))

In [None]:
# Making Predictions!
# Run the fitted pipeline (feature assembly + selected model) over the test split.
predictions = pipelineModel_rt.transform(test)

In [None]:
# Evaluate results: hand the test-set predictions to calc_metrics (imported from
# Stats.measures; what it computes and reports is defined there).
calc_metrics(predictions)

In [None]:
# Plot predicted vs. actual 'Profit' for the test-set predictions.
# Uses `predictions` (built by the "Making Predictions" cell) and the `plt` alias imported
# at the top of the notebook; the original referenced `predictions_rt` and `plt_dt`,
# which no cell defines.
if not DEBUG:
    predicted_vs_actual = predictions.select('prediction', 'Profit')
    print(predicted_vs_actual)
    # Collect to pandas for matplotlib; kept under its own name so one variable is
    # not both a Spark and a pandas frame.
    df_to_plot_rt = predicted_vs_actual.toPandas()
    plt.figure(figsize=(24, 3))
    plt.plot(df_to_plot_rt)
    plt.legend(df_to_plot_rt.columns)
    plt.show()

In [None]:
# final_model = pipelineModel_rt
# NOTE(review): nothing in this cell uses a name from `random`; the import is kept only
# in case later cells rely on it.
from random import *

# Repeated evaluation of the fitted pipeline on 30 seeded random resamples of the data.
#
# The original cell referenced `converted_df`, `evaluator_rt`, `evaluator_rt_PR` and
# `multi`, none of which is defined anywhere in this notebook. They are replaced by the
# processed frame `df`, two area-under-curve evaluators defined right here, and the
# accuracy `evaluator` from the parameter-grid cell.
evaluator_rt = BinaryClassificationEvaluator(
    labelCol='Profit', rawPredictionCol='rawPrediction', metricName='areaUnderROC')
evaluator_rt_PR = BinaryClassificationEvaluator(
    labelCol='Profit', rawPredictionCol='rawPrediction', metricName='areaUnderPR')

for i in range(30):
    # NOTE(review): each resample covers ~90% of the full dataset, so it presumably
    # overlaps the rows the model was trained on; treat these numbers as optimistic.
    new_train, new_test = df.randomSplit([0.1, 0.9], seed=i + 1)
    new_test = new_test.sort(new_test.id.asc())

    # Cached because the frame is scanned by eight separate Spark actions below.
    predictions = pipelineModel_rt.transform(new_test).cache()

    # Calculating metrics
    AreaUnderROC = evaluator_rt.evaluate(predictions)
    print("AreaUnderROC on our test set: %g" % AreaUnderROC)

    # Calculating metrics
    AreaUnderPR = evaluator_rt_PR.evaluate(predictions)
    print("AreaUnderPR on our test set: %g" % AreaUnderPR)

    # Accuracy: the evaluator returns a fraction in [0, 1]; scale it so the '%' is right.
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy is equal to {}%".format(round(accuracy * 100, 1)))

    # Confusion-matrix counts
    testCount = predictions.count()

    FP = predictions.where("prediction = 1 AND Profit = 0").count()
    FN = predictions.where("prediction = 0 AND Profit = 1").count()
    TP = predictions.where("prediction = 1 AND Profit = 1").count()
    TN = predictions.where("prediction = 0 AND Profit = 0").count()

    print("Count | FP | FN | TP | TN")
    print(
        str(testCount) + " | " + str(FP) + " | " + str(FN) + " | " + str(TP) +
        " | " + str(TN))

    print("####################################################")

    # Release the cached blocks before the next resample.
    predictions.unpersist()