In [None]:
import findspark
findspark.init()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkConf

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, VectorIndexer, StringIndexer, Normalizer, StandardScaler, MaxAbsScaler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator


# Algorithms
from pyspark.ml.classification import DecisionTreeClassifier, LinearSVC, RandomForestClassifier


# Others
from collections import OrderedDict

# Graphs libs
import matplotlib.pyplot as plt

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

# Minimal Spark setup: a named application with 4 GB of driver memory.
conf = (
    SparkConf()
    .setAppName('ipython-notebook')
    .set("spark.driver.memory", "4g")
)

# Reuse an already running session if there is one, otherwise start a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Last expression of the cell: displays the Spark version in use.
spark.version


In [None]:
# Display the effective Spark configuration as a list of (key, value) pairs,
# e.g. to check that the driver-memory setting was picked up.
spark.sparkContext.getConf().getAll()

In [None]:
# Reload project modules automatically before each cell runs, so edits to the
# helper packages below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Project-local helpers.
# NOTE(review): the wildcard imports hide where names used later in this
# notebook (complete_processing, train_test_split, getDecisonTreewithGrid,
# validate, get_metrics, calc_metrics, ...) come from — presumably these
# modules; confirm and consider importing them explicitly.
from Helpers.technical_indicators import calc_ti
from Helpers.generated_features import features_from_OHLC
from Helpers.CustomTS import TrainValidationSplitSorted
from Helpers.best_model_params import *
from Helpers.Models import *
from Helpers.parse import tree_json

from ProcessingData.processing import *
from Stats.measures import *
from Helpers.udf import BuyAndHoldClassifier, ReverseTradeClassifier

In [None]:
# Seed forwarded to the split helpers (train_test_split / validate).
RANDOM_SEED = 1

# Fold sizes handed to train_test_split and TrainValidationSplitSorted.
TRAIN_FOLD = 7
TEST_FOLD = 3

# Split-mode flag passed straight to train_test_split.
ManualSplit = False

# True  -> tune with the project's TrainValidationSplitSorted,
# False -> tune with Spark's random TrainValidationSplit.
SORTED = False

In [None]:
# Load the WIG20 dataset through the project's complete_processing helper
# (presumably adds the engineered features — confirm in ProcessingData.processing).
path = "./Datasets/WIG20.csv"
df = complete_processing(spark, path)

# Split into train and test sets using the fold sizes, split-mode flag and
# seed defined in the configuration cell.
train, test = train_test_split(spark, df, TRAIN_FOLD, TEST_FOLD, ManualSplit, RANDOM_SEED)

In [None]:
# Feature columns: every column of the processed dataset except 'Profit'
# (presumably the label — confirm) and 'id' (presumably a row identifier).
featuresCols = list(df.columns)
for excluded_col in ('Profit', 'id'):
    featuresCols.remove(excluded_col)
print(featuresCols)

In [None]:
# Parameter grid for tuning the decision tree: the project helper returns the
# estimator together with the grid of parameter maps to try.
classifier, paramGrid = getDecisonTreewithGrid(
    max_Bins=250,
    min_InstancesPerNode=[1],
    max_Depth_Range=[6, 7],
    min_infoGain=[1e-06],
)

# Accuracy evaluator wired to the classifier's own label / prediction columns.
evaluator = MulticlassClassificationEvaluator(
    predictionCol=classifier.getPredictionCol(),
    labelCol=classifier.getLabelCol(),
    metricName='accuracy',
)

In [None]:
# Declare the validator that tunes `classifier` over `paramGrid`.
# Both variants are train/validation splitters, not a k-fold CrossValidator.
if SORTED:
    # Project-specific splitter driven by the fold sizes
    # (presumably keeps the rows in order — confirm in Helpers.CustomTS).
    validator = TrainValidationSplitSorted(
        train_fold=TRAIN_FOLD,
        test_fold=TEST_FOLD,
        spark=spark,
        estimator=classifier,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator)
else:
    validator = TrainValidationSplit(
        # Same proportion as the folds used elsewhere (7 / (7 + 3) == 0.7)
        # instead of a second, independent literal.
        trainRatio=TRAIN_FOLD / (TRAIN_FOLD + TEST_FOLD),
        estimator=classifier,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        # The split is random: tie it to the notebook seed so that tuning is
        # reproducible across kernel restarts.
        seed=RANDOM_SEED)

In [None]:
# Feature scaler placed between the VectorAssembler and the validator.
# MaxAbsScaler divides each feature by its maximum absolute value ([-1, 1] range).
# Set `scaler = None` to skip scaling: the pipeline cells test `scaler is not None`.
scaler = MaxAbsScaler(inputCol="rawFeatures", outputCol="features")

In [None]:
# Assemble the final pipeline.
# The VectorAssembler concatenates all feature columns into one vector column.
# With a scaler, that column is "rawFeatures" and the scaler turns it into
# "features"; without one, the assembler writes "features" directly.
assembled_col = "rawFeatures" if scaler is not None else "features"
vectorAssembler_rt = VectorAssembler(inputCols=featuresCols, outputCol=assembled_col)

stages = [vectorAssembler_rt]
if scaler is not None:
    stages.append(scaler)
stages.append(validator)

pipeline = Pipeline(stages=stages)

In [None]:
# Fit the whole pipeline (assembler, optional scaler, validator) on the
# training set. `start` / `end` are wall-clock timestamps in seconds; the
# elapsed time is reported in the next cell.
import time
print("Training Started!")
start = time.time()
pipelineModel = pipeline.fit(train)
end = time.time()

In [None]:
# Elapsed training time, converted from seconds to whole minutes.
elapsed_minutes = round((end - start) / 60)
print(f"Training finished in: {elapsed_minutes}")

In [None]:
# The validator is the last pipeline stage; its fitted model exposes the
# best-scoring classifier from the parameter grid as `bestModel`.
best_classifier = pipelineModel.stages[-1].bestModel

In [None]:
# Inspect the selected tree with the project helpers (presumably they report
# the chosen hyper-parameters and the per-feature importances — confirm in Helpers).
best_tree_par(best_classifier)
tree_feature_importances(best_classifier,featuresCols)

In [None]:
# Apply the fitted pipeline to the held-out test set; the result carries the
# model output columns (a 'prediction' column is read from it further below).
predictions = pipelineModel.transform(test)

In [None]:
# Evaluate the test-set predictions with the project's calc_metrics helper
# (presumably reports the classification metrics — confirm in Stats.measures).
calc_metrics(predictions)

In [None]:
# Pull the predicted and 'Profit' columns to the driver as a pandas frame and
# draw both series on one wide figure.
df_to_plot_rt = predictions.select('prediction', 'Profit').toPandas()

fig, ax = plt.subplots(figsize=(24, 3))
ax.plot(df_to_plot_rt)
ax.legend(df_to_plot_rt.columns)
plt.show()

In [None]:
import pandas as pd
from random import randint

# Repeated evaluation of the tuned classifier on the verification dataset:
# score N_MC_RUNS different validate() splits and collect the metrics of each.
N_MC_RUNS = 30

path = "./Datasets/WIG20Verify.csv"
df = complete_processing(spark, path)
ManualSplit = False

results_MC = []   # one metrics dict per accepted run
Mlavg_a = []      # accuracy per run
Mlavg_se = []     # sensitivity per run
Mlavg_sp = []     # specificity per run
Mlavg_p = []      # precision per run

# Reuse the assembler (and the scaler, when enabled) in front of the already
# fitted best model.
if scaler is not None:
    pipeline_test = Pipeline(stages=[vectorAssembler_rt, scaler, best_classifier])
else:
    pipeline_test = Pipeline(stages=[vectorAssembler_rt, best_classifier])

i = 0  # seed offset; counts attempts, not accepted runs
while len(results_MC) < N_MC_RUNS:
    # Advance the offset before anything can `continue`. Previously a split
    # rejected by get_metrics left `i` untouched, so (assuming validate is
    # deterministic for a given seed) the same split was retried forever.
    seed = RANDOM_SEED + i
    i += 1

    train, test = validate(df, seed)
    # NOTE(review): fitting on `test` re-estimates the scaler on the evaluation
    # sample; `pipelineModel.transform(test)` would reuse the training-time
    # scaling instead — confirm which one is intended.
    predictions = pipeline_test.fit(test).transform(test)
    temp_dcit = get_metrics(predictions, 0.1)
    if temp_dcit is None:
        # Split rejected by get_metrics: try the next seed.
        continue

    results_MC.append(temp_dcit)
    Mlavg_a.append(temp_dcit['accuracy'])
    Mlavg_se.append(temp_dcit['sensitivity'])
    Mlavg_sp.append(temp_dcit['specificity'])
    Mlavg_p.append(temp_dcit['precision'])

In [None]:
# Accuracy of every model run, followed by the mean of each metric.
# The x axis is derived from the number of collected runs instead of a
# hard-coded 30, so the cell keeps working if the run count changes.
plt.scatter(list(range(1, len(Mlavg_a) + 1)), Mlavg_a)
plt.xlabel('Nr próbki')
plt.ylabel('Jakość w %')

plt.show()

# Mean accuracy, sensitivity, specificity and precision (in that order).
avg_a, avg_se, avg_sp, avg_p = (
    sum(values) / len(values)
    for values in (Mlavg_a, Mlavg_se, Mlavg_sp, Mlavg_p)
)
for mean_value in (avg_a, avg_se, avg_sp, avg_p):
    print(round(mean_value, 2))

In [None]:
from scipy import stats

# D'Agostino-Pearson normality test on the model accuracies.
# Null hypothesis: the sample comes from a normal distribution.
alpha = 0.05
k2, p = stats.normaltest(Mlavg_a)
print(p)
print("The null hypothesis can be rejected" if p < alpha
      else "The null hypothesis cannot be rejected")

In [14]:
from sklearn.dummy import DummyClassifier

# Baseline evaluation: score N_BASELINE_RUNS different validate() splits of the
# simply-processed dataset directly with get_metrics (no fitted model involved).
# NOTE(review): the model runs use WIG20Verify.csv while this baseline uses
# OrlenVerify.csv, yet both accuracy lists are later compared with a paired
# test — confirm the dataset is intended.
N_BASELINE_RUNS = 30

# Not read by any visible cell; kept in case later cells rely on them.
avg_accuracy = avg_sensitivity = avg_specificity = avg_precision = 0

path = "./Datasets/OrlenVerify.csv"
ManualSplit = False

results_BC = []   # one metrics dict per accepted run
Blavg_a = []      # accuracy per run
Blavg_se = []     # sensitivity per run
Blavg_sp = []     # specificity per run
Blavg_p = []      # precision per run

# Loop-invariant: same arguments on every iteration, so process the file once.
df = simple_processing(spark, path)

i = 0  # seed offset; counts attempts, not accepted runs
while len(results_BC) < N_BASELINE_RUNS:
    # Advance the offset before anything can `continue`. Previously a split
    # rejected by get_metrics left `i` untouched, so (assuming validate is
    # deterministic for a given seed) the same split was retried forever.
    seed = RANDOM_SEED + i
    i += 1

    train, test = validate(df, seed)
    temp_dcit = get_metrics(test, 0.0)
    if temp_dcit is None:
        # Split rejected by get_metrics: try the next seed.
        continue

    results_BC.append(temp_dcit)
    Blavg_a.append(temp_dcit['accuracy'])
    Blavg_se.append(temp_dcit['sensitivity'])
    Blavg_sp.append(temp_dcit['specificity'])
    Blavg_p.append(temp_dcit['precision'])

0.458,0.472,0.444,0.465,0.451
0.488,0.494,0.482,0.496,0.48
0.487,0.498,0.477,0.483,0.492
0.499,0.496,0.502,0.544,0.455
0.468,0.451,0.486,0.488,0.449
0.469,0.453,0.486,0.484,0.455
0.488,0.481,0.496,0.502,0.475
0.47,0.44,0.504,0.5,0.444
0.481,0.478,0.485,0.52,0.442
0.472,0.477,0.467,0.502,0.442
0.468,0.476,0.459,0.481,0.453
0.478,0.491,0.464,0.498,0.457
0.5,0.479,0.519,0.473,0.525
0.46,0.427,0.494,0.47,0.451
0.468,0.43,0.506,0.472,0.464
0.498,0.5,0.496,0.502,0.494
0.51,0.483,0.537,0.516,0.504
0.483,0.463,0.504,0.509,0.459
0.499,0.494,0.504,0.53,0.469
0.491,0.494,0.487,0.516,0.466
0.489,0.471,0.506,0.475,0.502
0.495,0.471,0.519,0.488,0.502
0.489,0.494,0.485,0.479,0.5
0.473,0.445,0.5,0.462,0.484
0.502,0.514,0.49,0.508,0.496
0.482,0.472,0.494,0.5,0.466
0.479,0.461,0.498,0.488,0.471
0.482,0.459,0.506,0.492,0.473
0.475,0.46,0.493,0.519,0.434
0.479,0.489,0.469,0.469,0.489


In [None]:
# Accuracy of every baseline run, followed by the mean of each metric.
# The x axis is derived from the number of collected runs instead of a
# hard-coded 30, so the cell keeps working if the run count changes.
plt.scatter(list(range(1, len(Blavg_a) + 1)), Blavg_a)
plt.xlabel('Nr próbki')
plt.ylabel('Jakość w %')

plt.show()

# Mean accuracy, sensitivity, specificity and precision (in that order).
Bavg_a, Bavg_se, Bavg_sp, Bavg_p = (
    sum(values) / len(values)
    for values in (Blavg_a, Blavg_se, Blavg_sp, Blavg_p)
)
for mean_value in (Bavg_a, Bavg_se, Bavg_sp, Bavg_p):
    print(round(mean_value, 2))

In [None]:
from scipy import stats

# D'Agostino-Pearson normality test on the baseline accuracies.
# Null hypothesis: the sample comes from a normal distribution.
alpha = 0.05
k2, p = stats.normaltest(Blavg_a)
print(p)
print("The null hypothesis can be rejected" if p < alpha
      else "The null hypothesis cannot be rejected")

In [None]:
from scipy import stats

# Levene's test on the two accuracy samples.
# Null hypothesis: both samples have equal variances.
alpha = 0.05
w, p = stats.levene(Mlavg_a, Blavg_a)
print(p)
print("The null hypothesis can be rejected" if p < alpha
      else "The null hypothesis cannot be rejected")

# Paired (related-samples) t-test on the same two lists.
# Null hypothesis: the two related samples have identical mean values.
alpha = 0.05
s, p = stats.ttest_rel(Mlavg_a, Blavg_a)
print(p)
print("The null hypothesis can be rejected" if p < alpha
      else "The null hypothesis cannot be rejected")

In [None]:
from scipy import stats

# Two-column frame holding the per-run accuracies from Blavg_a and Mlavg_a,
# one row per run, used by the box plot in the next cell.
stats_df = pd.DataFrame({
    'Accuracy_BC': Blavg_a,
    'Accuracy_MC': Mlavg_a,
})


In [None]:
# Box plot of both accuracy columns on a fixed 0.4-0.8 y range.
stats_df.plot.box(figsize=(10, 10))
axes = plt.gca()
axes.set_ylim(bottom=0.4, top=0.8)
plt.show()