In [89]:
# Spark libs
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.window import Window
import pyspark.sql.functions as func
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.functions import monotonically_increasing_id
from pyspark import SQLContext
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf
from pyspark.sql.functions import lit
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, VectorIndexer

# Algos# Algorithm
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier

# Others
import pandas as pd
import numpy as np
import datetime
import time

# Graphs libs
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt_dt

# Supress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# S3 Service
# import boto3
# from io import StringIO

# # Let's use Amazon S3
# s3 = boto3.resource('s3')

# Spark context simple configuration
spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()

spark.version

'2.2.0'

In [90]:
import numpy as np

from pyspark.ml.tuning import TrainValidationSplit, TrainValidationSplitModel
from pyspark.sql.functions import rand
from pandas import Series
from sklearn.model_selection import TimeSeriesSplit


class TrainValidationSplitSorted(TrainValidationSplit):

    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        tRatio = self.getOrDefault(self.trainRatio)
        seed = self.getOrDefault(self.seed)
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels
        condition = (df[randCol] >= tRatio)
        
        value = 10
        if df is not None:
            df.show()
        dfp = df.toPandas()
        dfp = np.array_split(dfp, value)
        train = spark.createDataFrame(data=dfp[0])
        for i in range(1, value):
            p = spark.createDataFrame(data=dfp[i])
            train = train.union(p)
                
       
        validation = spark.createDataFrame(data=dfp[i])
        
        validation = validation.sort(validation.id.asc())
        train = train.sort(train.id.asc())
        
#         validation = df.filter(condition)
#         train = df.filter(~condition)
        
        print(train)
        train.show(20)
        print ('#######################################################################')
        validation.show(20)
        models = est.fit(train, epm)
        for j in range(numModels):
            model = models[j]
            metric = eva.evaluate(model.transform(validation, epm[j]))
            metrics[j] += metric
        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(TrainValidationSplitModel(bestModel, metrics))

In [91]:
OBV_i = False
CCI_i = False
RSI_i = False
MACD_i = False
ManualSplit = False
Date_Convert = False

DEBUG = False
SORT = True
CV = False

DT = True

RANDOM_SEED = 1

In [92]:
def features_from_OHLC(spark_df):
    """ Generate features for a stock/index based on 
           historical price and performance
    Args:
        df (dataframe with columns "Open", "Close", "High", 
               "Low", "Volume", "Adjusted Close")
    Returns:
        dataframe, data set with new features
    """
    df = spark_df
    df = df.toPandas()

    # 31 original features
    # average price
    df['avg_price_5'] = pd.rolling_mean(df['Close'], window=5).shift(1)
    df['avg_price_30'] = pd.rolling_mean(df['Close'], window=21).shift(1)
    df['avg_price_365'] = pd.rolling_mean(df['Close'], window=252).shift(1)
    df['ratio_avg_price_5_30'] = df['avg_price_5'] / df['avg_price_30']
    df['ratio_avg_price_5_365'] = df['avg_price_5'] / df['avg_price_365']
    df['ratio_avg_price_30_365'] = df['avg_price_30'] / df['avg_price_365']
    # average volume
    df['avg_volume_5'] = pd.rolling_mean(df['Volume'], window=5).shift(1)
    df['avg_volume_30'] = pd.rolling_mean(df['Volume'], window=21).shift(1)
    df['avg_volume_365'] = pd.rolling_mean(df['Volume'], window=252).shift(1)
    df['ratio_avg_volume_5_30'] = df['avg_volume_5'] / df['avg_volume_30']
    df['ratio_avg_volume_5_365'] = df['avg_volume_5'] / df['avg_volume_365']
    df['ratio_avg_volume_30_365'] = df['avg_volume_30'] / df['avg_volume_365']
    # standard deviation of prices
    df['std_price_5'] = pd.rolling_std(df['Close'], window=5).shift(1)
    df['std_price_30'] = pd.rolling_std(df['Close'], window=21).shift(1)
    df['std_price_365'] = pd.rolling_std(df['Close'], window=252).shift(1)
    df['ratio_std_price_5_30'] = df['std_price_5'] / df['std_price_30']
    df['ratio_std_price_5_365'] = df['std_price_5'] / df['std_price_365']
    df['ratio_std_price_30_365'] = df['std_price_30'] / df['std_price_365']
    # standard deviation of volumes
    df['std_volume_5'] = pd.rolling_std(df['Volume'], window=5).shift(1)
    df['std_volume_30'] = pd.rolling_std(df['Volume'], window=21).shift(1)
    df['std_volume_365'] = pd.rolling_std(df['Volume'], window=252).shift(1)
    df['ratio_std_volume_5_30'] = df['std_volume_5'] / df['std_volume_30']
    df['ratio_std_volume_5_365'] = df['std_volume_5'] / df['std_volume_365']
    df['ratio_std_volume_30_365'] = df['std_volume_30'] / df['std_volume_365']
    # return
    df['return_1'] = ((
        df['Close'] - df['Close'].shift(1)) / df['Close'].shift(1)).shift(1)
    df['return_5'] = ((
        df['Close'] - df['Close'].shift(5)) / df['Close'].shift(5)).shift(1)
    df['return_30'] = ((
        df['Close'] - df['Close'].shift(21)) / df['Close'].shift(21)).shift(1)
    df['return_365'] = ((df['Close'] - df['Close'].shift(252)) /
                        df['Close'].shift(252)).shift(1)
    df['moving_avg_5'] = pd.rolling_mean(df['return_1'], window=5)
    df['moving_avg_30'] = pd.rolling_mean(df['return_1'], window=21)
    df['moving_avg_365'] = pd.rolling_mean(df['return_1'], window=252)
    df = df.dropna(axis=0)
    result_df = spark.createDataFrame(df.round(3))
    return result_df

In [93]:
def RSI(dataframe, window_length, avg_type, column='Close'):
    data = dataframe.toPandas()
    # Get just the close
    close = data['Close']
    # Get the difference in price from previous step
    delta = close.diff()
    # Get rid of the first row, which is NaN since it did not have a previous
    # row to calculate the differences
    # Make the positive gains (up) and negative gains (down) Series
    up, down = delta.copy(), delta.copy()
    up[up < 0] = 0
    down[down > 0] = 0
    if avg_type == "EWMA":
        roll_up = up.ewm(span=window_length, min_periods=window_length).mean()
        roll_down = down.abs().ewm(
            span=window_length, min_periods=window_length).mean()
    elif avg_type == "SMA":
        roll_up = pd.rolling_mean(up, window_length)
        roll_down = pd.rolling_mean(down.abs(), window_length)
    RS = roll_up / roll_down
    RSI = 100.0 - (100.0 / (1.0 + RS))
    RSI = pd.DataFrame({'RSI': RSI})
    data = data.join(RSI)
    result_df = spark.createDataFrame(data.round(3))
    return result_df

In [94]:
# Commodity Channel Index
def CCI(spark_df, ndays):
    data = spark_df.toPandas()
    TP = (data['High'] + data['Low'] + data['Close']) / 3
    CCI = pd.Series(
        (TP - pd.rolling_mean(TP, ndays)) /
        (0.015 * pd.rolling_std(TP, ndays)),
        name='CCI')
    data = data.join(CCI)
    result_df = spark.createDataFrame(data.round(3))
    return result_df

In [95]:
# Moving average convergence divergence
def MACD(dataframe, nfast=12, nslow=26, signal=9, column='Close'):
    data = dataframe.toPandas()
    # Get just the close
    price = data[column]
    # Get the difference in price from previous step
    emaslow = pd.ewma(price, span=nslow, min_periods=1)
    emafast = pd.ewma(price, span=nfast, min_periods=1)
    #     MACD = pd.DataFrame({'MACD': emafast-emaslow, 'emaSlw': emaslow, 'emaFst': emafast})
    MACD = pd.DataFrame({'MACD': emafast - emaslow})

    data = data.join(MACD.round(3))
    result_df = spark.createDataFrame(data)
    return result_df

In [96]:
# UDF FUNCTIONS
def profit_(today_price, previous_day_price):
    if today_price - previous_day_price > 0:
        return 1
    else:
        return 0

profit_udf = udf(profit_, IntegerType())

In [97]:
#Path to file with CSV

# path_to_csv = "s3://stocksets100/Orlen.csv"
path_to_csv = "./Datasets/Orlen.csv"

fresh_df = spark.read.csv(path_to_csv, header=True, inferSchema=True)

fresh_df = fresh_df.filter(fresh_df.Open != "null")

temporary_df = fresh_df.select(
    fresh_df["Date"].cast("Date"), fresh_df["Open"].cast("float"),
    fresh_df["High"].cast("float"), fresh_df["Volume"].cast("int"),
    fresh_df["Low"].cast("float"), fresh_df["Close"].cast("float"))

# id
temporary_df = temporary_df.select("*").withColumn(
    "id", monotonically_increasing_id())

# featured_df = features_from_OHLC(temporary_df)
featured_df = temporary_df
featured_df.columns

['Date', 'Open', 'High', 'Volume', 'Low', 'Close', 'id']

In [98]:
# Creating new column with shifted Close price by 1 day
df_daily_return = featured_df.withColumn('prev_day_price',
                                         func.lag(featured_df['Close']).over(
                                             Window.orderBy("id")))

df_daily_return = df_daily_return.filter(
    df_daily_return.prev_day_price.isNotNull())

# Profit label calculation
# 1 if stock risen up, 0 is it went down
df_profit = df_daily_return.withColumn(
    'Profit', profit_udf(df_daily_return.Close,
                         df_daily_return.prev_day_price))

df_shifted_profit = df_profit.withColumn(
    'Profit',
    func.lag(df_profit['Profit'], count=-1).over(Window.orderBy("Date")))

final_df = df_shifted_profit.filter(df_shifted_profit.Profit.isNotNull())

# Removing redudant columns
final_df = final_df.drop("Daily return")
final_df = final_df.drop("prev_day_price")

# converted_df = converted_df.select("id", 'Date', 'Open', 'High', 'Volume',
#                                    'Low', 'Close', 'Profit')
if DEBUG:
    final_df.show(2)
    
converted_df = final_df

In [99]:
#Columns without Date
# converted_df = converted_df.select(
#     [col(c).cast('float') for c in converted_df.columns if c. not in {'Date'}])
# Date column!
df_date = converted_df.select(converted_df.Date)
df_date = df_date.select("*").withColumn("id", monotonically_increasing_id())

#Convert date to splitted format
if Date_Convert:
    split_col = pyspark.sql.functions.split(converted_df['Date'], '-')
    converted_df = converted_df.withColumn('Year',
                                           split_col.getItem(0).cast('int'))
    converted_df = converted_df.withColumn('Month',
                                           split_col.getItem(1).cast('int'))
    converted_df = converted_df.withColumn('Day',
                                           split_col.getItem(2).cast('int'))
    if DEBUG:
        converted_df.show()

converted_df = converted_df.drop("Date")

In [100]:
if MACD_i:
    converted_df = MACD(converted_df)
    converted_df = converted_df.select(
        [col(c).cast('float') for c in converted_df.columns])
    if DEBUG:
        converted_df.show()

In [101]:
if CCI_i:
    converted_df = CCI(converted_df, 14)
    converted_df = converted_df.select(
        [col(c).cast('float') for c in converted_df.columns])
    converted_df = converted_df.filter(converted_df.CCI != "NaN")
    if DEBUG:
        converted_df.show()

In [102]:
# OBV indicator calculation
if OBV_i:
    temp_df = converted_df.toPandas()
    df_obv = spark.createDataFrame(
        temp_df.assign(OBV=(temp_df.Volume * (
            ~temp_df.Close.diff().le(0) * 2 - 1)).cumsum()))
    converted_df = df_obv.select(
        [col(c).cast('float') for c in df_obv.columns])
    if DEBUG:
        converted_df.show()

In [103]:
#RSI indicator calculaction
if RSI_i:
    converted_df = RSI(converted_df, 3, 'SMA')
    converted_df = converted_df.filter(converted_df.RSI != "NaN")
    if DEBUG:
        converted_df.show()

In [104]:
# if DEBUG:
#     df_date = df_date.set_index('id')
#     df_to_plot_dt = converted_df.select([
#         c for c in converted_df.columns
#         if c not in
#         {'OBV', 'Volume', 'Low', 'High', 'Open', 'Profit', 'RSI', 'CCI'}
#     ])
#     df_to_plot_dt = df_to_plot_dt.withColumn('Zero',lit(0))
#     df_to_plot_dt = df_to_plot_dt.toPandas()
#     df_to_plot_dt = df_to_plot_dt.set_index('id')
#     df_to_plot_dt = df_to_plot_dt.join(df_date)
#     plt_dt.figure(figsize=(24, 10))
#     plt_dt.plot(df_to_plot_dt.Date, df_to_plot_dt.Close,df_to_plot_dt.Zero )
#     plt_dt.legend(df_to_plot_dt.columns)
#     plt_dt.show()

In [105]:
converted_df = converted_df.sort(converted_df.id.asc())

# Manual split for training and validating data
if ManualSplit:
    dfp = converted_df.toPandas()

    dfp = np.array_split(dfp, 10)

    p0 = spark.createDataFrame(data=dfp[0])
    p1 = spark.createDataFrame(data=dfp[1])
    p2 = spark.createDataFrame(data=dfp[2])
    p3 = spark.createDataFrame(data=dfp[3])
    p4 = spark.createDataFrame(data=dfp[4])
    p5 = spark.createDataFrame(data=dfp[5])
    p6 = spark.createDataFrame(data=dfp[6])
    p7 = spark.createDataFrame(data=dfp[7])
    p8 = spark.createDataFrame(data=dfp[8])
    p9 = spark.createDataFrame(data=dfp[9])

    p_final = p0.union(p1).union(p2).union(p3).union(p4).union(p5).union(
        p6).union(p7).union(p8)
    train = p_final
    test = p9
    #     test = p9.head(10)
    #     test = spark.createDataFrame(test)
else:
    train, test = converted_df.randomSplit([0.9, 0.1], seed=RANDOM_SEED)

print("We have %d training examples and %d test examples." % (train.count(),
                                                              test.count()))
test = test.select([col(c).cast('float') for c in test.columns])

train = train.select([col(c).cast('float') for c in train.columns])

if SORT:
    test = test.sort(test.id.asc())
    train = train.sort(train.id.asc())

if DEBUG:
    train.show()

We have 1212 training examples and 149 test examples.


In [106]:
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
featuresCols = converted_df.columns
featuresCols.remove('Profit')
featuresCols.remove('id')

print(featuresCols)

# Vector Assembler
# This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
# Used for assembling features into a vector.
# We will pass all the columns that we are going to use for the prediction to the VectorAssembler and
# it will create a new vector column.
vectorAssembler_rt = VectorAssembler(
    inputCols=featuresCols, outputCol="features")

# VectorIndexer:
# is used to index categorical predictors in a featuresCol column.
# Remember that featuresCol is a single column consisting of vectors (refer to featuresCol and labelCol).
# Each row is a vector which contains values from each predictors.

# featureIndexer_rt = VectorIndexer(
#     inputCol="features", outputCol="indexed", maxCategories=len(featuresCols))



['Open', 'High', 'Volume', 'Low', 'Close']


In [107]:
# Patrameters grid testing
# rt = DecisionTreeClassifier(
#     labelCol='Profit', featuresCol="features", minInfoGain=0.01,  maxBins=200)

rt = RandomForestClassifier(
    labelCol='Profit', featuresCol="features", numTrees=25, maxBins=300)

max_Depth_Range = list(range(5, 15))
min_InstancesPerNode = list(range(5, 15))

paramGrid = ParamGridBuilder() \
    .addGrid(rt.maxDepth, max_Depth_Range) \
    .addGrid(rt.minInstancesPerNode, min_InstancesPerNode) \
    .addGrid(rt.maxMemoryInMB, [1000] ).build()

# We define an evaluation metric. This tells CrossValidator how well we are doing by comparing the true
# labels with predictions.

multi = MulticlassClassificationEvaluator(
    labelCol=rt.getLabelCol(),
    metricName='accuracy',
    predictionCol=rt.getPredictionCol())

evaluator_rt = BinaryClassificationEvaluator(
    labelCol=rt.getLabelCol(),
    metricName='areaUnderROC',
    rawPredictionCol=rt.getRawPredictionCol())

evaluator_rt_PR = BinaryClassificationEvaluator(
    labelCol=rt.getLabelCol(),
    metricName='areaUnderPR',
    rawPredictionCol=rt.getRawPredictionCol())

# Declare the CrossValidator, which runs model tuning for us.
if CV:
    val = CrossValidator(
        estimator=rt,
        evaluator=multi,
        estimatorParamMaps=paramGrid,
        numFolds=2)
else:
    val = TrainValidationSplitSorted(
        estimator=rt,
        estimatorParamMaps=paramGrid,
        evaluator=multi,
        # 85% of the data will be used for training, 15% for validation.
        trainRatio=0.85)

In [108]:
from pyspark.ml.classification import RandomForestClassificationModel

RandomForestClassificationModel.getMaxDepth = (
    lambda self: self._java_obj.getMaxDepth())

RandomForestClassificationModel.getMinInstancesPerNode = (
    lambda self: self._java_obj.getMinInstancesPerNode())

In [109]:
# if DEBUG:
#     train.repartition(1).write.csv("PreProcessedSets/TrainSet_" + str(time.mktime(datetime.datetime.today().timetuple())) + "_.csv", header = 'True')
#     test.repartition(1).write.csv("PreProcessedSets/TestSet_" + str(time.mktime(datetime.datetime.today().timetuple())) + "_.csv", header = 'True')

In [110]:
# Creating Final pipeline object
pipeline_rt = Pipeline(stages=[vectorAssembler_rt, val])

# FITTING!
pipelineModel_rt = pipeline_rt.fit(train)

# Getting the Best Model
best_classifier = pipelineModel_rt.stages[-1].bestModel

Py4JJavaError: An error occurred while calling o2315.showString.
: java.lang.NullPointerException
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply2_2$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec$$anonfun$executeCollect$1.apply(limit.scala:136)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec$$anonfun$executeCollect$1.apply(limit.scala:136)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:136)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:2853)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2153)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2153)
	at org.apache.spark.sql.Dataset$$anonfun$55.apply(Dataset.scala:2837)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:2836)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2153)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2366)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:245)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
# Feature importance
print('Features importances' + str(best_classifier.featureImportances))
final_features = best_classifier.featureImportances
   
for feature, importance in zip(featuresCols, final_features):
    print("{} - {}".format(feature, round(importance, 3)))

In [None]:
# Model Parameters!
# Max depth
print("Maximal depth is " + str(best_classifier.getMaxDepth()))
max_depth = best_classifier.getMaxDepth()

# Min instances
print("Minimal instances per node is " + str(
    best_classifier.getMinInstancesPerNode()))
min_instancesPerNode = best_classifier.getMinInstancesPerNode()

# Making Predictions!
predictions_rt = pipelineModel_rt.transform(test)

In [None]:
# Accuracy
accuracy = multi.evaluate(predictions_rt)
print("Accuracy is equal to {}%".format(round(accuracy, 3)))

In [None]:
# Calculating metrics
AreaUnderROC = evaluator_rt.evaluate(predictions_rt)
print("AreaUnderROC on our test set: %g" % AreaUnderROC)

# Calculating metrics
AreaUnderPR = evaluator_rt_PR.evaluate(predictions_rt)
print("AreaUnderPR on our test set: %g" % AreaUnderPR)

#evaluate results
testCount = predictions_rt.count()

FP = predictions_rt.where("prediction = 1 AND Profit = 0").count()
FN = predictions_rt.where("prediction = 0 AND Profit = 1").count()
TP = predictions_rt.where("prediction = 1 AND Profit = 1").count()
TN = predictions_rt.where("prediction = 0 AND Profit = 0").count()

print("Count | FP | FN | TP | TN")
print(
    str(testCount) + " | " + str(FP) + " | " + str(FN) + " | " + str(TP) +
    " | " + str(TN))

# predictions_rt.show()

In [None]:
test = predictions_rt.toPandas()
# csv_buffer = StringIO()
# test.to_csv(csv_buffer)

# s3_resource = boto3.resource('s3')

# s3_resource.Object('logs102', 'DT_Final.csv').put(Body=csv_buffer.getvalue())

if DEBUG != True:
    df_to_plot_rt = predictions_rt.select('prediction', 'Profit')
    print(df_to_plot_rt)
    df_to_plot_rt = df_to_plot_rt.toPandas()
    plt_dt.figure(figsize=(24, 3))
    plt_dt.plot(df_to_plot_rt)
    plt_dt.legend(df_to_plot_rt.columns)
    plt_dt.show()
    

In [None]:
# final_model = pipelineModel_rt
from random import *

for i in range(30):
    new_train, new_test = converted_df.randomSplit([0.1, 0.9], seed=i + 1)
    new_test = new_test.sort(new_test.id.asc())

    predictions = pipelineModel_rt.transform(new_test)

    # Calculating metrics
    AreaUnderROC = evaluator_rt.evaluate(predictions)
    print("AreaUnderROC on our test set: %g" % AreaUnderROC)

    # Calculating metrics
    AreaUnderPR = evaluator_rt_PR.evaluate(predictions)
    print("AreaUnderPR on our test set: %g" % AreaUnderPR)

    # Accuracy
    accuracy = multi.evaluate(predictions)
    print("Accuracy is equal to {}%".format(round(accuracy, 3)))

    #evaluate results
    testCount = predictions.count()

    FP = predictions.where("prediction = 1 AND Profit = 0").count()
    FN = predictions.where("prediction = 0 AND Profit = 1").count()   
    TP = predictions.where("prediction = 1 AND Profit = 1").count() 
    TN = predictions.where("prediction = 0 AND Profit = 0").count()

    print("Count | FP | FN | TP | TN")
    print(
        str(testCount) + " | " + str(FP) + " | " + str(FN) + " | " + str(TP) +
        " | " + str(TN))

    print("####################################################")