## Setup PySpark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, lag, when, avg, to_timestamp
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

## Init Spark

In [2]:
spark = SparkSession.builder.appName("CryptoTradingBot").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/31 13:40:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load Data

In [4]:
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=["http://localhost:9200"])

response = es.search(index="historical", body={
    "query": {
        "match_all": {}
    },
    "size": 1000  
})

data = [hit["_source"] for hit in response["hits"]["hits"]]

spark = SparkSession.builder.appName("CryptoTradingBot").getOrCreate()

training_df = spark.createDataFrame(data)

training_df.head()

                                                                                

Row(price='95146.52000000', symbol='btcusdt', time='2024-12-29T11:42:13.653000')

In [5]:
training_df.printSchema()

root
 |-- price: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- time: string (nullable = true)



## Convert Data

In [6]:
training_df = training_df.withColumn("time", to_timestamp(col("time")))

training_df = training_df.withColumn("time_numeric", unix_timestamp(col("time")))

training_df = training_df.withColumn("price", col("price").cast("double"))

training_df.head(2)

[Row(price=95146.52, symbol='btcusdt', time=datetime.datetime(2024, 12, 29, 11, 42, 13, 653000), time_numeric=1735472533),
 Row(price=95146.52, symbol='btcusdt', time=datetime.datetime(2024, 12, 29, 11, 42, 14, 824000), time_numeric=1735472534)]

In [7]:
training_df.printSchema()

root
 |-- price: double (nullable = true)
 |-- symbol: string (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- time_numeric: long (nullable = true)



In [9]:
window_spec = Window.orderBy("time_numeric")
training_df = training_df.withColumn("prev_price", lag("price", 1).over(window_spec))

In [10]:
training_df = training_df.withColumn("label", when(col("price") > col("prev_price"), 1).otherwise(0))
training_df = training_df.na.drop()

In [12]:
assembler = VectorAssembler(inputCols=["time_numeric", "prev_price"], outputCol="features")
training_df = assembler.transform(training_df).select("features", "label")

In [13]:
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100)
model = rf.fit(training_df)

25/01/31 13:50:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/31 13:50:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/31 13:50:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/31 13:50:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/31 13:50:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/31 13:50:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/31 1

In [14]:
predictions = model.transform(training_df)
predictions.select("features", "label", "prediction").show()

25/01/31 13:51:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/31 13:51:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/31 13:51:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/31 13:51:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/31 13:51:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/31 13:51:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/31 1

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[1.735472537E9,95...|    1|       1.0|
|[1.735472537E9,95...|    0|       1.0|
|[1.735472537E9,95...|    1|       1.0|
|[1.735472539E9,95...|    1|       1.0|
|[1.73547254E9,951...|    0|       1.0|
|[1.73547254E9,951...|    0|       1.0|
|[1.73547254E9,951...|    0|       1.0|
|[1.73547254E9,951...|    0|       1.0|
|[1.735472566E9,95...|    1|       1.0|
|[1.735472566E9,95...|    1|       1.0|
|[1.735472576E9,95...|    0|       1.0|
|[1.735472576E9,95...|    0|       1.0|
|[1.735472588E9,95...|    1|       1.0|
|[1.735472588E9,95...|    1|       1.0|
|[1.735472589E9,95...|    1|       1.0|
|[1.735472592E9,95...|    1|       1.0|
|[1.735472594E9,95...|    0|       1.0|
|[1.735472594E9,95...|    1|       1.0|
|[1.735472594E9,95...|    0|       1.0|
|[1.735472594E9,95...|    0|       1.0|
+--------------------+-----+----------+
only showing top 20 rows



In [None]:
# Sell if profit > 5% or loss > 2%
take_profit = 0.05
stop_loss = 0.02

training_df = training_df.withColumn(
    "sell_signal",
    when((col("price") > col("prev_price") * (1 + take_profit)), "SELL")
    .when((col("price") < col("prev_price") * (1 - stop_loss)), "SELL")
    .otherwise("HOLD")
)

training_df.select("time", "price", "sell_signal").show()

In [7]:
spark.stop()