## Setup PySpark

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, unix_timestamp, lag, when, avg, to_timestamp
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

## Init Spark

In [4]:
# Start (or attach to) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("CryptoTradingBot")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/30 16:14:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load Data

In [5]:
from elasticsearch import Elasticsearch

# Connection and query settings, kept together so they are easy to tune.
ES_HOSTS = ["http://localhost:9200"]
ES_INDEX = "historical"
ES_MAX_HITS = 1000  # upper bound on documents fetched by the single search call

es = Elasticsearch(hosts=ES_HOSTS)

# Fetch up to ES_MAX_HITS documents. The query requests no sort, so the hits
# arrive in whatever order Elasticsearch returns them.
# NOTE(review): newer elasticsearch-py releases deprecate `body=` in favour of
# keyword arguments — confirm against the installed client version.
response = es.search(index=ES_INDEX, body={
    "query": {
        "match_all": {}
    },
    "size": ES_MAX_HITS
})

# Keep only the stored document of each hit.
data = [hit["_source"] for hit in response["hits"]["hits"]]

# Fail fast with a readable message: createDataFrame cannot infer a schema
# from an empty list and would otherwise raise a far less helpful error.
if not data:
    raise ValueError(f"No documents returned from Elasticsearch index {ES_INDEX!r}")

# getOrCreate() hands back the already-running session when there is one.
spark = SparkSession.builder.appName("CryptoTradingBot").getOrCreate()

# Schema is inferred from the documents themselves.
training_df = spark.createDataFrame(data)

training_df.head()

                                                                                

Row(price='95146.52000000', symbol='btcusdt', time='2024-12-29T11:42:13.653000')

In [6]:
# Inspect the inferred column types of the freshly loaded frame.
training_df.printSchema()

root
 |-- price: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- time: string (nullable = true)



## Convert Data

In [None]:
# Turn the raw string columns into typed ones:
#   time         -> timestamp parsed from the string value
#   time_numeric -> unix_timestamp() of the parsed `time` column
#   price        -> double
training_df = (
    training_df
    .withColumn("time", to_timestamp(col("time")))
    .withColumn("time_numeric", unix_timestamp(col("time")))
    .withColumn("price", col("price").cast("double"))
)

training_df.head(2)

In [None]:
# Re-check the schema after the type conversions above.
training_df.printSchema()

In [None]:
# Feature engineering and model training.
#
# Expects `training_df` to have a timestamp `time`, a double `price` and a
# `symbol` column. The cell adds `prev_price`, `ma_price` and `label` to
# `training_df`, and puts the assembled vectors in a separate `features_df`
# so that `training_df` keeps its time/price columns for later cells.

# Columns fed to the model. The loaded data has no volume field, so no
# volume-based feature is listed here (a missing input column makes
# VectorAssembler.transform raise).
FEATURE_COLS = ["time_numeric", "prev_price", "ma_price"]
RF_SEED = 42  # fixed seed so repeated runs build the same forest

training_df = training_df.withColumn("time_numeric", unix_timestamp(col("time")))

# Order ticks per symbol by the full timestamp. `time_numeric` is in whole
# seconds, so ordering by it would leave ticks that share a second in an
# arbitrary order and make lag() non-deterministic. Partitioning by symbol
# keeps one instrument's previous price from leaking into another's.
tick_window = Window.partitionBy("symbol").orderBy("time")

# Price of the immediately preceding tick (null for the first tick).
training_df = training_df.withColumn("prev_price", lag("price", 1).over(tick_window))

# Mean over the current tick and the two before it.
# NOTE(review): this window includes the current price, which also decides
# the label below — possible target leakage; confirm whether
# rowsBetween(-3, -1) was intended.
training_df = training_df.withColumn("ma_price", avg("price").over(tick_window.rowsBetween(-2, 0)))

# label = 1 when the price rose versus the previous tick, else 0.
training_df = training_df.withColumn("label", when(col("price") > col("prev_price"), 1).otherwise(0))

# Drop rows containing any null (e.g. the first tick, which has no prev_price).
training_df = training_df.na.drop()

assembler = VectorAssembler(inputCols=FEATURE_COLS, outputCol="features")
features_df = assembler.transform(training_df).select("features", "label")

rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100, seed=RF_SEED)
model = rf.fit(features_df)

# Predict Buy/Hold decision.
# NOTE(review): predictions are made on the same rows the model was fitted
# on, so they do not measure out-of-sample accuracy.
predictions = model.transform(features_df)
predictions.select("features", "label", "prediction").show()

In [None]:
# Sell thresholds, as fractions of the previous row's price:
# a rise of more than 5% or a drop of more than 2% triggers "SELL".
take_profit = 0.05
stop_loss = 0.02

# Price levels derived from `prev_price` that bound the "HOLD" band.
profit_level = col("prev_price") * (1 + take_profit)
loss_level = col("prev_price") * (1 - stop_loss)

# "SELL" above the upper level or below the lower level, otherwise "HOLD".
# Requires `training_df` to still carry `time`, `price` and `prev_price`.
sell_signal = (
    when(col("price") > profit_level, "SELL")
    .when(col("price") < loss_level, "SELL")
    .otherwise("HOLD")
)
training_df = training_df.withColumn("sell_signal", sell_signal)

training_df.select("time", "price", "sell_signal").show()

In [7]:
# Shut down the Spark session; DataFrames created from it are unusable afterwards.
spark.stop()