## Setup PySpark

In [87]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, unix_timestamp, lag
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.regression import LinearRegression

## Init Spark

In [88]:
spark = SparkSession.builder.appName("BitcoinPricePrediction").getOrCreate()

## Load Data

In [89]:
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=["http://localhost:9200"])

response = es.search(index="historical", body={
    "query": {
        "match_all": {}
    },
    "size": 1000  
})

data = [hit["_source"] for hit in response["hits"]["hits"]]

spark = SparkSession.builder.appName("BitcoinPricePrediction").getOrCreate()

training_df = spark.createDataFrame(data)

training_df.head()

Row(price='95146.52000000', symbol='btcusdt', time='2024-12-29T11:42:13.653000')

In [90]:
training_df.printSchema()

root
 |-- price: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- time: string (nullable = true)



## Convert Data

In [91]:

training_df = training_df.withColumn("time", to_timestamp(col("time")))

training_df = training_df.withColumn("time_numeric", unix_timestamp(col("time")))

training_df = training_df.withColumn("price", col("price").cast("double"))

training_df.head(2)

[Row(price=95146.52, symbol='btcusdt', time=datetime.datetime(2024, 12, 29, 11, 42, 13, 653000), time_numeric=1735472533),
 Row(price=95146.52, symbol='btcusdt', time=datetime.datetime(2024, 12, 29, 11, 42, 14, 824000), time_numeric=1735472534)]

In [92]:
training_df.printSchema()

root
 |-- price: double (nullable = true)
 |-- symbol: string (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- time_numeric: long (nullable = true)



## Feature Engineering


In [93]:
window_spec = Window.orderBy("time_numeric")

training_df = training_df.withColumn("prev_price", lag("price", 1).over(window_spec))

training_df = training_df.na.drop()

training_df.head(2)

25/01/29 14:29:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


[Row(price=95146.52, symbol='btcusdt', time=datetime.datetime(2024, 12, 29, 11, 42, 14, 824000), time_numeric=1735472534, prev_price=95146.52),
 Row(price=95147.64, symbol='btcusdt', time=datetime.datetime(2024, 12, 29, 11, 42, 17, 836000), time_numeric=1735472537, prev_price=95146.52)]

In [94]:
training_df.printSchema()

root
 |-- price: double (nullable = true)
 |-- symbol: string (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- time_numeric: long (nullable = true)
 |-- prev_price: double (nullable = true)



## Split Data

##### For time series models (such as Bitcoin predictions), randomSplit() is not ideal because future values are not randomly distributed. Instead, a rolling split should be used so that the model learns only from past data.

In [95]:
training_df = training_df.orderBy("time_numeric")

split_index = int(training_df.count() * 0.8)

train_data = training_df.limit(split_index)
test_data = training_df.subtract(train_data)

print(f"Train: {train_data.count()} rows, Test: {test_data.count()} rows")

25/01/29 14:29:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:48 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 1

Train: 799 rows, Test: 199 rows


## Vectorize

In [96]:
feature_cols = ["time_numeric", "prev_price"]
featureassembler=VectorAssembler(inputCols=feature_cols,outputCol="Independent Features")
train_data_output=featureassembler.transform(train_data)
print(train_data_output.show(5))
print(train_data_output.columns)

25/01/29 14:29:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:29:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------+-------+--------------------+------------+----------+--------------------+
|   price| symbol|                time|time_numeric|prev_price|Independent Features|
+--------+-------+--------------------+------------+----------+--------------------+
|95146.52|btcusdt|2024-12-29 11:42:...|  1735472534|  95146.52|[1.735472534E9,95...|
|95147.64|btcusdt|2024-12-29 11:42:...|  1735472537|  95146.52|[1.735472537E9,95...|
|95147.64|btcusdt|2024-12-29 11:42:...|  1735472537|  95147.64|[1.735472537E9,95...|
|95147.84|btcusdt|2024-12-29 11:42:...|  1735472537|  95147.64|[1.735472537E9,95...|
|95147.86|btcusdt|2024-12-29 11:42:...|  1735472539|  95147.84|[1.735472539E9,95...|
+--------+-------+--------------------+------------+----------+--------------------+
only showing top 5 rows

None
['price', 'symbol', 'time', 'time_numeric', 'prev_price', 'Independent Features']


## Prepare Data ML Model

In [None]:
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(train_data_output)
training_data_output = scaler_model.transform(train_data_output)

training_data_output.select("features", "scaled_features").show(truncate=False)

In [None]:
finalized_data=training_data_output.select("Independent Features","price")
print(finalized_data.show(5))

25/01/29 13:59:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 13:59:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 13:59:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 13:59:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 13:59:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------+-------+--------------------+------------+----------+--------------------+
|   price| symbol|                time|time_numeric|prev_price|Independent Features|
+--------+-------+--------------------+------------+----------+--------------------+
|95146.52|btcusdt|2024-12-29 11:42:...|  1735472534|  95146.52|[1.735472534E9,95...|
|95147.64|btcusdt|2024-12-29 11:42:...|  1735472537|  95146.52|[1.735472537E9,95...|
|95147.64|btcusdt|2024-12-29 11:42:...|  1735472537|  95147.64|[1.735472537E9,95...|
|95147.84|btcusdt|2024-12-29 11:42:...|  1735472537|  95147.64|[1.735472537E9,95...|
|95147.86|btcusdt|2024-12-29 11:42:...|  1735472539|  95147.84|[1.735472539E9,95...|
+--------+-------+--------------------+------------+----------+--------------------+
only showing top 5 rows

None
['price', 'symbol', 'time', 'time_numeric', 'prev_price', 'Independent Features']


25/01/29 13:59:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 13:59:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 13:59:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 13:59:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 13:59:47 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------------------+--------+
|Independent Features|   price|
+--------------------+--------+
|[1.735472534E9,95...|95146.52|
|[1.735472537E9,95...|95147.64|
|[1.735472537E9,95...|95147.64|
|[1.735472537E9,95...|95147.84|
|[1.735472539E9,95...|95147.86|
+--------------------+--------+
only showing top 5 rows

None


## Split Data

In [79]:
#train_data,test_data=finalized_data.randomSplit([0.75,0.25])
#train_data.show(2)

## Train Model

In [81]:
regressor=LinearRegression(featuresCol="Independent Features",labelCol="price")
regressor=regressor.fit(finalized_data)

25/01/29 14:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 14:01:18 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/01/29 1

In [82]:
print(f"Coefficients: {regressor.coefficients}")
print(f"Intercept: {regressor.intercept}")

Coefficients: [-0.6013950992702369,0.21311176201088236]
Intercept: 1043779918.4383444


In [85]:
regressor.save("regression_model version 1")

In [5]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Elasticsearch to Pyspark').getOrCreate()

25/01/29 09:24:47 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [6]:
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=["http://localhost:9200"])

response = es.search(index="historical", body={
    "query": {
        "match_all": {}
    },
    "size": 1000  
})

data = [hit["_source"] for hit in response["hits"]["hits"]]

In [9]:
from pyspark.sql import SparkSession

# Spark Session erstellen
spark = SparkSession.builder.appName("Elasticsearch to PySpark").getOrCreate()

training_df = spark.createDataFrame(data)


#from pyspark.ml.feature import VectorAssembler
#assembler = VectorAssembler(inputCols=["feature1", "feature2"], outputCol="features")
#dataset = assembler.transform(df).select("features", "label")

In [10]:
training_df.head()

Row(price='95146.52000000', symbol='btcusdt', time='2024-12-29T11:42:13.653000')

## read data

In [20]:
#training=spark.read.csv('/home/ubuntu/CryptoBot/regression/SalaryData.csv',  header=True, inferSchema=True, nullValue="NULL")

In [11]:
training_df.show()

+--------------+-------+--------------------+
|         price| symbol|                time|
+--------------+-------+--------------------+
|95146.52000000|btcusdt|2024-12-29T11:42:...|
|95146.52000000|btcusdt|2024-12-29T11:42:...|
|95147.64000000|btcusdt|2024-12-29T11:42:...|
|95147.64000000|btcusdt|2024-12-29T11:42:...|
|95147.84000000|btcusdt|2024-12-29T11:42:...|
|95147.86000000|btcusdt|2024-12-29T11:42:...|
|95147.86000000|btcusdt|2024-12-29T11:42:...|
|95147.84000000|btcusdt|2024-12-29T11:42:...|
|95147.83000000|btcusdt|2024-12-29T11:42:...|
|95147.63000000|btcusdt|2024-12-29T11:42:...|
|95147.87000000|btcusdt|2024-12-29T11:42:...|
|95147.89000000|btcusdt|2024-12-29T11:42:...|
|95147.86000000|btcusdt|2024-12-29T11:42:...|
|95147.82000000|btcusdt|2024-12-29T11:42:...|
|95147.87000000|btcusdt|2024-12-29T11:43:...|
|95147.89000000|btcusdt|2024-12-29T11:43:...|
|95147.92000000|btcusdt|2024-12-29T11:43:...|
|95148.75000000|btcusdt|2024-12-29T11:43:...|
|95148.75000000|btcusdt|2024-12-29

In [12]:
training_df.printSchema()

root
 |-- price: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- time: string (nullable = true)



In [13]:
training_df.columns

['price', 'symbol', 'time']

In [38]:
from pyspark.sql.functions import col, to_timestamp, unix_timestamp

training_df = training_df.withColumn("time", to_timestamp(col("time")))
training_df = training_df.withColumn("time_numeric", unix_timestamp(col("time")))

training_df = training_df.withColumn("price", col("price").cast("double"))

In [39]:
training_df.show(2)
training_df.printSchema()

+--------+-------+--------------------+------------+
|   price| symbol|                time|time_numeric|
+--------+-------+--------------------+------------+
|95146.52|btcusdt|2024-12-29 11:42:...|  1735472533|
|95146.52|btcusdt|2024-12-29 11:42:...|  1735472534|
+--------+-------+--------------------+------------+
only showing top 2 rows

root
 |-- price: double (nullable = true)
 |-- symbol: string (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- time_numeric: long (nullable = true)



In [40]:
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=["time_numeric"],outputCol="Independent Features")
output=featureassembler.transform(training_df)
output.show(5)
output.columns
finalized_data=output.select("Independent Features","price")


In [45]:
from pyspark.ml.regression import LinearRegression
##traintest split
train_data,test_data=finalized_data.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol="Independent Features",labelCol="price")
regressor=regressor.fit(train_data)


25/01/29 10:16:21 WARN Instrumentation: [b5239d50] regParam is zero, which might cause numerical instability and overfitting.
25/01/29 10:16:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/01/29 10:16:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [46]:
print(f"Coefficients: {regressor.coefficients}")
print(f"Intercept: {regressor.intercept}")
regressor.save("regression_model")


Coefficients: [-1.5883647098528668]
Intercept: 2756659435.7723465
