In [1]:
from pyspark.sql import SparkSession 


In [2]:
spark = SparkSession.builder.getOrCreate()

In [14]:
bitcoinData = spark.read.option('inferSchema','true'
                  ).option('header','true'
                  ).csv('./data/archive/btcusd_1-min_data.csv')

In [19]:
print(bitcoinData.schema.fields)

[StructField('Timestamp', DoubleType(), True), StructField('Open', DoubleType(), True), StructField('High', DoubleType(), True), StructField('Low', DoubleType(), True), StructField('Close', DoubleType(), True), StructField('Volume', DoubleType(), True)]


In [22]:
bitcoinData.createOrReplaceTempView('bitcoin')

bitcoinData2 = spark.sql('''
          select
             max(Open) as Open,
             max(High) as High,
             min(Low)  as Low,
             max(Close) as Close,
             max(Volume) as Volume,
            max(to_date(FROM_UNIXTIME(Timestamp))) as data
            from bitcoin
            group by 
              to_date(FROM_UNIXTIME(Timestamp))
            
          
          ''')

In [23]:
bitcoinData2.show()

+------+------+------+------+-------------+----------+
|  Open|  High|   Low| Close|       Volume|      data|
+------+------+------+------+-------------+----------+
|  4.99|  4.99|  4.95|  4.99|         59.0|2012-04-17|
|  12.6|  12.6| 12.06|  12.6|        221.3|2012-10-06|
|  17.6|  17.6| 16.45|  17.6| 277.24326457|2013-01-22|
|  86.0|  86.0| 74.06|  86.0| 324.97025499|2013-03-26|
|118.56|118.56|116.55|118.56| 241.04896851|2013-05-21|
| 124.0|124.13| 115.0| 124.0|1080.14110169|2013-09-09|
|   8.8|   8.8|   7.7|   8.8|      369.759|2012-07-17|
|127.69|127.95|123.02|127.95| 705.41472463|2013-09-19|
| 10.85| 10.85| 10.64| 10.85|        112.0|2012-11-11|
| 20.66| 20.66|  19.5| 20.66| 992.03480753|2013-02-02|
|  5.78|  5.78|   5.5|  5.78|         34.0|2012-02-12|
| 10.19| 10.19|  9.78| 10.19| 212.22207719|2012-08-23|
| 12.39| 12.39| 12.01| 12.39| 176.51252367|2012-09-21|
|  4.91|  4.91|  4.76|  4.91|  42.42713318|2012-03-30|
|  7.59|  7.59|  7.44|  7.59| 224.21827128|2012-07-14|
|  20.7|  

In [25]:
from pyspark.sql.functions import date_format, col 

preppedData = bitcoinData2.na.fill(0).withColumn('day_of_week', date_format(col('data'), 'EEEE')).coalesce(5)

In [26]:
preppedData.show()

+------+------+------+------+-------------+----------+-----------+
|  Open|  High|   Low| Close|       Volume|      data|day_of_week|
+------+------+------+------+-------------+----------+-----------+
|  4.99|  4.99|  4.95|  4.99|         59.0|2012-04-17|    Tuesday|
|  12.6|  12.6| 12.06|  12.6|        221.3|2012-10-06|   Saturday|
|  17.6|  17.6| 16.45|  17.6| 277.24326457|2013-01-22|    Tuesday|
|  86.0|  86.0| 74.06|  86.0| 324.97025499|2013-03-26|    Tuesday|
|118.56|118.56|116.55|118.56| 241.04896851|2013-05-21|    Tuesday|
| 124.0|124.13| 115.0| 124.0|1080.14110169|2013-09-09|     Monday|
|   8.8|   8.8|   7.7|   8.8|      369.759|2012-07-17|    Tuesday|
|127.69|127.95|123.02|127.95| 705.41472463|2013-09-19|   Thursday|
| 10.85| 10.85| 10.64| 10.85|        112.0|2012-11-11|     Sunday|
| 20.66| 20.66|  19.5| 20.66| 992.03480753|2013-02-02|   Saturday|
|  5.78|  5.78|   5.5|  5.78|         34.0|2012-02-12|     Sunday|
| 10.19| 10.19|  9.78| 10.19| 212.22207719|2012-08-23|   Thurs

In [28]:
from pyspark.sql.functions import max as sparkMax, min as sparkMin 
result = preppedData.agg(sparkMax('data').alias('MaxDate'), sparkMin('data').alias('MinData'))
result.show()

+----------+----------+
|   MaxDate|   MinData|
+----------+----------+
|2025-03-08|2012-01-01|
+----------+----------+

