In [1]:
import sys

# Make project-local packages (e.g. `utils`) importable from this notebook.
# NOTE(review): the original hint here read "add notebooks if fails" —
# presumably meaning to point at the notebooks directory instead if the
# import of `utils` fails; confirm against the actual project layout.
PROJECT_ROOT = '/home/jovyan/work'
# Guard keeps the cell idempotent: re-running it must not stack duplicate
# entries onto sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
from pyspark.sql import SparkSession
import nbimporter
from utils.vault_scripts import read_root_token, get_secret_from_vault
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, when, lit, expr
from graphframes import GraphFrame

In [3]:
# Obtain the Spark session for this analysis (getOrCreate reuses an
# already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("ExpDataAnalysisExchangeRates")
    .getOrCreate()
)

In [4]:
# Handle to the JVM-side Hadoop Configuration; the S3A settings are written
# to it further down. Reached through the private `_jsc` Java context handle.
hadoopConf = spark.sparkContext._jsc.hadoopConfiguration()

In [5]:
# Pull the AWS key id, access key and bucket name out of Vault so that no
# credential is hardcoded in the notebook. Calls run in the listed order.
AWS_KEY_ID, AWS_ACCESS_KEY, AWS_S3_BUCKET = (
    get_secret_from_vault("aws1", "keyid"),
    get_secret_from_vault("aws2", "accesskey"),
    get_secret_from_vault("aws3", "s3bucket"),
)

In [6]:
# Configure the S3A connector with the static credentials fetched from Vault.
# Note the naming: AWS_ACCESS_KEY is what gets used as the S3A *secret* key,
# while AWS_KEY_ID is used as the S3A access key.
hadoopConf.set("fs.s3a.access.key", AWS_KEY_ID)
hadoopConf.set("fs.s3a.secret.key", AWS_ACCESS_KEY)
# The "spark.hadoop." prefix is only meaningful on SparkConf entries (Spark
# strips it when copying them into the Hadoop configuration). When writing to
# the Hadoop Configuration object directly, the property must use its bare
# name, otherwise S3A never sees it.
hadoopConf.set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

In [7]:
# Glob matching every JSON file under the ETH exchange-rate prefix of the bucket.
eth_rates_prefix = "raw/exchange_rates_hourly_usdt/ETH"
exchange_rate_eth_path = f"s3a://{AWS_S3_BUCKET}/{eth_rates_prefix}/*.json"

In [13]:
# Load every matching JSON file into one DataFrame; no schema is supplied,
# so Spark infers it from the data.
df = spark.read.format("json").load(exchange_rate_eth_path)

In [14]:
# Print the schema Spark inferred for the raw JSON documents.
df.printSchema()

root
 |-- Data: struct (nullable = true)
 |    |-- Aggregated: boolean (nullable = true)
 |    |-- Data: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- close: double (nullable = true)
 |    |    |    |-- conversionSymbol: string (nullable = true)
 |    |    |    |-- conversionType: string (nullable = true)
 |    |    |    |-- high: double (nullable = true)
 |    |    |    |-- low: double (nullable = true)
 |    |    |    |-- open: double (nullable = true)
 |    |    |    |-- time: long (nullable = true)
 |    |    |    |-- volumefrom: double (nullable = true)
 |    |    |    |-- volumeto: double (nullable = true)
 |    |-- TimeFrom: long (nullable = true)
 |    |-- TimeTo: long (nullable = true)
 |-- Message: string (nullable = true)
 |-- Response: string (nullable = true)
 |-- Type: long (nullable = true)



In [21]:
# Peek at the first raw record. Per the schema, each record embeds the whole
# Data.Data array, so the printed Row can be large.
df.take(1)



In [15]:
# Flatten the nested Data.Data array: one output row per array element,
# exposed as a single struct column named "data".
exploded_element = F.explode(F.col("Data.Data")).alias("data")
df_exploded = df.select(exploded_element)
df_exploded.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- close: double (nullable = true)
 |    |-- conversionSymbol: string (nullable = true)
 |    |-- conversionType: string (nullable = true)
 |    |-- high: double (nullable = true)
 |    |-- low: double (nullable = true)
 |    |-- open: double (nullable = true)
 |    |-- time: long (nullable = true)
 |    |-- volumefrom: double (nullable = true)
 |    |-- volumeto: double (nullable = true)



In [17]:
# Fetch the first exploded element as a Row to eyeball actual field values.
df_exploded.take(1)

[Row(data=Row(close=3146.45, conversionSymbol='', conversionType='direct', high=3165.76, low=3140.41, open=3165.07, time=1722574800, volumefrom=12500.54, volumeto=39420155.2))]

In [18]:
# Keep only the closing value and the timestamp, rendering the epoch-seconds
# `time` field as a formatted datetime string. The timestamp column is left
# un-aliased, so it keeps Spark's auto-generated name.
close_value = F.col("data.close")
event_time = F.from_unixtime(F.col("data.time"))
data_df = df_exploded.select(close_value, event_time)
data_df.show()

+-------+---------------------------------------------+
|  close|from_unixtime(data.time, yyyy-MM-dd HH:mm:ss)|
+-------+---------------------------------------------+
|3146.45|                          2024-08-02 05:00:00|
|3152.93|                          2024-08-02 06:00:00|
|3144.83|                          2024-08-02 07:00:00|
|3157.08|                          2024-08-02 08:00:00|
|3150.33|                          2024-08-02 09:00:00|
|3156.37|                          2024-08-02 10:00:00|
|3151.13|                          2024-08-02 11:00:00|
|3151.17|                          2024-08-02 12:00:00|
|3158.42|                          2024-08-02 13:00:00|
|3035.85|                          2024-08-02 14:00:00|
|3036.51|                          2024-08-02 15:00:00|
|3024.84|                          2024-08-02 16:00:00|
|3022.94|                          2024-08-02 17:00:00|
|3012.12|                          2024-08-02 18:00:00|
|3008.36|                          2024-08-02 19

In [19]:
# Time range covered by the data set: earliest and latest `time` values,
# each rendered as a formatted datetime string.
earliest_event = F.from_unixtime(F.min("data.time")).alias("min_event_timestamp")
latest_event = F.from_unixtime(F.max("data.time")).alias("max_event_timestamp")
min_max_timestamps = df_exploded.agg(earliest_event, latest_event)
min_max_timestamps.show()

+-------------------+-------------------+
|min_event_timestamp|max_event_timestamp|
+-------------------+-------------------+
|2024-05-10 21:00:00|2024-10-24 13:00:00|
+-------------------+-------------------+

