#### Bronze Layer

In [2]:
from pyspark.sql.functions import col, explode, arrays_zip, to_date, map_keys, current_timestamp
from pyspark.sql.types import TimestampType
import json
from pyspark.sql.functions import map_from_entries, struct, lit
from pyspark.sql.functions import to_timestamp, from_unixtime
from pyspark.sql import DataFrame
from functools import reduce

# Paths: the news JSON file (relative to the default lakehouse) and the
# OneLake "Files" directory that is listed further below.
news_path = "Files/news_data.json"
data_path = "abfss://FinanceProject@onelake.dfs.fabric.microsoft.com/finance_lakehouse.Lakehouse/Files"

# Load the news JSON document into a DataFrame (multi-line JSON).
df = spark.read.option("multiline", "true").json(news_path)

# (source field, output column) pairs for the attributes kept as strings.
news_string_fields = [
    ("article.source.name", "name"),
    ("article.url", "url"),
    ("article.urlToImage", "urlToImage"),
    ("article.content", "content"),
    ("article.title", "title"),
]

# Explode the "articles" array into one row per article, project the
# selected attributes, and stamp every row with the ingestion time.
df_news = (
    df.select(explode(col("articles")).alias("article"))
    .select(
        *[
            col(source_field).cast("string").alias(column_name)
            for source_field, column_name in news_string_fields
        ],
        col("article.publishedAt").cast(TimestampType()).alias("PublishDate"),
    )
    .withColumn("ingestion_time", current_timestamp())
)

# Replace the bronze news table with the freshly loaded data.
df_news.write.mode("overwrite").saveAsTable("news_data_bronze")

# List the entries of the lakehouse "Files" directory.
files = mssparkutils.fs.ls(data_path)

# Historical data: keep every entry whose lower-cased name contains
# "history_data".
relevant_files = [
    entry.path
    for entry in files
    if "history_data" in entry.name.lower()
]

# (source key, output column) pairs read from each time-series entry,
# in the column order of the resulting table.
history_value_fields = [
    ("2. high", "high"),
    ("3. low", "low"),
    ("5. volume", "volume"),
    ("1. open", "open"),
    ("4. close", "close"),
]

for file_path in relevant_files:
    # Best effort per file: a failure is printed and the loop moves on to
    # the next file.
    try:
        # Read the (multi-line) JSON file.
        df_raw = spark.read.option("multiline", "true").json(file_path)

        # Symbol and interval from the "Meta Data" struct.
        meta_col = col("Meta Data")
        df_meta = df_raw.select(
            meta_col.getItem("2. Symbol").cast("string").alias("symbol"),
            meta_col.getItem("4. Interval").cast("string").alias("interval"),
        )

        # The ticker is the part of the file name before the first "_".
        file_name = file_path.rsplit("/", 1)[-1]
        file_name_without_extension = file_name.replace(".json", "")
        ticker = file_name_without_extension.partition("_")[0]

        # Isolate the "Time Series (5min)" column under a simpler name.
        df_time = df_raw.select("`Time Series (5min)`").withColumnRenamed(
            "Time Series (5min)", "time_struct"
        )

        # Turn the struct into (field name, value) pairs via the RDD API and
        # build a two-column DataFrame from those pairs.
        time_map = df_time.rdd.flatMap(
            lambda row: row["time_struct"].asDict().items()
        )
        df_temp = spark.createDataFrame(time_map, ["timestamp", "data"])

        # Pull the individual values out of the "data" column as doubles.
        df_data = df_temp.select(
            col("timestamp"),
            *[
                col("data").getItem(source_key).cast("double").alias(column_name)
                for source_key, column_name in history_value_fields
            ],
        )

        # Attach the metadata columns to every row via a cross join.
        df_final = df_data.crossJoin(df_meta)

        # Table name derived from the ticker (reuses the file_name variable).
        file_name = f"{ticker}_historic_data_bronze"

        df_final = df_final.withColumn("ingestion_time", current_timestamp())

        # Replace the per-ticker bronze table.
        df_final.write.mode("overwrite").saveAsTable(file_name)

        print(f"Daten in Tabelle {file_name} geschrieben.")

    except Exception as e:
        print(f"Fehler bei der Verarbeitung von {file_path}: {str(e)}")

# Real-time data: keep every listed entry whose lower-cased name contains
# "realtime_data".
relevant_files = [file.path for file in files if "realtime_data" in file.name.lower()]

for file_path in relevant_files:
    # Best effort per file: a failure is printed and the loop moves on to
    # the next file.
    try:
        # Read the (multi-line) JSON file.
        df_raw = spark.read.option("multiline", "true").json(file_path)

        # Project the quote fields under descriptive column names.
        df_data = df_raw.select(
            col("c").cast("double").alias("current_price"),
            col("d").cast("double").alias("change"),
            col("dp").cast("double").alias("percentage_change"),
            col("h").cast("double").alias("high"),
            col("l").cast("double").alias("low"),
            col("o").cast("double").alias("open"),
            col("pc").cast("double").alias("previous_close"),
            # Cast the epoch-seconds value straight to a timestamp. The
            # previous to_timestamp(from_unixtime(...)) round-tripped through
            # a local-time string in the session time zone, which is
            # ambiguous during a DST fall-back hour and could shift the
            # stored instant.
            col("t").cast("bigint").cast("timestamp").alias("timestamp"),
        )

        # The ticker is the part of the file name before the first "_".
        file_name = file_path.split("/")[-1]
        file_name_without_extension = file_name.replace(".json", "")
        ticker = file_name_without_extension.split("_")[0]

        # Table name derived from the ticker.
        table_name = f"{ticker}_realtime_data_bronze"

        df_data = df_data.withColumn("ingestion_time", current_timestamp())

        # Replace the per-ticker bronze table.
        df_data.write.mode("overwrite").saveAsTable(table_name)

        print(f"Daten in Tabelle {table_name} geschrieben.")

    except Exception as e:
        print(f"Fehler bei der Verarbeitung von {file_path}: {str(e)}")

StatementMeta(, 499df2cb-265e-46d9-b814-ac16f123eefd, 4, Finished, Available, Finished)

Daten in Tabelle QQQ_historic_data_bronze geschrieben.
Daten in Tabelle SPY_historic_data_bronze geschrieben.
Daten in Tabelle QQQ_realtime_data_bronze geschrieben.
Daten in Tabelle SPY_realtime_data_bronze geschrieben.
