#### Sentiment berechnen

In [None]:
from pyspark.sql.functions import pandas_udf, col, concat_ws
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd

# Load the FinBERT tone checkpoint: tokenizer and sequence-classification model
# are both created here at module level and reused by the scoring function.
FINBERT_CHECKPOINT = "yiyanghkust/finbert-tone"
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

def finbert_sentiment(text_series: pd.Series) -> pd.DataFrame:
    """Classify every text in *text_series* with the module-level FinBERT model.

    Args:
        text_series: Texts to score. Missing values are treated as empty
            strings; every entry is converted to ``str`` first.

    Returns:
        A DataFrame with one row per input text and two columns:
        ``sentiment`` (lower-cased class name taken from the model config)
        and ``confidence`` (the highest softmax probability as a float).
        Blank texts are reported as ``"neutral"`` with confidence ``0.0``
        without calling the model.
    """
    sentiments = []
    confidences = []
    # Normalise the input: None/NaN become "" and everything is a string.
    text_series = text_series.fillna("").astype(str)

    for text in text_series:
        if not text.strip():
            # Nothing to classify for empty / whitespace-only text.
            sentiments.append("neutral")
            confidences.append(0.0)
            continue

        # Inputs longer than 512 tokens are truncated by the tokenizer.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        scores = torch.softmax(outputs.logits, dim=1).numpy()[0]
        best_index = int(scores.argmax())

        # Resolve the class name through the checkpoint's own id2label mapping
        # instead of a hard-coded list: the finbert-tone checkpoint orders its
        # classes Neutral / Positive / Negative, so a fixed
        # ["negative", "neutral", "positive"] list mislabels the predictions.
        sentiments.append(str(model.config.id2label[best_index]).lower())
        confidences.append(float(scores[best_index]))

    return pd.DataFrame({"sentiment": sentiments, "confidence": confidences})

# Return schema of the pandas UDF: a struct with a nullable string label and a
# nullable float confidence.
schema = (
    StructType()
    .add("sentiment", StringType(), True)
    .add("confidence", FloatType(), True)
)

# Pandas UDF wrapper around finbert_sentiment
@pandas_udf(returnType=schema)
def compute_sentiment(text_series: pd.Series) -> pd.DataFrame:
    """Score a batch of texts and return the sentiment/confidence struct columns."""
    scored = finbert_sentiment(text_series)
    return scored

# Read the news file in multiline JSON mode
df = spark.read.option("multiline", "true").json("Files/news_data.json")

# Flatten the nested "articles" array: one row per article, keeping only
# title and description
exploded_articles = df.selectExpr("explode(articles) as article")
df_articles = exploded_articles.select("article.title", "article.description")

# Build the text to classify from title and description, separated by a space
# (concat_ws skips NULL inputs, so a missing field does not null the result)
title_text = col("title").cast("string")
description_text = col("description").cast("string")
df_articles = df_articles.withColumn("text", concat_ws(" ", title_text, description_text))

# Run FinBERT on the combined text and unpack the returned struct
scored_articles = df_articles.withColumn("sentiment_result", compute_sentiment("text"))
df_with_sentiment = scored_articles.select(
    "title",
    "description",
    "sentiment_result.sentiment",
    "sentiment_result.confidence",
)

# Persist the result, replacing any previous version of the table
result_writer = df_with_sentiment.write.mode("overwrite")
result_writer.saveAsTable("news_with_sentiment_gold")

#### Prognosen der Kurse berechnen


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, avg, when, lead, last
from pyspark.sql.window import Window
from pyspark.sql.types import StructType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Reuse the active Spark session, or create one if none exists
spark = SparkSession.builder.getOrCreate()

# `symbols` must be defined before this cell runs as a comma-separated string.
# Surrounding whitespace and empty entries are dropped so that "A, B" does not
# yield a table name with a leading space further below.
symbols_array = [part.strip() for part in symbols.split(",") if part.strip()]

# Load the scored news articles
df_sentiment = spark.read.table("finance_lakehouse.dbo.news_with_sentiment_gold")

# Step 1: overall market sentiment
# Map the label to a numeric score: positive -> 1, negative -> -1, anything else -> 0
df_sentiment = df_sentiment.withColumn(
    "sentiment_score",
    when(col("sentiment") == "positive", 1).when(col("sentiment") == "negative", -1).otherwise(0)
)

# Weight the score with the model confidence
df_sentiment = df_sentiment.withColumn(
    "weighted_sentiment_score",
    col("sentiment_score") * col("confidence")
)

# Average weighted score over all articles
overall_market_sentiment = df_sentiment.select(
    avg("weighted_sentiment_score").alias("overall_market_sentiment")
).collect()[0]["overall_market_sentiment"]

# avg() returns NULL when the table is empty or every confidence is NULL;
# fall back to a neutral 0.0 so the comparisons below (and later arithmetic)
# do not fail on None.
if overall_market_sentiment is None:
    overall_market_sentiment = 0.0

# Interpret the sign of the score
if overall_market_sentiment > 0:
    market_mood = "positive"
elif overall_market_sentiment < 0:
    market_mood = "negative"
else:
    market_mood = "neutral"

print(f"Allgemeine Marktstimmung: {market_mood} (Score: {overall_market_sentiment})")

# One forecast DataFrame per symbol (first round, financial data only)
first_predictions = []

# Step 2: first forecast (financial data only)
for symbol in symbols_array:
    print(f"Erste Prognose für Symbol: {symbol}")

    # Load the financial data of the current symbol
    df_finance = spark.read.table(f"finance_lakehouse.dbo.{symbol}_data_silver")

    # Truncate each timestamp to the start of its hour (integer division on
    # epoch seconds), then convert back to a timestamp
    df_finance = df_finance.withColumn(
        "timestamp_1h",
        (col("timestamp").cast("long") / 3600).cast("long") * 3600
    ).withColumn(
        "timestamp_1h",
        col("timestamp_1h").cast("timestamp")
    )

    # Aggregate to one row per symbol and hour.
    # NOTE(review): last() without an explicit ordering depends on row order
    # inside the group — confirm this really yields the hour's closing value.
    df_finance_agg = df_finance.groupBy("symbol", "timestamp_1h").agg(
        last("close").alias("close"),
        avg("volume").alias("volume"),
        avg("percentage_change").alias("percentage_change")
    ).orderBy("timestamp_1h")

    # Feature engineering over the hourly series of the symbol
    window_spec = Window.partitionBy("symbol").orderBy("timestamp_1h")

    # Lagged close/volume of the previous 1..5 hours.
    # The resulting column order is close_lag_1, volume_lag_1, close_lag_2, ...
    for i in range(1, 6):
        df_finance_agg = df_finance_agg.withColumn(
            f"close_lag_{i}",
            lag(col("close"), i).over(window_spec)
        ).withColumn(
            f"volume_lag_{i}",
            lag(col("volume"), i).over(window_spec)
        )

    # Moving average of close over the current and the four preceding rows
    df_finance_agg = df_finance_agg.withColumn(
        "close_ma_5",
        avg(col("close")).over(window_spec.rowsBetween(-4, 0))
    )

    # Target: close of the next hourly row
    df_finance_agg = df_finance_agg.withColumn(
        "future_close_1h",
        lead(col("close"), 1).over(window_spec)
    )

    # Replace missing feature values by 0 (the target column is left untouched)
    feature_columns = [
        "close", "volume", "percentage_change",
        "close_lag_1", "close_lag_2", "close_lag_3", "close_lag_4", "close_lag_5",
        "volume_lag_1", "volume_lag_2", "volume_lag_3", "volume_lag_4", "volume_lag_5",
        "close_ma_5"
    ]
    df_finance_agg = df_finance_agg.fillna(0, subset=feature_columns)

    # Assemble the feature vector; the same assembler is reused in the forecast loop
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    df_features = assembler.transform(df_finance_agg)

    # Training rows: only rows with a known target
    df_features_for_training = df_features.filter(col("future_close_1h").isNotNull())

    # 80/20 train-test split.
    # NOTE(review): the split is random, not chronological, so test rows can
    # precede training rows — the RMSE below is presumably optimistic.
    train_data, test_data = df_features_for_training.randomSplit([0.8, 0.2], seed=42)

    # Gradient-boosted trees regressor (Spark ML GBTRegressor)
    gbt = GBTRegressor(
        featuresCol="features",
        labelCol="future_close_1h",
        maxIter=50,
        maxDepth=5,
        seed=42
    )

    # Train and evaluate on the held-out rows
    model = gbt.fit(train_data)
    predictions = model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="future_close_1h", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print(f"RMSE für {symbol} (Erste Prognose): {rmse}")

    # Starting point of the forecast: the most recent row without a known target
    latest_data = df_features.filter(col("future_close_1h").isNull()).orderBy("timestamp_1h", ascending=False).limit(1)

    # Collect once; this serves both the emptiness check and the timestamp lookup
    latest_rows = latest_data.collect()
    if not latest_rows:
        raise ValueError(f"Keine Daten mit future_close_1h = null für Symbol {symbol} gefunden. Überprüfe die Daten oder die Logik.")

    latest_timestamp = latest_rows[0]["timestamp_1h"]

    # Schema of a synthetic row (everything except the assembled vector). Values
    # are placed by column NAME in this order: the lag columns are interleaved
    # (close_lag_1, volume_lag_1, ...), so a hand-written positional list that
    # groups all close lags before all volume lags would scramble the features.
    new_row_schema = StructType([field for field in df_features.schema if field.name != "features"])
    column_order = [field.name for field in new_row_schema]

    # Iteratively forecast the next 168 hours, feeding each prediction back in
    current_data = latest_data
    forecast_rows = []

    for i in range(168):
        # One Spark job per step: features of the current row plus its prediction
        current_row = model.transform(current_data).collect()[0]
        predicted_close = current_row["prediction"]
        new_timestamp = latest_timestamp + timedelta(hours=i + 1)

        # Roll the window forward by one hour: the predicted close becomes the
        # new "close", the current close becomes lag 1, lag k-1 becomes lag k.
        # Future volume and percentage change are unknown and stay at 0.0.
        next_values = {
            "symbol": symbol,
            "timestamp_1h": new_timestamp,
            "close": predicted_close,
            "volume": 0.0,
            "percentage_change": 0.0,
            "close_lag_1": current_row["close"],
            "volume_lag_1": current_row["volume"],
            # Average over the new close and the four closes before it
            "close_ma_5": (
                predicted_close
                + current_row["close"]
                + current_row["close_lag_1"]
                + current_row["close_lag_2"]
                + current_row["close_lag_3"]
            ) / 5,
            "future_close_1h": None,
        }
        for k in range(2, 6):
            next_values[f"close_lag_{k}"] = current_row[f"close_lag_{k - 1}"]
            next_values[f"volume_lag_{k}"] = current_row[f"volume_lag_{k - 1}"]

        new_row = spark.createDataFrame(
            [[next_values[name] for name in column_order]],
            schema=new_row_schema
        )

        # Add the feature vector; this row is the input of the next iteration
        current_data = assembler.transform(new_row)

        # Remember the forecast for this hour
        forecast_rows.append((symbol, new_timestamp, predicted_close))

    # Build a single DataFrame for the symbol instead of unioning 168 one-row frames
    symbol_predictions = spark.createDataFrame(
        forecast_rows,
        schema=["symbol", "timestamp_1h", "predicted_future_close"]
    )

    first_predictions.append(symbol_predictions)

# Stack the per-symbol forecasts of the first round into one DataFrame
first_predictions_df = first_predictions[0]
for symbol_forecast in first_predictions[1:]:
    first_predictions_df = first_predictions_df.union(symbol_forecast)

# Show the first rows as a sanity check
print("Inhalt von first_predictions_df:")
first_predictions_df.show(5)

# Step 3: final forecast (adds the overall market sentiment as a feature)
from pyspark.sql.functions import lit

print("Finale Prognose mit allgemeiner Marktstimmung...")

# One final forecast DataFrame per symbol
final_predictions = []

for symbol in symbols_array:
    print(f"Finale Prognose für Symbol: {symbol}")

    # Load the financial data of the current symbol
    df_finance = spark.read.table(f"finance_lakehouse.dbo.{symbol}_data_silver")

    # Truncate each timestamp to the start of its hour (integer division on
    # epoch seconds), then convert back to a timestamp
    df_finance = df_finance.withColumn(
        "timestamp_1h",
        (col("timestamp").cast("long") / 3600).cast("long") * 3600
    ).withColumn(
        "timestamp_1h",
        col("timestamp_1h").cast("timestamp")
    )

    # Aggregate to one row per symbol and hour.
    # NOTE(review): last() without an explicit ordering depends on row order
    # inside the group — confirm this really yields the hour's closing value.
    df_finance_agg = df_finance.groupBy("symbol", "timestamp_1h").agg(
        last("close").alias("close"),
        avg("volume").alias("volume"),
        avg("percentage_change").alias("percentage_change")
    ).orderBy("timestamp_1h")

    # Attach the first-round forecast of the same symbol and hour.
    # NOTE(review): first_predictions_df only holds hours after the last
    # observed one, so this left join presumably matches no historical row and
    # predicted_future_close ends up as 0 after fillna — confirm the intent.
    df_combined = df_finance_agg.join(
        first_predictions_df,
        ["symbol", "timestamp_1h"],
        "left"
    )

    # Overall market sentiment as a constant feature column
    df_combined = df_combined.withColumn(
        "overall_market_sentiment",
        lit(overall_market_sentiment)
    )

    # Feature engineering over the hourly series of the symbol
    window_spec = Window.partitionBy("symbol").orderBy("timestamp_1h")

    # Lagged close/volume of the previous 1..5 hours.
    # The resulting column order is close_lag_1, volume_lag_1, close_lag_2, ...
    for i in range(1, 6):
        df_combined = df_combined.withColumn(
            f"close_lag_{i}",
            lag(col("close"), i).over(window_spec)
        ).withColumn(
            f"volume_lag_{i}",
            lag(col("volume"), i).over(window_spec)
        )

    # Moving average of close over the current and the four preceding rows
    df_combined = df_combined.withColumn(
        "close_ma_5",
        avg(col("close")).over(window_spec.rowsBetween(-4, 0))
    )

    # Target: close of the next hourly row
    df_combined = df_combined.withColumn(
        "future_close_1h",
        lead(col("close"), 1).over(window_spec)
    )

    # Replace missing feature values by 0 (the target column is left untouched)
    feature_columns = [
        "close", "volume", "percentage_change",
        "close_lag_1", "close_lag_2", "close_lag_3", "close_lag_4", "close_lag_5",
        "volume_lag_1", "volume_lag_2", "volume_lag_3", "volume_lag_4", "volume_lag_5",
        "close_ma_5",
        "predicted_future_close",
        "overall_market_sentiment"
    ]
    df_combined = df_combined.fillna(0, subset=feature_columns)

    # Assemble the feature vector; the same assembler is reused in the forecast loop
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    df_features = assembler.transform(df_combined)

    # Training rows: only rows with a known target
    df_features_for_training = df_features.filter(col("future_close_1h").isNotNull())

    # 80/20 train-test split.
    # NOTE(review): the split is random, not chronological, so test rows can
    # precede training rows — the RMSE below is presumably optimistic.
    train_data, test_data = df_features_for_training.randomSplit([0.8, 0.2], seed=42)

    # Gradient-boosted trees regressor (Spark ML GBTRegressor)
    gbt = GBTRegressor(
        featuresCol="features",
        labelCol="future_close_1h",
        maxIter=50,
        maxDepth=5,
        seed=42
    )

    # Train and evaluate on the held-out rows
    model = gbt.fit(train_data)
    predictions = model.transform(test_data)
    evaluator = RegressionEvaluator(labelCol="future_close_1h", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print(f"RMSE für {symbol} (Finale Prognose): {rmse}")

    # Starting point of the forecast: the most recent row without a known target
    latest_data = df_features.filter(col("future_close_1h").isNull()).orderBy("timestamp_1h", ascending=False).limit(1)

    # Collect once; this serves both the emptiness check and the timestamp lookup
    latest_rows = latest_data.collect()
    if not latest_rows:
        raise ValueError(f"Keine Daten mit future_close_1h = null für Symbol {symbol} gefunden. Überprüfe die Daten oder die Logik.")

    # Show the starting row for inspection
    print(f"Inhalt von latest_data für Symbol {symbol} in Schritt 3:")
    latest_data.show()

    latest_timestamp = latest_rows[0]["timestamp_1h"]

    # Schema of a synthetic row (everything except the assembled vector). Values
    # are placed by column NAME in this order: the lag columns are interleaved
    # (close_lag_1, volume_lag_1, ...), so a hand-written positional list that
    # groups all close lags before all volume lags would scramble the features.
    new_row_schema = StructType([field for field in df_features.schema if field.name != "features"])
    column_order = [field.name for field in new_row_schema]

    # Iteratively forecast the next 168 hours, feeding each prediction back in
    current_data = latest_data
    forecast_rows = []

    for i in range(168):
        # One Spark job per step: features of the current row plus its prediction
        current_row = model.transform(current_data).collect()[0]
        predicted_close = current_row["prediction"]
        close = current_row["close"]
        predicted_future_close = current_row["predicted_future_close"]
        new_timestamp = latest_timestamp + timedelta(hours=i + 1)

        # Roll the window forward by one hour: the predicted close becomes the
        # new "close", the current close becomes lag 1, lag k-1 becomes lag k.
        # Future volume and percentage change are unknown and stay at 0.0; the
        # first-round feature and the market sentiment are carried over.
        next_values = {
            "symbol": symbol,
            "timestamp_1h": new_timestamp,
            "close": predicted_close,
            "volume": 0.0,
            "percentage_change": 0.0,
            "predicted_future_close": predicted_future_close,
            "overall_market_sentiment": overall_market_sentiment,
            "close_lag_1": close,
            "volume_lag_1": current_row["volume"],
            # Average over the new close and the four closes before it
            "close_ma_5": (
                predicted_close
                + close
                + current_row["close_lag_1"]
                + current_row["close_lag_2"]
                + current_row["close_lag_3"]
            ) / 5,
            "future_close_1h": None,
        }
        for k in range(2, 6):
            next_values[f"close_lag_{k}"] = current_row[f"close_lag_{k - 1}"]
            next_values[f"volume_lag_{k}"] = current_row[f"volume_lag_{k - 1}"]

        new_row = spark.createDataFrame(
            [[next_values[name] for name in column_order]],
            schema=new_row_schema
        )

        # Add the feature vector; this row is the input of the next iteration
        current_data = assembler.transform(new_row)

        # Remember the forecast for this hour together with the close and the
        # first-round feature value of the row it was predicted from
        forecast_rows.append((
            symbol,
            new_timestamp,
            close,
            predicted_future_close,
            predicted_close
        ))

    # Build a single DataFrame for the symbol instead of unioning 168 one-row frames
    symbol_predictions = spark.createDataFrame(
        forecast_rows,
        schema=["symbol", "timestamp_1h", "close", "first_predicted_future_close", "final_predicted_future_close"]
    )

    final_predictions.append(symbol_predictions)

# Stack the per-symbol final forecasts into one DataFrame
final_predictions_df = final_predictions[0]
for symbol_forecast in final_predictions[1:]:
    final_predictions_df = final_predictions_df.union(symbol_forecast)

# Persist to the gold layer, replacing any previous version of the table
gold_writer = final_predictions_df.write.mode("overwrite")
gold_writer.saveAsTable("finance_lakehouse.dbo.stock_predictions_gold")

# Display the first 20 rows of the result
final_predictions_df.show(20)