In [12]:
import pandas as pd
import numpy as np

# List of files to read
files = {
    "bitcoin": "bitcoin.csv",
    "gold": "gold.csv",
    "sp500": "sp500.csv",
    "treasury_3m": "treasury_3m.csv",
    "treasury_10y": "treasury_10y.csv",
    "google_trends": "google_trends.csv"
}

# Read CSV files
data = {}
for key, file in files.items():
    data[key] = pd.read_csv('data/' + file, parse_dates=["timestamp"], index_col="timestamp")

# Merge data on timestamp
df = data["sp500"][["Close"]].rename(columns={"Close": "sp500_Close"})
for key in ["bitcoin", "gold"]:
    df = df.join(data[key][["Close"]].rename(columns={"Close": f"{key}_Close"}), how="left")

for key in ["treasury_3m", "treasury_10y"]:
    df = df.join(data[key], how="left")

# Rename Google Trends columns with prefix Google_
if "google_trends" in data:
    data["google_trends"] = data["google_trends"].rename(columns={col: f"Google_{col}" for col in data["google_trends"].columns})
    df = df.join(data["google_trends"], how="left")

# Drop rows with missing values
df.dropna(inplace=True)

# Calculate correlation of S&P 500 Close with other assets
correlations = df.corr().loc["sp500_Close"]
print("Correlation with S&P 500 Close:")
print(correlations)

# Lagging S&P 500 Close prices
lags = [7, 30, 180]
lagged_correlations = {}

for lag in lags:
    df[f"sp500_Close_lag{lag}"] = df["sp500_Close"].shift(lag)
    lagged_df = df.dropna()  # Drop NaNs introduced by shifting
    lagged_correlations[lag] = lagged_df.corr().loc[f"sp500_Close_lag{lag}"]

print("\nLagged Correlations:")
for lag, corr in lagged_correlations.items():
    print(f"\nCorrelation with S&P 500 Close lagged {lag} days:")
    print(corr)

Correlation with S&P 500 Close:
sp500_Close          1.000000
bitcoin_Close        0.926377
gold_Close           0.046201
Google_sp500         0.209029
Google_SPX          -0.282475
Google_index fund    0.541297
Google_ETF           0.522086
Name: sp500_Close, dtype: float64

Lagged Correlations:

Correlation with S&P 500 Close lagged 7 days:
sp500_Close          0.974138
bitcoin_Close        0.923256
gold_Close           0.061790
Google_sp500         0.296010
Google_SPX          -0.207706
Google_index fund    0.579937
Google_ETF           0.536218
sp500_Close_lag7     1.000000
Name: sp500_Close_lag7, dtype: float64

Correlation with S&P 500 Close lagged 30 days:
sp500_Close          0.902892
bitcoin_Close        0.895362
gold_Close           0.126349
Google_sp500         0.404065
Google_SPX          -0.129503
Google_index fund    0.583891
Google_ETF           0.502267
sp500_Close_lag7     0.919040
sp500_Close_lag30    1.000000
Name: sp500_Close_lag30, dtype: float64

Correlation with 

  correlations = df.corr().loc["sp500_Close"]
  lagged_correlations[lag] = lagged_df.corr().loc[f"sp500_Close_lag{lag}"]
  lagged_correlations[lag] = lagged_df.corr().loc[f"sp500_Close_lag{lag}"]
  lagged_correlations[lag] = lagged_df.corr().loc[f"sp500_Close_lag{lag}"]


In [15]:
import pandas as pd
import numpy as np
from transformers import pipeline
import os

# List of files to read
files = {
    "bitcoin": "bitcoin.csv",
    "gold": "gold.csv",
    "sp500": "sp500.csv",
    "treasury_3m": "treasury_3m.csv",
    "treasury_10y": "treasury_10y.csv",
    "google_trends": "google_trends.csv",
    "finbert_sentiment": "finbert_sentiment.csv"
}

data_folder = "data"
os.makedirs(data_folder, exist_ok=True)

# Read CSV files
data = {}
for key, file in files.items():
    file_path = os.path.join(data_folder, file)
    if os.path.exists(file_path):
        data[key] = pd.read_csv(file_path, parse_dates=["timestamp"], index_col="timestamp")
    else:
        print(f"Warning: {file} not found. Skipping...")

# Fetch FinBERT sentiment data if missing
if "finbert_sentiment" not in data:
    print("Generating FinBERT sentiment data...")
    classifier = pipeline("sentiment-analysis", model="ProsusAI/finbert")
    
    # Example news data (replace with actual financial news dataset)
    news_data = pd.DataFrame({
        "timestamp": pd.date_range(start="2022-01-01", periods=30, freq='D'),
        "headline": ["Stock market rises on positive earnings" for _ in range(30)]
    })
    
    news_data["sentiment_score"] = news_data["headline"].apply(lambda x: classifier(x)[0]["score"] * (1 if classifier(x)[0]["label"] == "positive" else -1))
    news_data = news_data.drop(columns=["headline"])
    news_data.set_index("timestamp", inplace=True)
    news_data.to_csv(os.path.join(data_folder, "finbert_sentiment.csv"))
    data["finbert_sentiment"] = news_data

# Merge data on timestamp
df = data["sp500"][["Close"]].rename(columns={"Close": "sp500_Close"})
for key in ["bitcoin", "gold"]:
    df = df.join(data[key][["Close"]].rename(columns={"Close": f"{key}_Close"}), how="left")

for key in ["treasury_3m", "treasury_10y"]:
    df = df.join(data[key], how="left")

# Rename Google Trends columns with prefix Google_
if "google_trends" in data:
    data["google_trends"] = data["google_trends"].rename(columns={col: f"Google_{col}" for col in data["google_trends"].columns})
    df = df.join(data["google_trends"], how="left")

# Add FinBERT sentiment data
df = df.join(data["finbert_sentiment"], how="left")

# Drop rows with missing values
df.dropna(inplace=True)

# Calculate correlation of S&P 500 Close with other assets
correlations = df.corr().loc["sp500_Close"]
print("Correlation with S&P 500 Close:")
print(correlations)

# Lagging S&P 500 Close prices
lags = [7, 30, 180]
lagged_correlations = {}

for lag in lags:
    df[f"sp500_Close_lag{lag}"] = df["sp500_Close"].shift(lag)
    lagged_df = df.dropna()  # Drop NaNs introduced by shifting
    lagged_correlations[lag] = lagged_df.corr().loc[f"sp500_Close_lag{lag}"]

print("\nLagged Correlations:")
for lag, corr in lagged_correlations.items():
    print(f"\nCorrelation with S&P 500 Close lagged {lag} days:")
    print(corr)


Generating FinBERT sentiment data...


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use mps:0


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Correlation with S&P 500 Close:
sp500_Close          1.000000
bitcoin_Close        0.956150
gold_Close          -0.456099
Google_sp500        -0.916161
Google_SPX          -0.889431
Google_index fund    0.141491
Google_ETF           0.204614
sentiment_score           NaN
Name: sp500_Close, dtype: float64

Lagged Correlations:

Correlation with S&P 500 Close lagged 7 days:
sp500_Close          0.648315
bitcoin_Close        0.532346
gold_Close           0.194197
Google_sp500        -0.334830
Google_SPX          -0.265138
Google_index fund   -0.265138
Google_ETF          -0.052059
sentiment_score           NaN
sp500_Close_lag7     1.000000
Name: sp500_Close_lag7, dtype: float64

Correlation with S&P 500 Close lagged 30 days:
sp500_Close         NaN
bitcoin_Close       NaN
gold_Close          NaN
Google_sp500        NaN
Google_SPX          NaN
Google_index fund   NaN
Google_ETF          NaN
sentiment_score     NaN
sp500_Close_lag7    NaN
sp500_Close_lag30   NaN
Name: sp500_Close_lag30, dty

  correlations = df.corr().loc["sp500_Close"]
  lagged_correlations[lag] = lagged_df.corr().loc[f"sp500_Close_lag{lag}"]
  lagged_correlations[lag] = lagged_df.corr().loc[f"sp500_Close_lag{lag}"]
  lagged_correlations[lag] = lagged_df.corr().loc[f"sp500_Close_lag{lag}"]
