In [None]:
import sys

# Make modules that live one directory above the notebook importable.
# The membership check keeps the cell idempotent: re-running it does not
# stack duplicate ".." entries on sys.path.
if ".." not in sys.path:
    sys.path.append("..")


In [None]:
import csv  # Importing the csv module for writing data to a CSV file
import os  # Importing the os module for creating the output directory
import random  # Importing the random module for random number generation

# 1) Reproducibility: seed the global `random` generator with a fixed value
#    so that every run of this cell draws the same sequence.
SEED = 2025
random.seed(SEED)

# 2) Per-instrument pools
# Maps an instrument name to three lists of text fragments:
#   contexts             - opening clause describing the market situation
#   risky_strategies     - recommendations treated as risky
#   non_risky_strategies - recommendations treated as non-risky
# Only Bitcoin is spelled out here; other instruments (Ethereum, S&P 500,
# etc.) use the same three-list layout.
instruments = {
    "Bitcoin": dict(
        contexts=[
            "amid a sudden spike in Bitcoin volatility",
            "as Bitcoin hits record daily swings",
            "during a sharp pullback in Bitcoin prices",
        ],
        risky_strategies=[
            "allocate 50% of the portfolio to Bitcoin futures with 2× leverage",
            "go all-in on Bitcoin spot with a 3× margin loan",
            "rotate 80% of equity exposure into Bitcoin option straddles",
        ],
        non_risky_strategies=[
            "hold 10% in a Bitcoin index fund and rebalance quarterly",
            "keep 5% of assets in Bitcoin ETFs and the rest in cash",
            "maintain a capped Bitcoin allocation of 2% of total AUM",
        ],
    ),
}

# 3) Sentence template: filled with one context and one strategy per statement.
template = "{context}, the recommendation is to {strategy}."

# 4) Generator function
# Function to generate labeled financial statements
def generate_labelled(instrument_pool, n_per_label, rng=None, statement_template=None):
    """Generate equally sized lists of risky and non-risky statements.

    Every round picks one instrument and one of its contexts at random, then
    renders that same context twice: once with a random risky strategy and
    once with a random non-risky strategy. Both lists therefore always have
    the same length and share their contexts position by position.

    Args:
        instrument_pool: Mapping of instrument name to a dict holding the
            lists "contexts", "risky_strategies" and "non_risky_strategies".
        n_per_label: Number of statements to produce for each label. A value
            of zero or less yields two empty lists.
        rng: Optional object exposing ``choice(seq)``, for example a
            ``random.Random`` instance. When omitted, the global ``random``
            module is used, so results depend on its current seed/state.
        statement_template: Optional format string with ``{context}`` and
            ``{strategy}`` fields. When omitted, the module-level
            ``template`` is used.

    Returns:
        A tuple ``(risky, non_risky)`` of two lists of formatted strings.

    Raises:
        IndexError: Propagated from ``choice`` when ``n_per_label`` is
            positive and the pool, or one of the selected lists, is empty.
    """
    picker = random if rng is None else rng
    # Resolve the template lazily so an explicit argument never touches the
    # module-level name.
    fmt = template if statement_template is None else statement_template

    # The key list does not change while generating, so build it once.
    names = list(instrument_pool)

    risky, non_risky = [], []
    # Both lists grow by exactly one entry per round, so checking one length
    # is enough to know when both are full.
    while len(risky) < n_per_label:
        pool = instrument_pool[picker.choice(names)]
        ctx = picker.choice(pool["contexts"])
        # Draw order (instrument, context, risky, non-risky) is kept stable so
        # a given seed keeps producing the same statements.
        risky_strat = picker.choice(pool["risky_strategies"])
        risky.append(fmt.format(context=ctx, strategy=risky_strat))
        safe_strat = picker.choice(pool["non_risky_strategies"])
        non_risky.append(fmt.format(context=ctx, strategy=safe_strat))
    return risky, non_risky

if __name__ == "__main__":
    N = 500  # Number of statements to generate per label
    risky_stmts, non_risky_stmts = generate_labelled(instruments, N)

    # 5) Write out CSV
    # Path of the CSV file, relative to the notebook's working directory.
    csv_path = "../data/risk_behavior/financial_statements.csv"

    # open(..., "w") does not create missing parent directories, so make sure
    # the target folder exists before writing.
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # newline="" lets the csv module control line endings itself.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["statement", "label"])  # Header row
        # Label convention used by this file: risky -> 0, non-risky -> 1.
        writer.writerows([stmt, 0] for stmt in risky_stmts)
        writer.writerows([stmt, 1] for stmt in non_risky_stmts)

    # Print a confirmation message with the number of statements saved
    print(f"Saved {len(risky_stmts)} risky and {len(non_risky_stmts)} non-risky statements to {csv_path}")


Saved 500 risky and 500 non-risky statements to ../data/risk_behavior/financial_statements.csv


In [12]:
# Preview a handful of risky statements rather than dumping the entire list
# into the notebook output.
risky_stmts[:10]

['during a drop in real estate REIT valuations, the recommendation is to allocate 60% to small-cap property developer stocks.',
 'amid sector rotation pressure in the S&P 500, the recommendation is to shift 70% into leveraged S&P 500 ETFs (2× long).',
 'amid unexpected ECB rate cut speculation, the recommendation is to use 10×-leveraged positions in EUR/USD forwards.',
 'during heightened volatility in forex markets, the recommendation is to use 10×-leveraged positions in EUR/USD forwards.',
 'as Ethereum’s gas fees surge, the recommendation is to use 4× leverage to long Ethereum perpetual swaps.',
 'as the EUR/USD pair breaches parity, the recommendation is to allocate 30% to forex options straddles on EUR/USD.',
 'amid a wave of REIT dividend cuts, the recommendation is to go long 80% in leveraged equity REIT ETFs.',
 'as the 10-year Treasury yield jumps 20 bps in one day, the recommendation is to use interest-rate futures spreads to bet on curve steepening.',
 'during heightened vol