In [6]:
import sys
sys.path.append("..")
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from transformers import AutoConfig
import csv
import time
import torch


In [10]:
import random

# Expanded pools of entities, actions, and contexts
entities = [
    # Instruments & indices
    "Bitcoin prices", "Ethereum volatility", "S&P 500 index", "NASDAQ Composite",
    "10-year Treasury yield", "Corporate bond spreads", "High-yield debt",
    "Gold futures", "Crude oil benchmarks", "Emerging-market equities",
    # Actors & participants
    "Retail investors", "Institutional funds", "Hedge funds",
    "Pension fund allocations", "Sovereign wealth funds",
    "Day traders", "Options traders", "Forex speculators",
    # Sectors & themes
    "Tech startup valuations", "Green energy stocks", "Healthcare IPOs",
    "Luxury goods shares", "Real estate REITs", "Consumer staples",
    "Automotive suppliers", "Semiconductor fabs", "Biotech pipelines"
]

risky_actions = [
    "plunge unexpectedly", "skyrocket without warning", "face regulatory probes",
    "experience flash crashes", "suffer margin calls", "collapse under leverage",
    "witness pump-and-dump schemes", "breach critical support levels",
    "see record drawdowns", "tumble on liquidity fears",
    "spike amid speculative frenzy", "hit multi-year lows",
    "face contagion from peers", "trigger forced liquidations",
    "breach volatility thresholds", "face short-squeeze reversals",
    "lash out on geopolitical news", "erode on margin expansion",
    "see historic underperformance", "stall under tightening bias"
]

non_risky_actions = [
    "offer stable returns", "remain range-bound", "see gradual growth",
    "provide predictable yields", "maintain low volatility",
    "deliver consistent performance", "attract long-term investors",
    "trade within narrow bands", "benefit from strong fundamentals",
    "hold value over time", "track benchmark indices closely",
    "display robust financials", "sustain dividend payouts",
    "show steady cash flow", "exhibit defensive characteristics",
    "maintain high credit ratings", "remain uncorrelated to equities",
    "demonstrate low beta", "offer inflation protection",
    "see consistent inflows"
]

contexts = [
    "this week", "this quarter", "year-to-date", "over the past year",
    "since last earnings season", "amid economic uncertainty",
    "ahead of the Fed meeting", "following central bank remarks",
    "during market open hours", "after a major merger announcement",
    "amid rising inflation expectations", "before the bond auction",
    "in the wake of policy changes", "during the holiday trading session",
    "as retail demand surges", "amid liquidity constraints",
    "after corporate guidance cuts", "during peak trading volumes",
    "ahead of the CPI print", "during a risk-off sell-off"
]

def make_sentence(entity, action, context):
    """Format a single sentence."""
    return f"{entity} {action} {context}."

def generate_statements(count, actions_pool):
    """
    Generate `count` unique sentences using actions_pool.
    Ensures no duplicates via a set.
    """
    stmts = set()
    while len(stmts) < count:
        e = random.choice(entities)
        a = random.choice(actions_pool)
        c = random.choice(contexts)
        stmts.add(make_sentence(e, a, c))
    return list(stmts)

if __name__ == "__main__":
    # Generate 500 of each
    risky = generate_statements(500, risky_actions)
    non_risky = generate_statements(500, non_risky_actions)

    # Optionally combine
    all_statements = risky + non_risky
    
    # remove duplicates from risky and non_risky
    risky = list(set(risky))
    non_risky = list(set(non_risky))
    all_statements = list(set(all_statements))


In [12]:
csv_path = "../data/risk_behavior/financial_statements.csv"
with open(csv_path, "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["statement", "label"])
    # Risky = 0
    for stmt in risky:
        writer.writerow([stmt, 0])
    # Non-risky = 1
    for stmt in non_risky:
        writer.writerow([stmt, 1])

Another way of creating data

In [3]:
import random
import csv
import json
from itertools import product

# 1) Seed for reproducibility
SEED = 42
random.seed(SEED)

# 2) Expanded pools
entities = [
    "Bitcoin prices", "Ethereum volatility", "S&P 500 index", "NASDAQ Composite",
    "10-year Treasury yield", "Corporate bond spreads", "High-yield debt",
    "Gold futures", "Crude oil benchmarks", "Emerging-market equities",
    "Retail investors", "Institutional funds", "Hedge funds",
    "Tech startup valuations", "Green energy stocks", "Healthcare IPOs"
]

risky_actions = [
    "plunge unexpectedly", "skyrocket without warning", "face regulatory probes",
    "experience flash crashes", "suffer margin calls", "collapse under leverage",
    "witness pump-and-dump schemes", "breach critical support levels",
    "see record drawdowns", "tumble on liquidity fears"
]

non_risky_actions = [
    "offer stable returns", "remain range-bound", "see gradual growth",
    "provide predictable yields", "maintain low volatility",
    "deliver consistent performance", "attract long-term investors",
    "trade within narrow bands", "benefit from strong fundamentals",
    "hold value over time"
]

contexts = [
    "this week", "this quarter", "year-to-date", "over the past year",
    "amid economic uncertainty", "ahead of the Fed meeting",
    "following central bank remarks", "after a major merger announcement",
    "before the bond auction", "during a risk-off sell-off"
]

# 3) More templates  
templates = [
    # Direct strategy prompts
    "Given that {entity} is expected to {action} {context}, what allocation (%) would you recommend between stocks, bonds, and cash? Explain your reasoning.",
    "How would you construct an investment portfolio if {entity} {action} {context}? Outline your top three positions and why.",
    "What trading strategy would you employ over the next month knowing that {entity} will {action} {context}?",
    "Design a risk-adjusted investment plan for a moderate investor, assuming {entity} {action} {context}.",

    # Buy/sell/hold decision prompts
    "Would you buy, hold, or sell exposure to {entity} given it is likely to {action} {context}? Justify your choice.",
    "Should an investor increase or decrease their position in {entity} ahead of its tendency to {action} {context}? Provide pros and cons.",
    "Is it prudent to go long or short on {entity} when it has a history of {action} {context}? Explain your stance.",
    
    # Comparative risk prompts
    "Compare the risk/return tradeoffs of allocating to {entity} versus a broad equity index if {entity} {action} {context}. Which is more attractive?", 
    "Between {entity} and cash-equivalents, which offers a better risk-adjusted return when {entity} {action} {context}? Why?",
    
    # Time-horizon tailoring
    "For a short-term (1–3 month) horizon, would you overweight or underweight {entity} given it is apt to {action} {context}? Outline your rationale.",
    "Over a five-year horizon, how should a pension fund tilt its allocation if {entity} {action} {context}? Explain risk mitigation steps.",
    
    # Risk-management prompts
    "What hedging techniques would you use to protect a portfolio from {entity} {action} {context}?",
    "How would you adjust stop-loss levels on a {entity} position in anticipation of {entity} {action} {context}?",
    
    # Scenario-analysis prompts
    "Run a scenario analysis showing portfolio P&L if {entity} {action} {context} versus if it doesn’t. What do you conclude?",
    "Assuming a black-swan shock causes {entity} to {action} {context}, how would you rebalance your investments?",
]


def make_sentence(entity, action, context, template):
    return template.format(entity=entity, action=action, context=context)

def generate_dataset(n_per_class=500):
    data = []
    # generate cartesian products for more variation
    for label, actions in [("risky", risky_actions), ("non_risky", non_risky_actions)]:
        for _ in range(n_per_class):
            ent = random.choice(entities)
            act = random.choice(actions)
            ctx = random.choice(contexts)
            tmpl = random.choice(templates)
            text = make_sentence(ent, act, ctx, tmpl)
            data.append({
                "text": text,
                "risk_label": label
            })
    return data

if __name__ == "__main__":
    dataset = generate_dataset(n_per_class=500)

    


In [4]:
dataset

[{'text': 'Design a risk-adjusted investment plan for a moderate investor, assuming NASDAQ Composite plunge unexpectedly amid economic uncertainty.',
  'risk_label': 'risky'},
 {'text': 'Over a five-year horizon, how should a pension fund tilt its allocation if Gold futures face regulatory probes this quarter? Explain risk mitigation steps.',
  'risk_label': 'risky'},
 {'text': 'Given that S&P 500 index is expected to tumble on liquidity fears following central bank remarks, what allocation (%) would you recommend between stocks, bonds, and cash? Explain your reasoning.',
  'risk_label': 'risky'},
  'risk_label': 'risky'},
 {'text': 'What hedging techniques would you use to protect a portfolio from Bitcoin prices see record drawdowns over the past year?',
  'risk_label': 'risky'},
 {'text': 'For a short-term (1–3 month) horizon, would you overweight or underweight Tech startup valuations given it is apt to experience flash crashes after a major merger announcement? Outline your rationa

In [11]:
import random
import csv

# 1) Seed for reproducibility
SEED = 2025
random.seed(SEED)

# 2) Expanded per-instrument pools
instruments = {
    "Bitcoin": {
        "contexts": [
            "amid a sudden spike in Bitcoin volatility",
            "as Bitcoin hits record daily swings",
            "during a sharp pullback in Bitcoin prices"
        ],
        "risky_strategies": [
            "allocate 50% of the portfolio to Bitcoin futures with 2× leverage",
            "go all-in on Bitcoin spot with a 3× margin loan",
            "rotate 80% of equity exposure into Bitcoin option straddles"
        ],
        "non_risky_strategies": [
            "hold 10% in a Bitcoin index fund and rebalance quarterly",
            "keep 5% of assets in Bitcoin ETFs and the rest in cash",
            "maintain a capped Bitcoin allocation of 2% of total AUM"
        ]
    },
    "Ethereum": {
        "contexts": [
            "as Ethereum’s gas fees surge",
            "when Ethereum’s staking yields compress",
            "following an upgrade to Ethereum’s protocol"
        ],
        "risky_strategies": [
            "stake 70% of ETH holdings in a DeFi protocol with high APY",
            "use 4× leverage to long Ethereum perpetual swaps",
            "allocate 60% to ETH smart-contract yield farms"
        ],
        "non_risky_strategies": [
            "keep ETH in cold storage and stake only 5%",
            "invest 10% in an ETH trust and hold long-term",
            "allocate 15% to an ETH-based stablecoin strategy"
        ]
    },
    "S&P 500": {
        "contexts": [
            "during a steep sell-off in S&P 500 futures",
            "as the S&P 500 trades below its 200-day moving average",
            "amid sector rotation pressure in the S&P 500"
        ],
        "risky_strategies": [
            "shift 70% into leveraged S&P 500 ETFs (2× long)",
            "deploy a concentrated bet on the top 5 S&P 500 constituents",
            "sell deep out-of-the-money S&P 500 puts at 3× leverage"
        ],
        "non_risky_strategies": [
            "buy a diversified S&P 500 index fund and hold",
            "invest 60% in S&P 500 low-volatility ETF",
            "allocate 50% to an S&P 500 target-date fund"
        ]
    },
    "10-year Treasury": {
        "contexts": [
            "as the 10-year Treasury yield jumps 20 bps in one day",
            "when the 10-year Treasury trades at multi-year highs",
            "during heightened demand for 10-year Treasury notes"
        ],
        "risky_strategies": [
            "short 10-year Treasuries with 5× notional leverage",
            "rotate 80% into high-yield corporate bond ETFs",
            "use interest-rate futures spreads to bet on curve steepening"
        ],
        "non_risky_strategies": [
            "ladder Treasuries from 2–10-year maturities, equal weight",
            "allocate 70% to 10-year Treasuries and 30% to T-bills",
            "maintain a duration-neutral Treasury portfolio"
        ]
    },
    "Gold": {
        "contexts": [
            "as gold futures breach critical resistance levels",
            "amid central bank gold-buying headlines",
            "during a flight-to-safety rally in precious metals"
        ],
        "risky_strategies": [
            "allocate 60% to gold ETFs and use 3× leverage",
            "buy gold call options with 4-month expiry",
            "rotate 50% of equity exposure into gold mining stocks"
        ],
        "non_risky_strategies": [
            "hold 10% in physical gold bars and 5% in a gold trust",
            "invest 15% in a low-cost gold ETF with monthly rebalancing",
            "maintain a capped 5% portfolio allocation to gold"
        ]
    },
    "Crude Oil": {
        "contexts": [
            "following a surprise drawdown in US oil inventories",
            "amid OPEC+ production cut rumors",
            "during a sudden rally in front-month crude futures"
        ],
        "risky_strategies": [
            "go 2×-leveraged long on oil ETF futures",
            "use oil call spreads on WTI contracts",
            "allocate 40% to energy sector leveraged ETNs"
        ],
        "non_risky_strategies": [
            "hold 10% in broad energy ETFs with no leverage",
            "invest 5% in an oil producer’s dividend ETF",
            "maintain a 3% allocation to crude‐linked mutual funds"
        ]
    },
    "EUR/USD": {
        "contexts": [
            "as the EUR/USD pair breaches parity",
            "during heightened volatility in forex markets",
            "amid unexpected ECB rate cut speculation"
        ],
        "risky_strategies": [
            "use 10×-leveraged positions in EUR/USD forwards",
            "deploy a carry-trade short USD, long EUR at 5×",
            "allocate 30% to forex options straddles on EUR/USD"
        ],
        "non_risky_strategies": [
            "keep 2% in a currency-hedged EUR/USD ETF",
            "use forward contracts to hedge 5% currency exposure",
            "allocate 1% to EUR/USD spot positions with tight stops"
        ]
    },
    "Corporate Bond": {
        "contexts": [
            "amid widening corporate bond spreads",
            "as credit rating downgrades hit junk issuers",
            "during a surge in corporate bond issuance"
        ],
        "risky_strategies": [
            "allocate 50% to high-yield bond ETFs with 3× leverage",
            "buy first-lien covenant-lite debt at deep discounts",
            "deploy 40% in floating-rate subordinated notes"
        ],
        "non_risky_strategies": [
            "hold 70% in investment-grade bond funds with laddered maturities",
            "invest 30% in an IG corporate bond ETF, no leverage",
            "maintain a 5% cap on high-yield exposure"
        ]
    },
    "REIT": {
        "contexts": [
            "during a drop in real estate REIT valuations",
            "as interest rates rise impacting cap rates",
            "amid a wave of REIT dividend cuts"
        ],
        "risky_strategies": [
            "go long 80% in leveraged equity REIT ETFs",
            "buy deep out-of-the-money REIT call options",
            "allocate 60% to small-cap property developer stocks"
        ],
        "non_risky_strategies": [
            "hold 10% in a diversified REIT index fund",
            "invest 15% in a mortgage-REIT ETF with low leverage",
            "maintain a 5% allocation to core commercial REITs"
        ]
    }
}

# 3) Template
template = "{context}, the recommendation is to {strategy}."

# 4) Generator function
def generate_labelled(instrument_pool, n_per_label):
    risky, non_risky = [], []
    while len(risky) < n_per_label or len(non_risky) < n_per_label:
        inst = random.choice(list(instrument_pool.keys()))
        pool = instrument_pool[inst]
        ctx = random.choice(pool["contexts"])
        if len(risky) < n_per_label:
            strat = random.choice(pool["risky_strategies"])
            risky.append(template.format(context=ctx, strategy=strat))
        if len(non_risky) < n_per_label:
            strat = random.choice(pool["non_risky_strategies"])
            non_risky.append(template.format(context=ctx, strategy=strat))
    return risky, non_risky

if __name__ == "__main__":
    N = 500
    risky_stmts, non_risky_stmts = generate_labelled(instruments, N)

    # 5) Write out CSV
    csv_path = "../data/risk_behavior/financial_statements.csv"
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["statement", "label"])
        for stmt in risky_stmts:
            writer.writerow([stmt, 0])
        for stmt in non_risky_stmts:
            writer.writerow([stmt, 1])

    print(f"Saved {len(risky_stmts)} risky and {len(non_risky_stmts)} non-risky statements to {csv_path}")


Saved 500 risky and 500 non-risky statements to ../data/risk_behavior/financial_statements.csv


In [12]:
risky_stmts

['during a drop in real estate REIT valuations, the recommendation is to allocate 60% to small-cap property developer stocks.',
 'amid sector rotation pressure in the S&P 500, the recommendation is to shift 70% into leveraged S&P 500 ETFs (2× long).',
 'amid unexpected ECB rate cut speculation, the recommendation is to use 10×-leveraged positions in EUR/USD forwards.',
 'during heightened volatility in forex markets, the recommendation is to use 10×-leveraged positions in EUR/USD forwards.',
 'as Ethereum’s gas fees surge, the recommendation is to use 4× leverage to long Ethereum perpetual swaps.',
 'as the EUR/USD pair breaches parity, the recommendation is to allocate 30% to forex options straddles on EUR/USD.',
 'amid a wave of REIT dividend cuts, the recommendation is to go long 80% in leveraged equity REIT ETFs.',
 'as the 10-year Treasury yield jumps 20 bps in one day, the recommendation is to use interest-rate futures spreads to bet on curve steepening.',
 'during heightened vol