In [1]:
# Stock Prediction Decision Making Notebook
# Interactive notebook for making daily trading decisions

import os
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Configuration
TOP_N_PICKS = 10
INVESTMENT_AMOUNT = 1000

# Path configuration - adjust for notebook location
# Since notebook is in eda-notebooks/, go up one level to project root
PROJECT_ROOT = Path("..").resolve()  # Go up one level from eda-notebooks/
ARTIFACTS_DIR = PROJECT_ROOT / "artifacts"
DATA_DIR = PROJECT_ROOT / "data"
RESULTS_DIR = PROJECT_ROOT / "results"
APP_DIR = PROJECT_ROOT / "app"

print("📊 Stock Prediction Decision Making Notebook")
print("=" * 50)

# Setup paths
if str(APP_DIR) not in sys.path:
    sys.path.append(str(APP_DIR))

print(f"✅ Paths configured:")
print(f"   Project root: {PROJECT_ROOT}")
print(f"   App code: {APP_DIR}")
print(f"   Artifacts: {ARTIFACTS_DIR}")
print(f"   Data: {DATA_DIR}")

📊 Stock Prediction Decision Making Notebook
✅ Paths configured:
   Project root: /Users/sagardhal/Desktop/Practice/personal-stock
   App code: /Users/sagardhal/Desktop/Practice/personal-stock/app
   Artifacts: /Users/sagardhal/Desktop/Practice/personal-stock/artifacts
   Data: /Users/sagardhal/Desktop/Practice/personal-stock/data


In [2]:
# Stock Prediction Decision Making Notebook
# Interactive notebook for making daily trading decisions

import os
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Configuration
TOP_N_PICKS = 10
INVESTMENT_AMOUNT = 1000

# Path configuration - adjust for notebook location
# Since notebook is in eda-notebooks/, go up one level to project root
PROJECT_ROOT = Path("..").resolve()  # Go up one level from eda-notebooks/
ARTIFACTS_DIR = PROJECT_ROOT / "artifacts"
DATA_DIR = PROJECT_ROOT / "data"
RESULTS_DIR = PROJECT_ROOT / "results"
APP_DIR = PROJECT_ROOT / "app"

print("📊 Stock Prediction Decision Making Notebook")
print("=" * 50)

# Setup paths
if str(APP_DIR) not in sys.path:
    sys.path.append(str(APP_DIR))

print(f"✅ Paths configured:")
print(f"   Project root: {PROJECT_ROOT}")
print(f"   App code: {APP_DIR}")
print(f"   Artifacts: {ARTIFACTS_DIR}")
print(f"   Data: {DATA_DIR}")

📊 Stock Prediction Decision Making Notebook
✅ Paths configured:
   Project root: /Users/sagardhal/Desktop/Practice/personal-stock
   App code: /Users/sagardhal/Desktop/Practice/personal-stock/app
   Artifacts: /Users/sagardhal/Desktop/Practice/personal-stock/artifacts
   Data: /Users/sagardhal/Desktop/Practice/personal-stock/data


In [3]:
# go up one level to the root directory
sys.path.append(os.path.abspath(".."))



# # add project root (parent of notebooks/) to sys.path
# project_root = Path.cwd().parent
# sys.path.insert(0, str(project_root))

from app.predictions import (
    load_latest_data,
    load_model_and_features,
    PredictionComparator,
    _TransformAdapter,
)

# If you also need TrainModel directly in the notebook:
from app.train_model_new import TrainModel   # ✅ absolute package import

In [4]:
# Step 1: Load Tickers
print("\n" + "=" * 50)
print("STEP 1: LOAD TICKERS")
print("=" * 50)


tickers = pd.read_csv("/Users/sagardhal/Desktop/Practice/personal-stock/ticker/spx_ndx_liq_top250_latest.csv")
tickers.head()



STEP 1: LOAD TICKERS


Unnamed: 0,Ticker,avg_traded_value,last_close
0,NVDA,30202330000.0,170.619995
1,TSLA,27448230000.0,334.089996
2,AAPL,12357710000.0,238.470001
3,PLTR,12142480000.0,154.899994
4,MSFT,11163710000.0,505.350006


In [5]:
# Step 2: Check Requirements
print("\n" + "=" * 50)
print("STEP 2: CHECK REQUIREMENTS")
print("=" * 50)

# Check for trained models
artifacts_files = list(ARTIFACTS_DIR.glob("*.joblib")) + list(ARTIFACTS_DIR.glob("*.pkl"))
print(f"🤖 Models found: {len(artifacts_files)}")
for f in artifacts_files:
    print(f"   - {f.name}")

# Check for data files
data_files = list(DATA_DIR.glob("*.parquet")) if DATA_DIR.exists() else []
print(f"📊 Data files found: {len(data_files)}")
for f in data_files[:3]:
    print(f"   - {f.name}")
if len(data_files) > 3:
    print(f"   ... and {len(data_files) - 3} more files")


STEP 2: CHECK REQUIREMENTS
🤖 Models found: 5
   - best_rf_model.joblib
   - random_forest_train_valid_20250904_112953.joblib
   - random_forest_train_only_20250904_000839.joblib
   - random_forest_train_only_20250904_112906.joblib
   - random_forest_train_valid_20250904_000856.joblib
📊 Data files found: 1
   - stock_data_combined_20250904_071145.parquet


In [6]:
print("\n" + "=" * 50)
print("STEP 3: LOAD DATA AND MODEL")
print("=" * 50)

# Load data
try:
    data = load_latest_data()
    print(f"✅ Data loaded: {data.shape[0]:,} rows × {data.shape[1]:,} columns")
    print(f"   Date range: {data['Date'].min()} to {data['Date'].max()}")
    print(f"   Unique tickers: {data['Ticker'].nunique()}")
    
    # Prepare data for modeling
    tm = TrainModel(_TransformAdapter(data))
    tm.prepare_dataframe(start_date="2000-01-01")
    print(f"   Data prepared: {tm.df_full.shape}")
    
except Exception as e:
    print(f"❌ Data loading failed: {e}")
    raise


STEP 3: LOAD DATA AND MODEL
[load_latest_data] Using: stock_data_combined_20250904_071145.parquet  (from /Users/sagardhal/Desktop/Practice/personal-stock/data)
✅ Data loaded: 2,094,565 rows × 222 columns
   Date range: 1962-01-02 00:00:00 to 2025-09-03 00:00:00
   Unique tickers: 250
Preparing dataframe for modeling...
Defining feature sets...
Feature Set Summary:
  Growth features: 70
  Technical indicators: 56
  Technical patterns: 61
  Custom numerical: 7
  Macro features: 75
  Categorical (for dummies): 7
  Target columns: 2
  Total numerical features: 206
  Unused columns: 0
Creating dummy variables...
Created 397 dummy variables
Sample dummies: ['month_1', 'month_10', 'month_11', 'month_12', 'month_2']
Filtered data from 2000-01-01
Date range: 2000-01-03 00:00:00 to 2025-09-03 00:00:00
Temporal split created:
  train: 904,317 samples
  validation: 230,530 samples
  test: 237,417 samples
Creating ML datasets...
Total features before filtering: 603
  - Numerical: 206
  - Dummies: 

In [7]:
# Load model
try:
    model, feature_cols, target_col = load_model_and_features(str(ARTIFACTS_DIR))
    tm.model = model
    tm._inference_feature_columns = feature_cols
    if target_col:
        tm.target_col = target_col
    
    print(f"✅ Model loaded successfully")
    print(f"   Features: {len(feature_cols)}")
    print(f"   Target: {tm.target_col}")
    
    # Check feature availability
    available_features = [f for f in feature_cols if f in tm.df_full.columns]
    missing_features = [f for f in feature_cols if f not in tm.df_full.columns]
    
    feature_coverage = len(available_features) / len(feature_cols)
    print(f"   Feature coverage: {feature_coverage:.1%} ({len(available_features)}/{len(feature_cols)})")
    
    if feature_coverage < 0.8:
        print(f"⚠️ Warning: Low feature coverage ({feature_coverage:.1%})")
        print("Model and data may be incompatible")
    
except Exception as e:
    print(f"❌ Model loading failed: {e}")
    raise


[load_model_and_features] Using model file: best_rf_model.joblib
[load_model_and_features] feature_names_in_: 603 features
✅ Model loaded successfully
   Features: 603
   Target: is_positive_growth_30d_future
   Feature coverage: 100.0% (603/603)


In [8]:
# Step 4: Generate Predictions (only if everything loaded)
#if not missing_requirements and 'tm' in locals() and 'model' in locals():
print("\n" + "=" * 50)
print("STEP 4: GENERATE PREDICTIONS")
print("=" * 50)

# Create prediction comparator
comparator = PredictionComparator(tm.df_full, tm.target_col)

# Add manual rule-based predictions
try:
    comparator.add_manual_predictions()
    print("✅ Manual predictions created")
except Exception as e:
    print(f"⚠️ Manual predictions failed: {e}")

# Add ML predictions
try:
    comparator.add_ml_predictions(model, feature_cols)
    print("✅ ML predictions created")
    ml_success = True
except Exception as e:
    print(f"⚠️ ML predictions failed: {e}")
    # Create fallback probability column
    comparator.df['rf_prob_30d'] = 0.5
    ml_success = False

# Add additional strategies if ML worked
if ml_success and 'rf_prob_30d' in comparator.df.columns:
    try:
        comparator.add_ml_thresholds_from_validation("rf_prob_30d")
        print("✅ Adaptive thresholds created")
    except Exception as e:
        print(f"⚠️ Adaptive thresholds failed: {e}")
    
    try:
        comparator.add_daily_topn(proba_col="rf_prob_30d", n=3)
        comparator.add_daily_topn(proba_col="rf_prob_30d", n=5)
        print("✅ Top-K strategies created")
    except Exception as e:
        print(f"⚠️ Top-K strategies failed: {e}")

print(f"📊 Total strategies created: {len(comparator.prediction_cols)}")


STEP 4: GENERATE PREDICTIONS
Creating manual rule-based predictions...
Manual prediction summary:
  pred0_manual_cci: 2.5% positive predictions
  pred1_manual_prev_g1: 58.2% positive predictions
  pred2_manual_prev_g1_and_snp: 0.0% positive predictions
  pred3_manual_declining_rates: 47.6% positive predictions
  pred4_manual_fed_easing: 40.5% positive predictions
  pred5_manual_vix_contrarian: 18.9% positive predictions
  pred6_manual_stock_btc_momentum: 0.2% positive predictions
✅ Manual predictions created


KeyboardInterrupt: 

In [None]:
print("\n" + "=" * 50)
print("STEP 5: ANALYZE YOUR TICKERS")
print("=" * 50)

# Get latest date and filter to your tickers
latest_date = comparator.df['Date'].max()
print(f"📅 Latest data date: {latest_date}")

# Filter to your tickers and recent data
your_data = comparator.df[
    (comparator.df['Ticker'].isin(tickers)) & 
    (comparator.df['Date'] >= latest_date - timedelta(days=7))
].copy()

print(f"📊 Your tickers in recent data: {your_data['Ticker'].nunique()}/{len(tickers)}")

if len(your_data) == 0:
    print("❌ No data found for your tickers in recent period")
    print("Your tickers might not be in the processed dataset")
    available_tickers = comparator.df['Ticker'].unique()[:20]
    print(f"Available tickers (sample): {', '.join(available_tickers)}")
else:
    # Get most recent data for each ticker
    latest_by_ticker = your_data.loc[your_data.groupby('Ticker')['Date'].idxmax()]
    
    # Sort by prediction probability
    prob_col = 'rf_prob_30d' if 'rf_prob_30d' in latest_by_ticker.columns else None
    if prob_col and latest_by_ticker[prob_col].std() > 0:
        latest_by_ticker = latest_by_ticker.sort_values(prob_col, ascending=False)
        prob_source = "ML Model"
    else:
        # Fallback to manual prediction
        manual_cols = [c for c in latest_by_ticker.columns if c.startswith('pred') and 'manual' in c]
        if manual_cols:
            prob_col = manual_cols[0]
            latest_by_ticker = latest_by_ticker.sort_values(prob_col, ascending=False)
            prob_source = "Manual Rules"
        else:
            prob_col = None
            prob_source = "None"
    
    # Top picks
    top_picks = latest_by_ticker.head(TOP_N_PICKS)
    
    print(f"\n🎯 TOP {len(top_picks)} PICKS (sorted by {prob_source}):")
    print("-" * 60)
    
    for i, (_, stock) in enumerate(top_picks.iterrows(), 1):
        ticker = stock['Ticker']
        date = stock['Date'].strftime('%Y-%m-%d')
        
        if prob_col and prob_col == 'rf_prob_30d':
            prob_value = f"{stock[prob_col]*100:.1f}%"
        elif prob_col:
            prob_value = f"{stock[prob_col]:.0f}"
        else:
            prob_value = "N/A"
        
        print(f"{i:2d}. {ticker:6s} | Prob: {prob_value:6s} | Date: {date} | Investment: ${INVESTMENT_AMOUNT:,}")



STEP 5: ANALYZE YOUR TICKERS
📅 Latest data date: 2025-09-03 00:00:00
📊 Your tickers in recent data: 0/250
❌ No data found for your tickers in recent period
Your tickers might not be in the processed dataset
Available tickers (sample): AAPL, ADBE, AMAT, AMD, AMZN, APP, AVGO, BA, BAC, BRK-B, C, CAT, COIN, COST, CRM, CRWD, CSCO, CVX, GEV, GOOG


In [None]:
# Step 5: Analyze Your Tickers
print("\n" + "=" * 50)
print("STEP 5: ANALYZE YOUR TICKERS")
print("=" * 50)

# Get latest date and filter to your tickers
latest_date = comparator.df['Date'].max()
print(f"📅 Latest data date: {latest_date}")

# Filter to your tickers and recent data
your_data = comparator.df[
    (comparator.df['Ticker'].isin(tickers)) & 
    (comparator.df['Date'] >= latest_date - timedelta(days=7))
].copy()

print(f"📊 Your tickers in recent data: {your_data['Ticker'].nunique()}/{len(tickers)}")

if len(your_data) == 0:
    print("❌ No data found for your tickers in recent period")
    print("Your tickers might not be in the processed dataset")
    available_tickers = comparator.df['Ticker'].unique()[:20]
    print(f"Available tickers (sample): {', '.join(available_tickers)}")
else:
    # Get most recent data for each ticker
    latest_by_ticker = your_data.loc[your_data.groupby('Ticker')['Date'].idxmax()]
    
    # Sort by prediction probability
    prob_col = 'rf_prob_30d' if 'rf_prob_30d' in latest_by_ticker.columns else None
    if prob_col and latest_by_ticker[prob_col].std() > 0:
        latest_by_ticker = latest_by_ticker.sort_values(prob_col, ascending=False)
        prob_source = "ML Model"
    else:
        # Fallback to manual prediction
        manual_cols = [c for c in latest_by_ticker.columns if c.startswith('pred') and 'manual' in c]
        if manual_cols:
            prob_col = manual_cols[0]
            latest_by_ticker = latest_by_ticker.sort_values(prob_col, ascending=False)
            prob_source = "Manual Rules"
        else:
            prob_col = None
            prob_source = "None"
    
    # Top picks
    top_picks = latest_by_ticker.head(TOP_N_PICKS)
    
    print(f"\n🎯 TOP {len(top_picks)} PICKS (sorted by {prob_source}):")
    print("-" * 60)
    
    for i, (_, stock) in enumerate(top_picks.iterrows(), 1):
        ticker = stock['Ticker']
        date = stock['Date'].strftime('%Y-%m-%d')
        
        if prob_col and prob_col == 'rf_prob_30d':
            prob_value = f"{stock[prob_col]*100:.1f}%"
        elif prob_col:
            prob_value = f"{stock[prob_col]:.0f}"
        else:
            prob_value = "N/A"
        
        print(f"{i:2d}. {ticker:6s} | Prob: {prob_value:6s} | Date: {date} | Investment: ${INVESTMENT_AMOUNT:,}")



STEP 5: ANALYZE YOUR TICKERS
📅 Latest data date: 2025-09-03 00:00:00
📊 Your tickers in recent data: 0/250
❌ No data found for your tickers in recent period
Your tickers might not be in the processed dataset
Available tickers (sample): AAPL, ADBE, AMAT, AMD, AMZN, APP, AVGO, BA, BAC, BRK-B, C, CAT, COIN, COST, CRM, CRWD, CSCO, CVX, GEV, GOOG


In [None]:
# Step 6: Decision Matrix
print("\n" + "=" * 50)
print("STEP 6: DECISION MATRIX")
print("=" * 50)

decisions = []
total_strong_buy = 0
total_buy = 0

for _, stock in top_picks.iterrows():
    ticker = stock['Ticker']
    
    # Determine probability and action
    if prob_col == 'rf_prob_30d':
        prob = stock[prob_col]
        prob_display = f"{prob*100:.1f}%"
        
        if prob >= 0.8:
            action = "🟢 STRONG BUY"
            total_strong_buy += 1
        elif prob >= 0.7:
            action = "🟡 BUY"
            total_buy += 1
        elif prob >= 0.6:
            action = "🟠 CONSIDER"
        else:
            action = "🔴 WAIT"
    else:
        prob_display = "Manual"
        action = "🟡 BUY" if stock.get(prob_col, 0) > 0 else "🔴 WAIT"
        if action == "🟡 BUY":
            total_buy += 1
    
    decisions.append({
        'Rank': len(decisions) + 1,
        'Ticker': ticker,
        'Signal': prob_display,
        'Action': action,
        'Investment': f"${INVESTMENT_AMOUNT:,}"
    })

# Display decision table
decision_df = pd.DataFrame(decisions)
print(decision_df.to_string(index=False))

# Summary
print(f"\n💰 INVESTMENT SUMMARY:")
print(f"   Strong Buy signals: {total_strong_buy}")
print(f"   Buy signals: {total_buy}")
total_positions = total_strong_buy + total_buy
total_investment = total_positions * INVESTMENT_AMOUNT
print(f"   Total positions: {total_positions}")
print(f"   Total investment: ${total_investment:,}")

# Action plan
print(f"\n📋 ACTION PLAN:")
strong_buys = [d['Ticker'] for d in decisions if 'STRONG' in d['Action']]
buys = [d['Ticker'] for d in decisions if d['Action'] == '🟡 BUY']

if strong_buys:
    print(f"🟢 IMMEDIATE: Buy {', '.join(strong_buys)}")
if buys:
    print(f"🟡 SECONDARY: Consider {', '.join(buys)}")
if not strong_buys and not buys:
    print("🔴 WAIT: No strong signals today")


STEP 6: DECISION MATRIX


NameError: name 'top_picks' is not defined

In [None]:
# Step 7: Save Results
print("\n" + "=" * 50)
print("STEP 7: SAVE RESULTS")
print("=" * 50)

# Create results directory
RESULTS_DIR.mkdir(exist_ok=True)

# Save with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
results_file = RESULTS_DIR / f"notebook_decisions_{timestamp}.csv"

# Create tracking dataframe
tracking_df = pd.DataFrame(decisions)
tracking_df['Analysis_Date'] = datetime.now()
tracking_df['Data_Date'] = latest_date
tracking_df['Model_Source'] = prob_source
tracking_df['Ticker_Count'] = len(tickers)

# Save
tracking_df.to_csv(results_file, index=False)
print(f"💾 Results saved: {results_file}")

# Also save just the buy signals for easy reference
buy_signals = tracking_df[tracking_df['Action'].str.contains('BUY')]
if len(buy_signals) > 0:
    buy_file = RESULTS_DIR / f"buy_signals_{timestamp}.csv"
    buy_signals[['Ticker', 'Action', 'Investment']].to_csv(buy_file, index=False)
    print(f"💾 Buy signals saved: {buy_file}")

# Final Summary
print("\n" + "=" * 50)
print("ANALYSIS COMPLETE")
print("=" * 50)

if missing_requirements:
print("❌ Analysis incomplete due to missing requirements")
print("Complete the setup steps above and restart")
elif 'decisions' in locals():
print("✅ Analysis complete!")
print(f"📊 Analyzed {len(top_picks)} stocks from {len(tickers)} tickers")
print(f"💰 Investment recommendations: ${total_investment:,}")
print(f"📅 Based on data through: {latest_date}")

print(f"\n🔄 To refresh analysis:")
print("1. Update data: python run_data_extraction.py")
print("2. Retrain model (optional): python run_model_training.py")
print("3. Re-run this notebook")
else:
print("⚠️ Analysis incomplete - check errors above")

print(f"\n📝 Remember:")
print("• Set stop losses at -15% to -20%")
print("• Monitor positions daily")
print("• Diversify - don't put more than 5-10% in any single position")
print("• Past performance doesn't guarantee future results")