In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime, timedelta
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# CFM 101 - Group Assignment 2025
# Robo-Advising Challenge
# Competition Goal: Market Beat - Highest return above the benchmark average

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# NOTE: the machine-learning (scikit-learn) imports are in the import cell above.

# ============================================================================
# CONFIGURATION
# ============================================================================

INITIAL_CAPITAL = 1_000_000  # CAD
USD_TO_CAD = 1.41  # Exchange rate (update as needed)
TRAINING_START = '2022-01-01'  # start of the price history used to build features
TRAINING_END = '2024-11-15'  # end of the price history used to build features
VOLUME_CHECK_START = '2023-10-01'  # start of the window passed to the volume filter
VOLUME_CHECK_END = '2024-09-30'  # end of the window passed to the volume filter
MIN_STOCKS = 10  # below this count the script only prints a warning
MAX_STOCKS = 10  # selection stops once more than this many stocks are collected
MAX_WEIGHT = 0.15  # per-stock weight cap (fraction of the portfolio)
MAX_SECTOR_WEIGHT = 0.40  # per-sector weight cap (fraction of the portfolio)
MIN_TRADING_DAYS = 18  # NOTE(review): not referenced by any code visible in this file

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def calculate_fee(shares, price_usd):
    """Return the transaction fee charged for buying one stock.

    The fee is whichever is smaller: 0.001 per share, or a flat 2.15.
    ``price_usd`` is accepted for interface compatibility but does not
    influence the result. Callers in this file treat the result as USD.
    """
    candidate_fees = (shares * 0.001, 2.15)
    return min(candidate_fees)

def get_stock_data(ticker, start, end):
    """Download price history for ``ticker`` between ``start`` and ``end``.

    Args:
        ticker: Object exposing ``history(start=..., end=...)`` (the script
            passes ``yfinance.Ticker`` instances).
        start: Start of the requested window, forwarded unchanged.
        end: End of the requested window, forwarded unchanged.

    Returns:
        Whatever ``ticker.history`` returns, or ``None`` if it raised.
    """
    try:
        return ticker.history(start=start, end=end)
    except Exception as exc:
        # Only ordinary errors are treated as "no data"; KeyboardInterrupt and
        # SystemExit still propagate so a long download loop can be stopped.
        print(f"Data download failed for {ticker}: {exc}")
        return None

def calculate_technical_features(df):
    """Calculate technical indicators for the ML model.

    Args:
        df: Price history with at least ``'Close'`` and ``'Volume'`` columns,
            ordered oldest to newest. It must be non-empty, because the most
            recent row is read with ``iloc[-1]``.

    Returns:
        dict mapping feature name to its value on the most recent row.
        Values are NaN when the history is shorter than a feature's window.
        Key insertion order is kept stable because it becomes the column
        order of the model's feature matrix.
    """
    close = df['Close']
    volume = df['Volume']
    last_close = close.iloc[-1]
    daily_returns = close.pct_change()

    features = {}

    # Returns over several look-back windows
    features['return_5d'] = close.pct_change(5).iloc[-1]
    features['return_10d'] = close.pct_change(10).iloc[-1]
    features['return_20d'] = close.pct_change(20).iloc[-1]
    features['return_60d'] = close.pct_change(60).iloc[-1]

    # Volatility: rolling standard deviation of daily returns
    features['volatility_20d'] = daily_returns.rolling(20).std().iloc[-1]
    features['volatility_60d'] = daily_returns.rolling(60).std().iloc[-1]

    # Moving averages and the last close relative to them.
    # A NaN average fails the "> 0" test, so the ratio falls back to 0.
    features['sma_20'] = close.rolling(20).mean().iloc[-1]
    features['sma_50'] = close.rolling(50).mean().iloc[-1]
    features['price_to_sma20'] = last_close / features['sma_20'] if features['sma_20'] > 0 else 0
    features['price_to_sma50'] = last_close / features['sma_50'] if features['sma_50'] > 0 else 0

    # Momentum
    features['rsi'] = calculate_rsi(close, 14)
    # Last close relative to the close 19 rows earlier (iloc[-20]).
    features['momentum'] = last_close / close.iloc[-20] - 1 if len(df) > 20 else 0

    # Volume: mean of the latest 20 rows versus the 40 rows before them
    features['volume_ratio'] = volume.iloc[-20:].mean() / volume.iloc[-60:-20].mean() if len(df) > 60 else 1

    return features

def calculate_rsi(prices, period=14):
    """Return the most recent Relative Strength Index value of ``prices``.

    Gains and losses are the positive and negative parts of the one-step
    price difference, each averaged with a simple rolling mean over
    ``period`` rows. An empty series yields the neutral value 50.
    """
    changes = prices.diff()
    upward = changes.where(changes > 0, 0)
    downward = -changes.where(changes < 0, 0)

    avg_gain = upward.rolling(window=period).mean()
    avg_loss = downward.rolling(window=period).mean()

    relative_strength = avg_gain / avg_loss
    rsi_series = 100 - (100 / (1 + relative_strength))

    if rsi_series.empty:
        return 50
    return rsi_series.iloc[-1]

def check_volume_requirement(ticker, start, end):
    """Tell whether ``ticker`` traded enough between ``start`` and ``end``.

    The stock passes when its history for the window has at least 50 rows
    and the mean of the ``'Volume'`` column is at least 5000. Any error
    while fetching or evaluating the data is reported and counts as a fail.
    """
    try:
        history = ticker.history(start=start, end=end)
        has_enough_rows = not history.empty and len(history) >= 50
        if not has_enough_rows:
            return False

        mean_volume = history['Volume'].mean()
        return float(mean_volume) >= 5000

    except Exception as e:
        print(f"Volume check failed for {ticker}: {e}")
        return False

def get_market_cap(ticker):
    """Get the market cap of ``ticker`` in billions of CAD.

    Args:
        ticker: Object exposing an ``info`` mapping (the script passes
            ``yfinance.Ticker`` instances).

    Returns:
        Market cap divided by 1e9. A value reported in USD (also the
        assumed currency when none is given) is converted with
        ``USD_TO_CAD``; any other currency is used as-is. Returns 0 when the
        market cap is missing or the lookup fails; callers treat 0 as
        "unknown".
    """
    try:
        info = ticker.info
        # The key may be absent or explicitly None; both mean "unknown".
        market_cap = info.get('marketCap') or 0

        # Convert to CAD if needed
        if info.get('currency', 'USD') == 'USD':
            market_cap *= USD_TO_CAD

        return market_cap / 1e9  # Return in billions
    except Exception:
        # Best-effort lookup: ordinary failures map to the "unknown" value,
        # while KeyboardInterrupt/SystemExit are allowed to propagate.
        return 0

def get_sector(ticker):
    """Get the sector of ``ticker``.

    Returns the ``'sector'`` entry of ``ticker.info``, or ``'Unknown'`` when
    the entry is absent or the lookup fails.
    """
    try:
        return ticker.info.get('sector', 'Unknown')
    except Exception:
        # Best-effort lookup; KeyboardInterrupt/SystemExit still propagate.
        return 'Unknown'

# ============================================================================
# MAIN PORTFOLIO CONSTRUCTION
# ============================================================================

# Load tickers
# Read the file WITHOUT assuming a header row. With the default header
# handling pandas always consumes the first line as column names, so a
# header-less file would silently lose its first ticker.
tickers_df = pd.read_csv('Tickers_Example.csv', header=None, dtype=str)

# Handle different column name formats: if the first row carries a
# 'Tickers' or 'Ticker' label, use that column and skip the label row.
# Otherwise every row of the first column is treated as a symbol; an
# unrecognised header label would then be kept as a symbol too.
first_row = [str(cell).strip() for cell in tickers_df.iloc[0]]
for header_name in ('Tickers', 'Ticker'):
    if header_name in first_row:
        ticker_symbols = tickers_df.iloc[1:, first_row.index(header_name)]
        break
else:
    ticker_symbols = tickers_df.iloc[:, 0]

# Wrap each non-empty symbol in a yfinance Ticker object
tickers_list = [
    yf.Ticker(symbol.strip())
    for symbol in ticker_symbols.dropna()
    if symbol.strip()
]

print(f"Total tickers loaded: {len(tickers_list)}")

# Keep only the tickers that pass the volume check for the configured window
print("\nFiltering stocks by volume requirement...")
valid_tickers = [
    candidate
    for candidate in tickers_list
    if check_volume_requirement(candidate, VOLUME_CHECK_START, VOLUME_CHECK_END)
]

print(f"Stocks passing volume filter: {len(valid_tickers)}")

# Prepare data for ML model
print("\nPreparing features for ML model...")
stock_features = []

for ticker in valid_tickers:
    print(f"Processing {ticker}...")
    data = get_stock_data(ticker, TRAINING_START, TRAINING_END)

    # Require at least 100 rows of history. This also guarantees that the
    # 20-row target window below exists, so no further length checks are
    # needed.
    if data is None or len(data) < 100:
        continue

    try:
        features = calculate_technical_features(data)
        features['ticker'] = ticker

        # Target: return between the close 20 rows from the end and the
        # close 11 rows from the end (first and last element of
        # iloc[-20:-10]).
        # NOTE(review): the earlier comment described this as a 5-day return
        # from 15 to 10 days ago; the computation itself is unchanged here.
        past_prices = data['Close'].iloc[-20:-10]
        features['target'] = (past_prices.iloc[-1] / past_prices.iloc[0]) - 1

        stock_features.append(features)
    except Exception as e:
        print(f"  Error processing {ticker}: {e}")


# Convert to DataFrame
features_df = pd.DataFrame(stock_features)

# Drop rows with missing values
if len(features_df) > 0:
    features_df = features_df.dropna()

print(f"\nStocks with complete features: {len(features_df)}")

# Check if we have enough data to proceed
if len(features_df) == 0:
    raise ValueError("No stocks have complete features. Try adjusting date ranges or checking data availability.")

# ============================================================================
# MACHINE LEARNING MODEL
# ============================================================================

print("\nTraining ML model...")

# Every column except the identifier and the label is a model input
X = features_df.drop(columns=['ticker', 'target'])
feature_columns = list(X.columns)
y = features_df['target']

# Hold out 20% of the stocks, then fit the scaler on the training part only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the random forest on the scaled training data
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
model.fit(X_train_scaled, y_train)

# Score every stock (training and held-out alike) and rank best-first
X_all_scaled = scaler.transform(X)
predictions = model.predict(X_all_scaled)
features_df = features_df.assign(predicted_return=predictions).sort_values(
    by='predicted_return', ascending=False
)

held_out_r2 = model.score(X_test_scaled, y_test)
print(f"Model trained. R² score: {held_out_r2:.4f}")
print(f"\nTop 10 predicted stocks:")
print(features_df[['ticker', 'predicted_return']].head(10))

# ============================================================================
# PORTFOLIO OPTIMIZATION WITH CONSTRAINTS
# ============================================================================

print("\nBuilding portfolio with constraints...")

# Walk the ranking from best to worst prediction and collect candidates
selected_stocks = []
sector_allocation = {}

for _, ranked in features_df.iterrows():
    ticker = ranked['ticker']

    # Look up market cap and sector for this candidate
    market_cap = get_market_cap(ticker)
    sector = get_sector(ticker)

    print(f"Market Cap: {market_cap} ")

    # A market cap of 0 means the lookup failed, so skip the stock
    if market_cap != 0:
        selected_stocks.append({
            'Ticker': ticker,
            'predicted_return': ranked['predicted_return'],
            'market_cap': market_cap,
            'sector': sector
        })

        # Stop once MAX_STOCKS + 1 candidates have been collected
        if len(selected_stocks) > MAX_STOCKS:
            break

# Warn (but carry on) when fewer stocks than required were found
if len(selected_stocks) < MIN_STOCKS:
    print(f"Warning: Only {len(selected_stocks)} stocks available")

# Report whether the selection spans large and small market caps (in billions)
caps_in_billions = [stock['market_cap'] for stock in selected_stocks]
has_large_cap = any(cap > 10 for cap in caps_in_billions)
has_small_cap = any(cap < 2 for cap in caps_in_billions)

print(f"\nSelected {len(selected_stocks)} stocks")
print(f"Has large-cap (>$10B): {has_large_cap}")
print(f"Has small-cap (<$2B): {has_small_cap}")

# Calculate weights (score-proportional, then adjusted for constraints)
n_stocks = len(selected_stocks)
if n_stocks == 0:
    raise ValueError("No stocks were selected, so portfolio weights cannot be computed.")

# NOTE(review): computed but not enforced anywhere below; the lowest-scoring
# selected stock ends up with a weight of exactly 0.
min_weight_per_stock = 1.0 / (2 * n_stocks)


def _cap_stock_weights(weights, cap, tol=1e-12):
    """Rescale weights to sum to 1 with no entry above cap, when feasible."""
    result = weights / weights.sum()
    capped = pd.Series(False, index=result.index)
    for _ in range(len(result)):
        over = ~capped & (result > cap + tol)
        if not over.any():
            break
        # Pin the offenders at the cap and hand the freed weight to the
        # remaining stocks in proportion to their current weights.
        capped |= over
        result[capped] = cap
        free_total = result[~capped].sum()
        remaining = 1.0 - cap * capped.sum()
        if free_total <= 0 or remaining <= 0:
            # Cap cannot be met with the stocks available; just renormalise.
            return result / result.sum()
        result[~capped] *= remaining / free_total
    return result


def _cap_sector_weights(weights, sectors, cap, tol=1e-12):
    """Scale over-weight sectors down to cap and give the excess to the rest."""
    sector_totals = weights.groupby(sectors).sum()
    over_cap = sector_totals[sector_totals > cap + tol].index
    if len(over_cap) == 0:
        return weights
    result = weights.copy()
    for sector in over_cap:
        result[sectors == sector] *= cap / sector_totals[sector]
    in_over_cap = sectors.isin(over_cap)
    free_total = weights[~in_over_cap].sum()
    remaining = 1.0 - cap * len(over_cap)
    if free_total <= 0 or remaining <= 0:
        # No other sector can absorb the excess; just renormalise.
        return result / result.sum()
    result[~in_over_cap] *= remaining / free_total
    return result


def _weights_within_limits(weights, sectors, tol=1e-9):
    """Tell whether both the per-stock and the per-sector caps hold."""
    stock_ok = weights.max() <= MAX_WEIGHT + tol
    sector_ok = weights.groupby(sectors).sum().max() <= MAX_SECTOR_WEIGHT + tol
    return stock_ok and sector_ok


# Score-based starting weights: z-score the predictions, shift so the
# minimum is 0, then scale so the weights sum to 1.
portfolio_df = pd.DataFrame(selected_stocks)
mean_of_predicted_return_of_selected_stocks = portfolio_df['predicted_return'].mean()
std_of_predicted_return_of_selected_stocks = portfolio_df['predicted_return'].std()
if std_of_predicted_return_of_selected_stocks > 0:
    portfolio_df['normalized_predicted_return'] = (portfolio_df['predicted_return']-mean_of_predicted_return_of_selected_stocks)/std_of_predicted_return_of_selected_stocks
else:
    # A single stock (std is NaN) or identical predictions (std is 0)
    # carries no ranking information; avoid dividing by it.
    portfolio_df['normalized_predicted_return'] = 0.0
portfolio_df['shifted_normalized_predict_return'] = portfolio_df['normalized_predicted_return']-portfolio_df['normalized_predicted_return'].min()

sum_shifted_normalized = portfolio_df['shifted_normalized_predict_return'].sum()
avg_shifted_normalized = sum_shifted_normalized/n_stocks
if sum_shifted_normalized > 0:
    portfolio_df['Weight'] = portfolio_df['shifted_normalized_predict_return']/avg_shifted_normalized*1.0/n_stocks
else:
    # No score differences to work with: fall back to equal weighting.
    portfolio_df['Weight'] = 1.0 / n_stocks


print(f"Portfolio_df: {portfolio_df}")

# Enforce the per-stock and per-sector caps. Fixing one constraint can
# disturb the other, and a plain clip-then-renormalise pushes capped weights
# back above the cap, so alternate the two exact adjustments until both hold.
adjusted_weights = portfolio_df['Weight'].astype(float)
for _ in range(50):
    adjusted_weights = _cap_stock_weights(adjusted_weights, MAX_WEIGHT)
    adjusted_weights = _cap_sector_weights(adjusted_weights, portfolio_df['sector'], MAX_SECTOR_WEIGHT)
    if _weights_within_limits(adjusted_weights, portfolio_df['sector']):
        break
else:
    print("Warning: weight constraints could not be fully satisfied for this selection")
portfolio_df['Weight'] = adjusted_weights

print(f"\nSector allocation:")
print(portfolio_df.groupby('sector')['Weight'].sum().sort_values(ascending=False))
# ============================================================================
# CALCULATE SHARES AND FINAL PORTFOLIO
# ============================================================================

print("\nCalculating final portfolio...")

# Get current prices
# NOTE: 'Price_USD' holds the latest close in the stock's own trading
# currency (see the 'Currency' column), despite the column name.
portfolio_df['Price_USD'] = 0.0
portfolio_df['Currency'] = ''

# Use one fixed 7-day look-back window for every stock
price_window_end = datetime.now()
price_window_start = price_window_end - timedelta(days=7)
price_start_str = price_window_start.strftime('%Y-%m-%d')
price_end_str = price_window_end.strftime('%Y-%m-%d')

for idx, row in portfolio_df.iterrows():
    ticker = row['Ticker']
    current_data = get_stock_data(ticker, price_start_str, price_end_str)

    if current_data is None or current_data.empty:
        # Make the failure visible instead of silently keeping a price of 0
        print(f"Warning: no recent price data for {ticker}; price left at 0")
        continue

    portfolio_df.at[idx, 'Price_USD'] = current_data['Close'].iloc[-1]

    # Determine currency (assumed USD when the info carries none)
    stock_info = ticker.info
    currency = stock_info.get('currency', 'USD')
    portfolio_df.at[idx, 'Currency'] = currency

# Calculate shares and fees
total_fees = 0
portfolio_df['Shares'] = 0.0
portfolio_df['Fee_CAD'] = 0.0

for idx, row in portfolio_df.iterrows():
    weight = row['Weight']
    price_purchase = row['Price_USD']  # latest close in the purchase currency
    currency = row['Currency']

    # Without a positive price no share count can be derived; leave the
    # position at 0 shares / 0 fee instead of dividing by zero.
    if not price_purchase > 0:
        print(f"Warning: no valid price for {row['Ticker']}; no shares allocated")
        continue

    # Amount of the CAD capital earmarked for this stock
    allocation_cad = INITIAL_CAPITAL * weight

    # Convert to purchase currency (anything other than CAD is treated as USD)
    is_cad = currency == 'CAD'
    allocation_purchase = allocation_cad if is_cad else allocation_cad / USD_TO_CAD

    # Calculate shares (initial estimate, before fees)
    shares = allocation_purchase / price_purchase

    # Fee comes back in USD and is based on the pre-fee share estimate
    fee_usd = calculate_fee(shares, price_purchase)
    fee_cad = fee_usd * USD_TO_CAD

    # Subtract the fee in the SAME currency as the allocation: the CAD
    # amount for CAD stocks, the USD amount otherwise.
    fee_purchase = fee_cad if is_cad else fee_usd
    allocation_after_fee = max(allocation_purchase - fee_purchase, 0.0)
    shares_final = allocation_after_fee / price_purchase

    portfolio_df.at[idx, 'Shares'] = shares_final
    portfolio_df.at[idx, 'Fee_CAD'] = fee_cad
    total_fees += fee_cad

# Calculate final values
# Positions not marked 'CAD' are treated as USD and converted; CAD positions
# use a factor of exactly 1.0, which leaves their numbers unchanged.
fx_to_cad = np.where(portfolio_df['Currency'] == 'CAD', 1.0, USD_TO_CAD)

portfolio_df['Value_CAD'] = portfolio_df['Shares'] * portfolio_df['Price_USD'] * fx_to_cad
portfolio_df['Price'] = portfolio_df['Price_USD'] * fx_to_cad

# Re-express each weight as the position's share of the realised total
total_portfolio_value = portfolio_df['Value_CAD'].sum()
portfolio_df['Weight'] = portfolio_df['Value_CAD'] / total_portfolio_value

# ============================================================================
# OUTPUT FINAL PORTFOLIO
# ============================================================================

banner = "=" * 80
print("\n" + banner)
print("FINAL PORTFOLIO")
print(banner)

# Pick the report columns, number the rows from 1 and show weights in percent
final_portfolio = portfolio_df[
    ['Ticker', 'Price', 'Currency', 'Shares', 'Value_CAD', 'Weight']
].rename(columns={'Value_CAD': 'Value'})
final_portfolio.index = range(1, len(final_portfolio) + 1)
final_portfolio['Weight'] = final_portfolio['Weight'] * 100
final_portfolio = final_portfolio.round(
    {'Weight': 2, 'Value': 2, 'Shares': 4, 'Price': 2}
)

print(final_portfolio)
print("\n" + "-" * 80)
print(f"Total Portfolio Value: ${total_portfolio_value:,.2f} CAD")
print(f"Total Fees Paid: ${total_fees:,.2f} CAD")

Total tickers loaded: 40

Filtering stocks by volume requirement...


$AGN: possibly delisted; no timezone found
$CELG: possibly delisted; no timezone found
$MON: possibly delisted; no timezone found
$RTN: possibly delisted; no timezone found


Stocks passing volume filter: 36

Preparing features for ML model...
Processing yfinance.Ticker object <ABBV>...
Processing yfinance.Ticker object <ABT>...
Processing yfinance.Ticker object <ACN>...
Processing yfinance.Ticker object <AIG>...
Processing yfinance.Ticker object <AMZN>...
Processing yfinance.Ticker object <AXP>...
Processing yfinance.Ticker object <BA>...
Processing yfinance.Ticker object <BAC>...
Processing yfinance.Ticker object <BB.TO>...
Processing yfinance.Ticker object <BIIB>...
Processing yfinance.Ticker object <BK>...
Processing yfinance.Ticker object <BLK>...
Processing yfinance.Ticker object <BMY>...
Processing yfinance.Ticker object <C>...
Processing yfinance.Ticker object <CAT>...
Processing yfinance.Ticker object <CL>...
Processing yfinance.Ticker object <KO>...
Processing yfinance.Ticker object <LLY>...
Processing yfinance.Ticker object <LMT>...
Processing yfinance.Ticker object <MO>...
Processing yfinance.Ticker object <MRK>...
Processing yfinance.Ticker obj