In [None]:
# Papermill injects parameters here

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from pprint import pprint
from IPython.display import display, Markdown

# --- 1. PANDAS & IPYTHON OPTIONS ---
# Display settings, written through the attribute-style ``pd.options`` interface.
pd.options.display.max_rows = 200                    # show up to 200 rows before truncating
pd.options.display.max_columns = None                # None: no cap on displayed columns
pd.options.display.width = 15000                     # 3000 * 5 characters per output line
pd.options.display.float_format = '{:.6f}'.format    # render floats with six decimals
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported modules (e.g. the project's own `utils`) are picked up the next time
# a cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# --- 2. PROJECT PATH CONFIGURATION ---
# Every location below is derived from the current working directory, which is
# taken to be the folder that contains this notebook.
NOTEBOOK_DIR = Path.cwd()
PARENT_DIR = NOTEBOOK_DIR.parent
ROOT_DIR = PARENT_DIR.parent  # two levels above the notebook; adjust if the layout differs
DATA_DIR = ROOT_DIR.joinpath('data')
SRC_DIR = ROOT_DIR.joinpath('src')

# Make the project's 'src' folder importable, registering it only once
if sys.path.count(str(SRC_DIR)) == 0:
    sys.path.append(str(SRC_DIR))

# --- 3. IMPORT CUSTOM MODULES ---
import utils
import plotting_utils

# --- 4. INITIAL_CAPITAL ---
# Starting capital. NOTE(review): presumably in account-currency units — confirm.
INITIAL_CAPITAL = 100_000

# --- 5. RISK FREE ANNUAL RATE ---
# NOTE(review): presumably a fraction per year (0.04 = 4 %) — confirm where it is consumed.
RISK_FREE_ANNUAL_RATE = 0.04

# --- 6. VERIFICATION ---
# Report each configured directory with its real on-disk status (instead of an
# unconditional check mark), then fail with the names of the missing ones.
print("--- Path Configuration ---")
_path_report = [
    # (label, path, required for the notebook to run)
    ("Project Root:", ROOT_DIR, True),
    ("Parent Dir:", PARENT_DIR, False),
    ("Notebook Dir:", NOTEBOOK_DIR, False),
    ("Data Dir:", DATA_DIR, True),
    ("Source Dir:", SRC_DIR, True),
]
_missing_dirs = []
for _label, _path, _required in _path_report:
    _found = _path.exists()
    # Labels are padded to 13 characters so the paths line up in one column.
    print(f"{'✅' if _found else '❌'} {_label:<13} {_path}")
    if _required and not _found:
        _missing_dirs.append(str(_path))

assert not _missing_dirs, f"A key directory was not found! Missing: {_missing_dirs}"

print("\n--- Module Verification ---")
print("✅ Successfully imported 'utils' and 'plotting_utils'.")

In [None]:
# Use the papermill injected parameters to read the files
# `returns_train_path` and `returns_test_path` are not assigned anywhere in the
# visible notebook; they are expected to come from the papermill parameters
# cell, so running this cell without injection raises NameError.
# NOTE(review): later cells treat columns as tickers and rows as return
# observations (daily, per the comments) — confirm against the producer of
# these files.
returns_train = pd.read_parquet(returns_train_path)
returns_test = pd.read_parquet(returns_test_path)

In [None]:
# import pandas as pd

# # Read the Parquet file into a DataFrame
# returns_train = pd.read_parquet('returns_train.parquet', engine='pyarrow')
# returns_test = pd.read_parquet('returns_test.parquet', engine='pyarrow')

# parameters injected by papermill
# returns_train = pd.read_csv(returns_train_path, index_col=0, parse_dates=True)
# returns_test = pd.read_csv(returns_test_path, index_col=0, parse_dates=True)

# returns_train = pd.read_parquet(NOTEBOOK_DIR / '_returns_train')
# returns_test = pd.read_parquet(NOTEBOOK_DIR / '_returns_test')

# returns_train = pd.read_parquet(returns_train_path)
# returns_test = pd.read_parquet(returns_test_path)     

In [None]:
# Locate the merged Finviz stock/ETF parquet snapshot(s) in DATA_DIR.
file_list = utils.get_recent_files(
    directory_path=DATA_DIR,
    extension='parquet',
    prefix=None,
    contains_pattern='df_finviz_merged_stocks_etfs',
    count=None,
)

# Fail with an actionable message instead of an opaque IndexError/TypeError
# from `file_list[0]` when no file matches.
if file_list is None or len(file_list) == 0:
    raise FileNotFoundError(
        f"No 'df_finviz_merged_stocks_etfs' parquet file found in {DATA_DIR}"
    )

# NOTE(review): assumes get_recent_files returns file names ordered newest
# first, so index 0 is the latest snapshot — confirm in utils.
df_finviz = pd.read_parquet(DATA_DIR / file_list[0])

In [None]:
# Dump the Finviz frame as plain text under a one-line heading
print('df_finviz:', df_finviz, sep='\n')

In [None]:
# returns_train = train_chunks[i].iloc[:n_test_rows]
# returns_test = train_chunks[i].iloc[n_test_rows:]

In [None]:
# Report the (rows, columns) size of both return frames
print(
    f'returns_train.shape: {returns_train.shape}',
    f'returns_test.shape: {returns_test.shape}',
    sep='\n',
)

In [None]:
import pandas as pd

# Assuming returns_train is your DataFrame with daily returns

# Trailing windows of the training returns: the final 30, 60, 120 and 240 rows
last_30 = returns_train.tail(30)
last_60 = returns_train.tail(60)
last_120 = returns_train.tail(120)
last_240 = returns_train.tail(240)

# Function to calculate Sharpe ratio
def calculate_sharpe_ratio(returns, risk_free_rate=0.0):
    """Return the per-period Sharpe ratio of ``returns``.

    The ratio is ``(mean - risk_free_rate) / std`` over the rows passed in.
    It is not annualised.

    Parameters
    ----------
    returns : pandas.DataFrame or pandas.Series
        Periodic returns. A DataFrame is reduced column by column.
    risk_free_rate : float, optional
        Risk-free return per period, in the same units as ``returns``.
        The default of 0.0 reproduces the plain ``mean / std`` ratio.

    Returns
    -------
    pandas.Series or float
        One ratio per column for a DataFrame, a scalar for a Series.
        NaN where the standard deviation is zero, because the ratio is
        undefined there.
    """
    excess_mean = returns.mean() - risk_free_rate
    volatility = returns.std()

    # A zero standard deviation would yield +/-inf, which then wins or loses
    # every ">" comparison against a benchmark. Report it as missing instead.
    if isinstance(volatility, pd.Series):
        volatility = volatility.mask(volatility == 0)
    elif volatility == 0:
        volatility = float('nan')

    return excess_mean / volatility

# Sharpe ratio of every ticker over each trailing window
sharpe_ratio_30, sharpe_ratio_60, sharpe_ratio_120, sharpe_ratio_240 = (
    calculate_sharpe_ratio(window)
    for window in (last_30, last_60, last_120, last_240)
)

# Show only the first few tickers of each window for brevity
print(
    "Sharpe Ratios for the last 30 rows:",
    sharpe_ratio_30.head(),
    "Sharpe Ratios for the last 60 rows:",
    sharpe_ratio_60.head(),
    "\nSharpe Ratios for the last 120 rows:",
    sharpe_ratio_120.head(),
    "\nSharpe Ratios for the last 240 rows:",
    sharpe_ratio_240.head(),
    sep="\n",
)

In [None]:
# Benchmark ('VGT') Sharpe ratio in each look-back window
vgt_sharpe_30, vgt_sharpe_60, vgt_sharpe_120, vgt_sharpe_240 = (
    ratios['VGT']
    for ratios in (sharpe_ratio_30, sharpe_ratio_60, sharpe_ratio_120, sharpe_ratio_240)
)

# A ticker qualifies only if it beats the benchmark in all four windows
beats_benchmark_in_all_windows = sharpe_ratio_30 > vgt_sharpe_30
beats_benchmark_in_all_windows = beats_benchmark_in_all_windows & (sharpe_ratio_60 > vgt_sharpe_60)
beats_benchmark_in_all_windows = beats_benchmark_in_all_windows & (sharpe_ratio_120 > vgt_sharpe_120)
beats_benchmark_in_all_windows = beats_benchmark_in_all_windows & (sharpe_ratio_240 > vgt_sharpe_240)
tickers_with_higher_sharpe = sharpe_ratio_30[beats_benchmark_in_all_windows].index

# Share of the whole universe (all columns of returns_train) that qualifies
n_higher_sharpe_tickers = len(tickers_with_higher_sharpe)
total_ticker_columns = returns_train.shape[1]
pct_higher_sharpe_tickers = n_higher_sharpe_tickers / total_ticker_columns * 100

print(f"Percentage of tickers with Sharpe Ratio > Benchmark's Sharpe Ratio: {pct_higher_sharpe_tickers:.2f}%")

In [None]:
# Per-window flags: does each ticker beat the benchmark's Sharpe ratio?
boolean_array_30, boolean_array_60, boolean_array_120, boolean_array_240 = (
    ratios > benchmark
    for ratios, benchmark in zip(
        (sharpe_ratio_30, sharpe_ratio_60, sharpe_ratio_120, sharpe_ratio_240),
        (vgt_sharpe_30, vgt_sharpe_60, vgt_sharpe_120, vgt_sharpe_240),
    )
)

# Number of tickers ahead of the benchmark in each window
count_true_30, count_true_60, count_true_120, count_true_240 = (
    np.sum(flags)
    for flags in (boolean_array_30, boolean_array_60, boolean_array_120, boolean_array_240)
)

print(
    f'count sharpe_ratio_30 > vgt_sharpe_30: {count_true_30}',
    f'count sharpe_ratio_60 > vgt_sharpe_60: {count_true_60}',
    f'count sharpe_ratio_120 > vgt_sharpe_120: {count_true_120}',
    f'count sharpe_ratio_240 > vgt_sharpe_240: {count_true_240}',
    sep='\n',
)

# Tickers that are ahead in every window
print(f"\n{n_higher_sharpe_tickers} tickers, or {pct_higher_sharpe_tickers:.2f}%, have higher Sharpe Ratio than Benchmark for all four periods:")
print(tickers_with_higher_sharpe)

In [None]:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, to_tree
from scipy.spatial.distance import pdist

# Number of positions in the final portfolio. With fewer candidate tickers
# than this, the notebook stays in cash.
N_PORTFOLIO_TICKERS = 10


def get_leaf_order(tree):
    """Return the ids of the leaves under ``tree`` from left to right.

    The walk uses an explicit stack rather than recursion: single-linkage
    trees can degenerate into long chains, and a recursive walk over such a
    tree can exceed Python's recursion limit.

    Parameters
    ----------
    tree : node object
        Root node exposing ``is_leaf()``, ``id``, ``left`` and ``right``
        (here: the object returned by ``to_tree``).

    Returns
    -------
    list
        Leaf ids in left-to-right order.
    """
    ordered_ids = []
    pending = [tree]
    while pending:
        node = pending.pop()
        if node.is_leaf():
            ordered_ids.append(node.id)
        else:
            # Push the right child first so the left subtree is visited first.
            pending.append(node.right)
            pending.append(node.left)
    return ordered_ids


def allocate_risk(cov_matrix, leaf_order):
    """Weight each asset by its share of the total variance.

    Parameters
    ----------
    cov_matrix : pandas.DataFrame or 2-D array
        Square covariance matrix; its diagonal holds the asset variances.
    leaf_order : sequence of int
        Positional asset ids (rows/columns of ``cov_matrix``) to weight.

    Returns
    -------
    numpy.ndarray
        ``weights[k] = var[k] / sum(var)`` for every ``k`` in ``leaf_order``;
        positions not listed stay at 0.

    Notes
    -----
    The variance is looked up with the same id that indexes ``weights``.
    Pairing the id with its position inside ``leaf_order`` would hand each
    asset the variance of a different asset.
    NOTE(review): weights grow with variance, which is the opposite of the
    inverse-variance weighting usually associated with risk parity / HRP, and
    the leaf order does not influence the result — confirm this is intended.
    """
    variances = np.diag(cov_matrix)
    total_var = np.sum(variances)
    weights = np.zeros(len(cov_matrix))
    for asset_pos in leaf_order:
        weights[asset_pos] = variances[asset_pos] / total_var
    return weights


# Filter returns_train to only include tickers in tickers_with_higher_sharpe
returns_train_filtered = returns_train[tickers_with_higher_sharpe]

if len(tickers_with_higher_sharpe) < N_PORTFOLIO_TICKERS:
    # Decide on the cash fallback BEFORE clustering: pdist/linkage raise when
    # there are fewer than two candidates, so checking afterwards would crash
    # in exactly the situation the fallback exists for.
    print(f'===== There are only {len(tickers_with_higher_sharpe)} selected_tickers. It is less than {N_PORTFOLIO_TICKERS}. Stay in CASH =====')
    print(f'===== They are {tickers_with_higher_sharpe.tolist()} =====\n')
    selected_tickers = ['CASH']
    selected_weights = [1.0]
else:
    # Calculate the covariance matrix
    cov_matrix = returns_train_filtered.cov()

    # Condensed distance matrix: correlation distance between the rows of the
    # covariance matrix
    dist_matrix = pdist(cov_matrix, metric='correlation')

    # Perform hierarchical clustering and convert the result to a tree
    linkage_matrix = linkage(dist_matrix, method='single')
    tree = to_tree(linkage_matrix)

    # Get the order of leaves
    leaf_order = get_leaf_order(tree)

    # Allocate risk proportionally
    weights = allocate_risk(cov_matrix, leaf_order)

    # Positions of the largest weights, biggest first (computed once and
    # reused for both the tickers and their weights)
    top_positions = np.argsort(weights)[-N_PORTFOLIO_TICKERS:][::-1]
    selected_tickers = returns_train_filtered.columns[top_positions]

    # Normalize the selected weights to ensure they sum to 1
    selected_weights = weights[top_positions]
    selected_weights = selected_weights / np.sum(selected_weights)

# Print the selected tickers and their normalized weights
print("Selected tickers for the diversified portfolio using HRP:")
print(selected_tickers)
print("Normalized portfolio weights for the selected tickers:")
print(selected_weights)

# Verify that the weights sum to 1
print("Sum of weights:", np.sum(selected_weights))

In [None]:
# selected_tickers = ['CASH']
# selected_weights = [1.0]   

In [None]:
import numpy as np
import pandas as pd

# NOTE(review): returns_test is expected to hold daily returns with one column
# per ticker and a date-like index — confirm against the file producer.

# Out-of-sample returns of the chosen tickers and of the 'VGT' benchmark
selected_returns = returns_test[selected_tickers]
vgt_returns = returns_test['VGT']

# Weighted sum of the selected tickers' returns for each row
portfolio_value = selected_returns.dot(selected_weights)

# Per-row returns side by side, followed by their compounded growth
daily_values = pd.DataFrame({'Portfolio': portfolio_value, 'VGT': vgt_returns})
daily_values['Portfolio_Cumulative'] = daily_values['Portfolio'].add(1).cumprod()
daily_values['VGT_Cumulative'] = daily_values['VGT'].add(1).cumprod()

print(
    "Daily values of the portfolio and 'VGT':",
    daily_values[['Portfolio_Cumulative', 'VGT_Cumulative']],
    sep='\n',
)

# Plot the cumulative returns for comparison
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
for column, legend_label in (('Portfolio_Cumulative', 'Portfolio'), ('VGT_Cumulative', 'VGT')):
    plt.plot(daily_values.index, daily_values[column], label=legend_label)
plt.title('Daily Cumulative Returns: Portfolio vs VGT')
plt.xlabel('Date')
plt.ylabel('Cumulative Returns')
plt.legend()
plt.show()

In [None]:
import pandas as pd
import os

# Portfolio and Benchmark daily Value to save to csv
portfolio_daily_value = daily_values[['Portfolio_Cumulative', 'VGT_Cumulative']]

# Define the path for the temporary directory and make sure it exists;
# to_csv does not create missing parent folders.
temp_data_dir = NOTEBOOK_DIR / "temp"
temp_data_dir.mkdir(parents=True, exist_ok=True)

# Define the CSV file name
csv_file_name = temp_data_dir / 'portfolio daily value.csv'

# The index is always written under the header 'Date' and read back by
# position, so the round trip works whatever the in-memory index is named.
if csv_file_name.exists():
    # Read the existing CSV file into a DataFrame
    existing_data = pd.read_csv(csv_file_name, index_col=0, parse_dates=True)

    # Ensure the existing DataFrame has the same columns as portfolio_daily_value
    existing_data = existing_data[portfolio_daily_value.columns]

    # Append the new data to the existing DataFrame
    # NOTE(review): re-running this cell for the same period appends the same
    # dates again — confirm whether duplicates should be dropped.
    updated_data = pd.concat([existing_data, portfolio_daily_value], ignore_index=False)

    # Save the updated DataFrame back to the CSV file
    updated_data.to_csv(csv_file_name, index=True, index_label='Date')
    print(f"Data appended to {csv_file_name}")
else:
    # Save the new DataFrame to the CSV file, including the date index
    portfolio_daily_value.to_csv(csv_file_name, index=True, index_label='Date')
    print(f"New file created and data saved to {csv_file_name}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Counts of benchmark-beating tickers, ordered from the longest look-back
# window to the shortest so the chart reads from the past towards the present
counts = [count_true_240, count_true_120, count_true_60, count_true_30]

# Matching x-axis labels (window length in days, as text)
labels = [str(days) for days in (240, 120, 60, 30)]

# Line chart with a round marker at each window
line_style = dict(marker='o', linestyle='-', color='blue')
plt.plot(labels, counts, **line_style)

# Titles and axis labels
plt.title('Tickers with Sharpe Ratio > Benchmark for Past 240, 120, 60, 30 Days')
plt.xlabel('Past Period (days)')
plt.ylabel('Count of Tickers')

plt.show()

In [None]:
import pandas as pd

if selected_tickers[0] == 'CASH':
    print('All CASH portfolio')
else:
    # Descriptive Finviz columns to show for each selected ticker
    cols = ['Company', 'Index', 'Sector', 'Industry', 'Info']

    # Split the selection by whether Finviz has a row for the ticker
    finviz_tickers = df_finviz.index
    valid_tickers = [symbol for symbol in selected_tickers if symbol in finviz_tickers]
    missing_tickers = [symbol for symbol in selected_tickers if symbol not in finviz_tickers]

    if missing_tickers:
        print(f"The following tickers are not in the df_finviz's index and will be excluded: {missing_tickers}\n")

    if valid_tickers:
        # Rows for the known tickers, restricted to the descriptive columns
        df_selected = df_finviz.loc[valid_tickers, cols]

        print("DataFrame with selected tickers and specified columns:")
        print(df_selected)
    else:
        print("None of the selected tickers are in the DataFrame's index.")

In [None]:
# Repeat the per-window counts next to the final selection for reference
print(
    f'count sharpe_ratio_30 > vgt_sharpe_30: {count_true_30}',
    f'count sharpe_ratio_60 > vgt_sharpe_60: {count_true_60}',
    f'count sharpe_ratio_120 > vgt_sharpe_120: {count_true_120}',
    f'count sharpe_ratio_240 > vgt_sharpe_240: {count_true_240}',
    sep='\n',
)

# Tickers that beat the benchmark in every window
print(
    f"\n{n_higher_sharpe_tickers} tickers, or {pct_higher_sharpe_tickers:.2f}%, have higher Sharpe Ratio than Benchmark for all four periods:",
    tickers_with_higher_sharpe,
    sep='\n',
)

In [None]:
# Show the training returns of either the cash column (all-cash portfolio) or
# the selected tickers that had no Finviz row
if selected_tickers[0] == 'CASH':
    print(f'returns["CASH"]:\n{returns_train["CASH"]}')
else:
    print(f'returns[missing_tickers]:\n{returns_train[missing_tickers]}')