#### Create "ATR/Price %" in df_finviz
#### Calculate "Perf 3D %" in df_Perf_3D_pct
#### Merge columns of df_finviz and df_Perf_3D_pct

In [None]:
# process_files.py
from config import DATE_STR, DOWNLOAD_DIR, DEST_DIR
from pathlib import Path  # Better path handling

# Echo the run configuration imported from config.
print(f"DATE_STR: {DATE_STR}")
print(f"DOWNLOAD_DIR: {DOWNLOAD_DIR}")
print(f"DEST_DIR: {DEST_DIR}\n")

# Every file this notebook reads or writes lives directly under DEST_DIR.
dest_dir = Path(DEST_DIR)

# The OHLCV file name carries no date prefix; the other three are stamped with DATE_STR.
ohlcv_path = dest_dir / 'df_OHLCV_clean_stocks_etfs.parquet'
source_path = dest_dir / f'{DATE_STR}_df_finviz_n_ratios_stocks_etfs.parquet'
dest_path = dest_dir / f'{DATE_STR}_df_finviz_merged_stocks_etfs.parquet'
tickers_path = dest_dir / f'{DATE_STR}_df_common_tickers_stocks_etfs.parquet'

# Print each resolved path so a wrong DATE_STR or DEST_DIR is visible up front.
for label, path in (
    ("ohlcv_path", ohlcv_path),
    ("source_path", source_path),
    ("dest_path", dest_path),
    ("tickers_path", tickers_path),
):
    print(f"{label}: {path}")

In [None]:
import pandas as pd

# Widen pandas' console output so wide frames print without truncation.
pd.set_option('display.max_columns', None)  # None = no limit: show every column
pd.set_option('display.max_rows', None)     # None = no limit: show every row
pd.set_option('display.width', 2000)        # Allow up to 2000 characters per output line

In [None]:
# Read the finviz DataFrame stored at source_path.
df_finviz = pd.read_parquet(source_path)

# Quick sanity check: dimensions, then a preview of the leading rows.
print(f"df_finviz.shape: {df_finviz.shape}")
print("\nFirst few rows:", df_finviz.head(), sep="\n")

In [None]:
# ATR expressed as a percentage of Price, stored as a new column.
atr_pct = df_finviz['ATR'].div(df_finviz['Price']).mul(100)
df_finviz['ATR/Price %'] = atr_pct

# Preview the two inputs next to the derived column.
print("\nFirst few rows of ATR/Price column:")
preview_cols = ['ATR', 'Price', 'ATR/Price %']
display(df_finviz[preview_cols].head())

In [None]:
# The index of the common-tickers frame supplies the ticker list used later
# to select columns of the adjusted-close table.
df_common_tickers = pd.read_parquet(tickers_path)
tickers = df_common_tickers.index.tolist()

print(f'len(tickers): {len(tickers)}')
print(f'tickers[0:5]: {tickers[:5]}')

In [None]:
import pandas as pd
import numpy as np  # Import numpy

# 1. Load Data
df_OHLCV = pd.read_parquet(ohlcv_path)
print("df_OHLCV Info:")
# DataFrame.info() prints its own report and returns None, so it is called
# bare; wrapping it in display()/print() would emit a stray "None".
df_OHLCV.info()


# 2. Prepare Adjusted Close Data
# Pivot index level 0 into columns (one 'Adj Close' column per level-0 label),
# flatten the resulting two-level column index, and sort the rows ascending.
df_adj_close = df_OHLCV[['Adj Close']].unstack(level=0)
df_adj_close.columns = df_adj_close.columns.get_level_values(1)
df_adj_close = df_adj_close.sort_index(axis=0)

print(f'Before filter, len(df_adj_close.columns): {len(df_adj_close.columns)}\n')
# Keep only the common tickers; a ticker absent from the OHLCV data raises KeyError here.
df_adj_close = df_adj_close[tickers]
print(f'After filter, len(df_adj_close.columns): {len(df_adj_close.columns)}\n')
print("df_adj_close Info:")
df_adj_close.info()
print("df_adj_close Tail:")
display(df_adj_close.tail())


# 4. Calculate Returns
# Percent change versus the value 3 rows earlier; only the most recent row is kept.
df_returns = df_adj_close.pct_change(periods=3) * 100
df_Perf_3D_pct = df_returns.tail(1)


# 5. Display Returns
pd.options.display.float_format = '{:.5f}'.format
print("df_Perf_3D_pct:")
display(df_Perf_3D_pct)
print("df_Perf_3D_pct Info:")
df_Perf_3D_pct.info()  # Called once; info() already prints the report itself
print("df_Perf_3D_pct Head:")
print(df_Perf_3D_pct.head(2))


# 6. Define Merge Function
def merge_dataframes(df, df_Perf_3D_pct):
    """
    Left-join the values of df_Perf_3D_pct onto df, aligned by ticker.

    df_Perf_3D_pct is transposed so that its column labels (tickers) become
    the index, then merged onto df's index. Rows of df with no matching
    ticker receive NaN. The last column of the merged result is renamed to
    "Perf 3D %" when the merge added at least one column.

    Neither input is modified: index names are set on new objects, not on
    the caller's DataFrames.

    Args:
        df: DataFrame whose index holds tickers.
        df_Perf_3D_pct: DataFrame whose columns are tickers (normally one row).

    Returns:
        A new DataFrame containing df's columns followed by the merged
        column(s), with the index named 'Ticker' if df's index was unnamed.
    """
    # rename_axis returns a new object, so the caller's frame (whose column
    # labels may back the transposed index) keeps its own axis names.
    perf_by_ticker = df_Perf_3D_pct.T.rename_axis(index='Ticker')

    # Give an unnamed ticker index a name without touching the caller's df.
    if df.index.name is None:
        df = df.rename_axis(index='Ticker')

    # how='left' keeps every row of df, in df's order.
    df_merged = df.merge(perf_by_ticker, left_index=True, right_index=True, how='left')

    # Rename only if the merge actually appended something; the appended
    # column is last and is labelled with the row label of df_Perf_3D_pct.
    if df_merged.shape[1] > df.shape[1]:
        last_col = df_merged.columns[-1]
        df_merged = df_merged.rename(columns={last_col: 'Perf 3D %'})

    return df_merged


# 7. Merge Dataframes
# Switch float rendering to two decimals for the preview printed below.
pd.set_option('display.float_format', '{:.2f}'.format)
df_merged = merge_dataframes(df_finviz, df_Perf_3D_pct)
print("df_merged Head:", df_merged.head(), sep="\n")

In [None]:
import pandas as pd

def find_duplicate_columns(df):
    """
    Return the column labels of a DataFrame that repeat an earlier column.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A list with one entry per repeated occurrence, in column order; the
        first occurrence of each label is not included. The list is empty
        when every column label is unique.
    """
    # duplicated() flags every occurrence of a label except the first one.
    repeat_mask = df.columns.duplicated()
    return df.columns[repeat_mask].tolist()

In [None]:
duplicate_cols = find_duplicate_columns(df_merged)

# Stop the run when any column label repeats; otherwise just confirm.
if not duplicate_cols:
    print("No duplicate columns found.")
else:
    print("Duplicate columns found:")
    print("\n".join(f"- {col}" for col in duplicate_cols))
    raise ValueError("Duplicate columns found") # Raise the error

In [None]:
# Show the merged frame's column labels and how many there are.
merged_columns = df_merged.columns.to_list()
print(merged_columns)
print(f"len(df_merged.columns): {len(merged_columns)}")

In [None]:
# Target column order for df_merged, applied by the later
# df_merged.reindex(columns=new_column_order) call.
# NOTE(review): reindex keeps exactly the labels listed here -- columns of
# df_merged that are not listed would be dropped, and listed labels that are
# absent from df_merged would come back as all-NaN columns. Keep this list in
# sync with the columns actually produced upstream.
new_column_order = ['No.', 'Company', 'Index', 'Sector', 'Industry', 'Country', 'Exchange',
                    'Info', 'MktCap AUM, M', 'Rank',
                    'Market Cap, M', 'P/E', 'Fwd P/E', 'PEG', 'P/S', 'P/B', 'P/C', 'P/FCF', 
                    'Book/sh', 'Cash/sh', 'Dividend %', 'Dividend TTM', 'Dividend Ex Date', 
                    'Payout Ratio %', 'EPS', 'EPS next Q', 'EPS this Y %', 'EPS next Y %', 
                    'EPS past 5Y %', 'EPS next 5Y %', 'Sales past 5Y %', 'Sales Q/Q %', 
                    'EPS Q/Q %', 'EPS YoY TTM %', 'Sales YoY TTM %', 'Sales, M', 'Income, M', 
                    'EPS Surprise %', 'Revenue Surprise %', 'Outstanding, M', 
                    'Float, M', 'Float %', 'Insider Own %', 'Insider Trans %', 'Inst Own %', 
                    'Inst Trans %', 'Short Float %', 'Short Ratio', 'Short Interest, M', 
                    # Earlier form of the next line, which listed 'ROI %' where 'ROIC %' is used now:
                    # 'ROA %', 'ROE %', 'ROI %', 'Curr R', 'Quick R', 'LTDebt/Eq', 'Debt/Eq', 
                    'ROA %', 'ROE %', 'ROIC %', 'Curr R', 'Quick R', 'LTDebt/Eq', 'Debt/Eq',                     
                    'Gross M %', 'Oper M %', 'Profit M %', 'Perf 3D %', 'Perf Week %', 'Perf Month %', 
                    'Perf Quart %', 'Perf Half %', 'Perf Year %', 'Perf YTD %', 'Beta', 'ATR','ATR/Price %',
                    'Volatility W %', 'Volatility M %', 'SMA20 %', 'SMA50 %', 'SMA200 %', 
                    '50D High %', '50D Low %', '52W High %', '52W Low %', '52W Range', 
                    'All-Time High %', 'All-Time Low %', 'RSI', 'Earnings', 'IPO Date', 
                    'Optionable', 'Shortable', 'Employees', 'Change from Open %', 'Gap %', 
                    'Recom', 'Avg Volume, M', 'Rel Volume', 'Volume', 'Target Price', 
                    'Prev Close', 'Open', 'High', 'Low', 'Price', 'Change %', 'Single Category', 
                    'Asset Type', 'Expense %', 'Holdings', 'AUM, M', 'Flows 1M, M', 'Flows% 1M', 
                    'Flows 3M, M', 'Flows% 3M', 'Flows YTD, M', 'Flows% YTD', 'Return% 1Y', 
                    'Return% 3Y', 'Return% 5Y', 'Tags', 'Sharpe 3d', 'Sortino 3d', 'Omega 3d', 
                    'Sharpe 5d', 'Sortino 5d', 'Omega 5d', 'Sharpe 10d', 'Sortino 10d', 'Omega 10d', 
                    'Sharpe 15d', 'Sortino 15d', 'Omega 15d', 'Sharpe 30d', 'Sortino 30d', 'Omega 30d', 
                    'Sharpe 60d', 'Sortino 60d', 'Omega 60d', 'Sharpe 120d', 'Sortino 120d', 
                    'Omega 120d', 'Sharpe 250d', 'Sortino 250d', 'Omega 250d',]

In [None]:
# Compare the desired order with the actual columns in BOTH directions:
# DataFrame.reindex(columns=...) neither warns about columns it drops nor
# about labels it has to create, so both mismatches are reported here.
merged_columns = df_merged.columns.to_list()
ordered_labels = set(new_column_order)  # sets give O(1) membership tests
merged_labels = set(merged_columns)

print(f'len(new_column_order): {len(new_column_order)}')
print(f'len(df_merged.columns.to_list()): {len(merged_columns)}')

# Present in df_merged but not listed: a reindex to new_column_order drops these.
missing_columns = [col for col in merged_columns if col not in ordered_labels]
print(f'missing_columns: {missing_columns}')

# Listed but not present in df_merged: a reindex creates these as all-NaN columns.
absent_columns = [col for col in new_column_order if col not in merged_labels]
print(f'absent_columns: {absent_columns}')

In [None]:
# Reindex the DataFrame with the new column order
df_merged = df_merged.reindex(columns=new_column_order)

print("reindexed df_merged Head:")
print(df_merged.head())
# DataFrame.info() prints its own report and returns None. Passing it as a
# print() argument ran it before the head was shown and printed a stray
# "None", so it is called on its own, after the preview.
df_merged.info()

In [None]:
# Write the reordered frame to dest_path as parquet, using the pyarrow
# engine with zstd compression, then report where it was saved.
df_merged.to_parquet(dest_path, engine='pyarrow', compression='zstd')
print(f"Merged dataframe saved to {dest_path}")

In [None]:
# Final check of the saved frame: info() prints the per-column summary itself.
df_merged.info()

In [None]:
# Report the final (rows, columns) dimensions of the merged frame.
print(f'df_merged shape: {df_merged.shape}')

In [None]:
# Bare expression: as the last statement of a notebook cell, the list of
# column labels is shown as the cell's output.
df_merged.columns.to_list()