In [17]:
import pandas as pd

# Notebook-wide pandas display options.
pd.set_option('display.max_columns', None)  # Show all columns
# pd.set_option('display.max_rows', 10)     # Uncomment to limit output to 10 rows

# Display width: None would auto-detect the terminal width; a large fixed
# value (1000) is used instead so frames wrap only when absolutely necessary.
# The option is set exactly once to avoid a contradictory, overridden setting.
pd.set_option('display.width', 1000)

In [21]:
# process_files.py
from config import date_str, DOWNLOAD_DIR, DEST_DIR
from pathlib import Path  # Better path handling

# Locate the dated Finviz download (input) and the cleaned file (output).
# Both are parquet files; the date stamp comes from config.date_str.
# Legacy pickle naming, no longer used:
#   df_finviz_{date_str}.pkl  ->  df_finviz.pkl
source_path = Path(DOWNLOAD_DIR).joinpath(f'df_finviz_{date_str}_stocks_etfs.parquet')
dest_path = Path(DEST_DIR).joinpath(f'{date_str}_df_finviz.parquet')

# Echo both locations so a wrong date or directory is obvious before loading.
print(f"source_path: {source_path}", f"dest_path: {dest_path}", sep="\n")


source_path: C:\Users\ping\Downloads\df_finviz_2025-04-24_stocks_etfs.parquet
dest_path: ..\data\2025-04-24_df_finviz.parquet


In [19]:
import pandas as pd

# Display tweak for the cells below: never elide columns of wide frames.
pd.options.display.max_columns = None
# Optional extras, left disabled:
#   pd.options.display.max_rows = 10        # limit to 10 rows for readability
#   pd.options.display.width = None         # let the display adjust to the window
#   pd.options.display.max_colwidth = None  # show full content of each cell

In [20]:
# Load the Finviz snapshot from the parquet file at source_path.
df = pd.read_parquet(source_path, engine='pyarrow')

# DataFrame.info() prints its report and returns None, so it is called on its
# own; passing it to display() would additionally render a stray "None".
df.info()
display(df.head())

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\ping\\Downloads\\df_finviz_2025-04-24_stocks_etfs.parquet'

In [None]:
import numpy as np

def convert_B_M_K_to_million(value_str):
  """
  Convert a financial string with a magnitude suffix to a number in millions.

  Supported suffixes (case-insensitive): T (trillion), B (billion),
  M (million) and K (thousand). Surrounding whitespace and thousands
  separators (commas) are ignored.

  Examples:
  '1.5T'    -> 1,500,000
  '104.27B' -> 104,270
  '104.27M' -> 104.27
  '104.27K' -> 0.10427
  '42'      -> 42.0   (no suffix: parsed as-is, no scaling applied)
  '-'       -> np.nan

  Args:
    value_str: Value to convert. Anything that is not a str (numbers, None,
      NaN) is returned unchanged.

  Returns:
    A float expressed in millions for parseable strings, np.nan for '-',
    empty or unparseable strings, or the original object for non-string input.
  """

  if not isinstance(value_str, str):
    return value_str  # Already numeric / missing: pass through untouched

  # Normalise first so padded placeholders such as ' - ' are recognised too
  text = value_str.strip().replace(',', '')
  if text in ('', '-'):
    return np.nan

  # Factor that converts each suffix to millions
  multipliers = {
    'T': 1_000_000,
    'B': 1000,
    'M': 1,
    'K': 0.001
  }

  suffix = text[-1].upper()
  try:
    if suffix in multipliers:
      return float(text[:-1]) * multipliers[suffix]
    # No recognised suffix: interpret the whole string as a plain number
    return float(text)
  except ValueError:
    return np.nan

# Example usage:
# df['Market Cap Numeric'] = df['Market Cap'].apply(convert_B_M_K_to_million)

In [None]:
import pandas as pd
import numpy as np

def _has_percent_values(series):
    """Return True if `series` is text-like and at least one value ends with '%'."""
    dtype = series.dtype
    if not (pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)):
        return False
    try:
        return bool(series.str.strip().str.endswith('%', na=False).any())
    except AttributeError:
        # Object columns that hold only non-string values (ints, dates, ...)
        # reject the .str accessor; such columns contain no percentages.
        return False


def process_percentage_columns(df):
    """
    Convert text columns whose values end with '%' into numeric columns.

    A column qualifies when it has object or string dtype and at least one
    value (ignoring surrounding whitespace) ends with '%'. For each qualifying
    column the trailing '%' is removed, the values are converted with
    pd.to_numeric (anything unparseable, including '-', becomes NaN) and the
    column is renamed to "<name> %". The names of the converted columns are
    printed.

    Note: the input DataFrame is modified in place and also returned.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: The same DataFrame object with processed percentage
                      columns, or unchanged if no percentage columns are found.
    """

    # Identify columns where values END WITH '%'
    percent_cols = [col for col in df.columns if _has_percent_values(df[col])]

    # If no percentage columns are found, return the original DataFrame
    if not percent_cols:
        print("No percentage columns found to modify.")
        return df

    print("The following columns ending with % were modified:")

    for col in percent_cols:
        # Clean data: (1) strip whitespace, (2) '-' -> NaN, (3) drop trailing %
        cleaned_series = (
            df[col].str.strip()
            .replace('-', np.nan)
            .str.replace(r'%$', '', regex=True)  # Remove only the ENDING %
        )

        # Convert to numeric (coerce invalid values to NaN)
        df[col] = pd.to_numeric(cleaned_series, errors='coerce')
        print(f"- {col}")

    # Rename all converted columns in a single pass
    df.rename(columns={col: f"{col} %" for col in percent_cols}, inplace=True)

    return df

# Example Usage:
# Assuming you have a DataFrame called 'df'
# df = process_percentage_columns(df)

In [None]:
# Convert every text column whose values end with '%' to numeric; each
# converted column is renamed to "<name> %" (see process_percentage_columns).
df = process_percentage_columns(df)

In [None]:
# Descriptive columns that are merged into a single comma-separated 'Info' column
columns_to_concat = ["Sector", "Industry", "Single Category", "Asset Type"]

# '-' marks an empty field; blank it out so it is skipped when joining
for col in columns_to_concat:
    df[col] = df[col].replace('-', '')

# Join the non-empty parts of each row with ', '.
# Missing values (NaN/None) are skipped explicitly: converting them to str
# first would leak the literal text 'nan' / 'None' into Info.
df['Info'] = [
    ', '.join(str(part) for part in parts if pd.notna(part) and str(part) != '')
    for parts in df[columns_to_concat].itertuples(index=False, name=None)
]

In [None]:
# Combine 'Market Cap' and 'AUM' into one size column.
# Use Market Cap when it holds a real value, otherwise fall back to AUM;
# '-' and '' count as "no value". Plain string concatenation would produce NaN
# as soon as either side is missing, and an unparseable value (e.g. '1.2B3.4B')
# if both sides were populated. Rows with neither value get ''.
df['MktCap AUM'] = (
    df['Market Cap'].mask(df['Market Cap'].isin(['-', '']))
    .fillna(df['AUM'].mask(df['AUM'].isin(['-', ''])))
    .fillna('')
)

In [None]:
# Derive numeric "<name>, M" columns (values in millions) from the text columns.
# Suffixed values go through convert_B_M_K_to_million ...
for source_col in ('MktCap AUM', 'Avg Volume'):
    df[f'{source_col}, M'] = df[source_col].apply(convert_B_M_K_to_million)

# ... while 'Volume' is a comma-grouped count that is scaled down directly.
df['Volume, M'] = (
    pd.to_numeric(df['Volume'].str.replace(',', ''), errors='coerce')
    .div(1_000_000)
)

In [None]:
# Convert 'Beta', 'ATR', 'RSI', 'Rel Volume', and 'Price' to float.
# One shared list keeps the conversion and the verification below in sync.
numeric_text_cols = ['Beta', 'ATR', 'RSI', 'Rel Volume', 'Price']

for col in numeric_text_cols:
  # astype(str) makes the cell safe to re-run: once a column is numeric the
  # .str accessor would otherwise raise. '$' and ',' are removed as literal
  # characters (regex=False), never as regular-expression patterns.
  df[col] = pd.to_numeric(
    df[col].astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False),
    errors='coerce'  # Convert invalid values to NaN
  )

# Verify the conversion
for col in numeric_text_cols:
  print(f"{col}: {df[col].dtype}")

In [None]:
# Inspect dtypes and non-null counts of the frame after the conversions above.
df.info()

In [None]:
# List the current column names (handy when choosing which columns to keep).
df.columns

In [None]:
# Columns kept in the final frame, in display order.
my_cols = [
    # identification and size
    'Ticker',
    'Company',
    'Info',
    'MktCap AUM, M',
    'Beta',
    # momentum and performance
    'RSI',
    'Perf YTD %',
    'Perf Week %',
    'Perf Month %',
    'Perf Quart %',
    'Perf Half %',
    'Perf Year %',
    # distance from moving averages
    'SMA20 %',
    'SMA50 %',
    'SMA200 %',
    # distance from highs and lows
    '50D High %',
    '50D Low %',
    '52W High %',
    '52W Low %',
    'All-Time High %',
    'All-Time Low %',
    # volatility
    'ATR',
    'Volatility W %',
    'Volatility M %',
    # volume
    'Volume, M',
    'Avg Volume, M',
    'Rel Volume',
    # price and yield
    'Price',
    'Change %',
    'Dividend %',
]

In [None]:
# Keep only the selected columns, index by ticker, and order by size:
# largest 'MktCap AUM, M' first, rows without a value last.
new_df = (
    df[my_cols]
    .set_index('Ticker')
    .sort_values(by='MktCap AUM, M', ascending=False, na_position='last')
)

# DataFrame.info() prints its report and returns None, so it is called on its
# own; passing it to display() would additionally render a stray "None".
new_df.info()
display(new_df)

In [None]:
# Make sure the destination folder exists so the save cannot fail at the very
# end of the pipeline just because the directory is missing.
Path(dest_path).parent.mkdir(parents=True, exist_ok=True)

# Save the cleaned frame as parquet using the pyarrow engine.
new_df.to_parquet(dest_path, engine='pyarrow')
print(f'save new_df to {dest_path}')

# To load it later:
# loaded_df = pd.read_parquet(dest_path, engine='pyarrow')