In [147]:
import pandas as pd

# Widen pandas' text rendering so wide frames are not cut off.
pd.options.display.max_columns = None  # no cap on the number of columns shown
# pd.options.display.max_rows = 10     # (disabled) cap the number of rows shown
pd.options.display.width = 1500        # display width, in characters

In [148]:
# process_files.py
from config import date_str, DOWNLOAD_DIR, DEST_DIR
from pathlib import Path  # Better path handling

# Resolve the downloaded snapshot (input) and the processed copy (output).
# Earlier pickle-based locations, kept for reference:
# source_path = Path(DOWNLOAD_DIR) / f'df_finviz_{date_str}.pkl'
# dest_path = Path(DEST_DIR) / 'df_finviz.pkl'
source_path = Path(DOWNLOAD_DIR).joinpath(f'df_finviz_{date_str}_stocks_etfs.parquet')
dest_path = Path(DEST_DIR).joinpath(f'{date_str}_df_finviz_stocks_etfs.parquet')

# Echo both locations so the run log records which files were used.
print(f"source_path: {source_path}")
print(f"dest_path: {dest_path}")


source_path: C:\Users\ping\Downloads\df_finviz_2025-04-25_stocks_etfs.parquet
dest_path: ..\data\2025-04-25_df_finviz_stocks_etfs.parquet


In [149]:
import pandas as pd

# Display tuning: show every column and up to 200 rows when rendering frames.
pd.options.display.max_columns = None  # no cap on the number of columns shown
pd.options.display.max_rows = 200      # render at most 200 rows before truncating
# Alternatives left disabled:
# pd.options.display.width = None         # let the display adjust to the window
# pd.options.display.max_colwidth = None  # show full content of each cell

In [150]:
# Load the downloaded snapshot from `source_path` using the pyarrow engine.
df = pd.read_parquet(source_path, engine='pyarrow')
# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in display() only adds a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Columns: 111 entries, No. to Tags
dtypes: object(111)
memory usage: 1.3+ MB


None

In [151]:
# Preview the first rows of the freshly loaded frame as plain text.
print(df.head())

   No. Ticker                     Company    Index                  Sector          Industry Country Exchange Market Cap     P/E Fwd P/E    PEG   P/S   P/B     P/C  P/FCF Book/sh Cash/sh Dividend Dividend TTM Dividend Ex Date Payout Ratio    EPS EPS next Q EPS this Y EPS next Y EPS past 5Y EPS next 5Y Sales past 5Y Sales Q/Q  EPS Q/Q EPS YoY TTM Sales YoY TTM  Sales     Income EPS Surprise Revenue Surprise Outstanding    Float Float % Insider Own Insider Trans Inst Own Inst Trans Short Float Short Ratio Short Interest     ROA      ROE     ROI Curr R Quick R LTDebt/Eq Debt/Eq Gross M  Oper M Profit M Perf Week Perf Month Perf Quart Perf Half Perf Year Perf YTD  Beta   ATR Volatility W Volatility M   SMA20   SMA50   SMA200 50D High 50D Low 52W High 52W Low       52W Range All-Time High All-Time Low    RSI  Earnings    IPO Date Optionable Shortable Employees Change from Open     Gap Recom Avg Volume Rel Volume     Volume Target Price Prev Close    Open    High     Low   Price  Change Sing

In [152]:
import pandas as pd
import numpy as np

def _has_percent_values(series):
    """Return True when `series` is text-like and any string entry ends with '%'.

    Both ``object`` columns and pandas ``StringDtype`` columns count as
    text-like. Non-string entries (None, NaN, numbers) are ignored, so a mixed
    or all-numeric object column never raises here.
    """
    text_like = (
        pd.api.types.is_object_dtype(series.dtype)
        or isinstance(series.dtype, pd.StringDtype)
    )
    if not text_like:
        return False
    return any(isinstance(v, str) and v.strip().endswith('%') for v in series)


def process_percentage_columns(df):
    """
    Convert text columns holding percentage strings (e.g. '12.5%') to floats.

    A column qualifies when it is text-like (object or string dtype) and at
    least one of its values ends with '%'. For each qualifying column:

    * surrounding whitespace is stripped and a lone '-' is treated as missing;
    * thousands separators (',') and the trailing '%' are removed;
    * the result is parsed with ``pd.to_numeric(errors='coerce')``, so anything
      still unparseable becomes NaN;
    * the column is renamed to ``"<name> %"`` unless its name already contains
      '%' or a column with the target name already exists (renaming in that
      case would create duplicate column labels).

    One line per processed column is printed describing what happened.

    Args:
        df (pd.DataFrame): The input DataFrame. It is modified in place.

    Returns:
        pd.DataFrame: The same DataFrame object that was passed in, with the
                      percentage columns converted (and renamed where possible).
                      Returned untouched if no percentage columns are found.
    """
    # Identify columns where values END WITH '%'
    percent_cols = [col for col in df.columns if _has_percent_values(df[col])]

    # If no percentage columns are found, return the original DataFrame
    if not percent_cols:
        print("No percentage columns found to modify.")
        return df

    print("The following columns ending with % were modified:")

    renames = {}              # old name -> new name, applied once after the loop
    taken = set(df.columns)   # labels that must not be duplicated by a rename

    for col in percent_cols:
        # Work on an object view so the .str pipeline behaves the same for
        # object and StringDtype inputs; non-string entries come out as NaN.
        cleaned_series = (
            df[col].astype(object).str.strip()
            .replace('-', np.nan)                # a lone '-' means "no value"
            .str.replace(',', '', regex=False)   # '1,200.5%' -> '1200.5%'
            .str.replace(r'%$', '', regex=True)  # remove only the ENDING %
        )

        # Convert to numeric (coerce invalid values to NaN)
        df[col] = pd.to_numeric(cleaned_series, errors='coerce')

        if "%" in str(col):
            print(f"- Kept as is: {col} (already contains %)")
            continue

        new_col_name = f"{col} %"
        if new_col_name in taken:
            # Renaming would produce two columns with the same label.
            print(f"- Kept as is: {col} ('{new_col_name}' already exists)")
            continue

        renames[col] = new_col_name
        taken.add(new_col_name)
        print(f"- Renamed: {col} -> {new_col_name}")

    # Single rename pass instead of one DataFrame.rename call per column.
    if renames:
        df.rename(columns=renames, inplace=True)

    return df

# Example Usage:
# Assuming you have a DataFrame called 'df'
# df = process_percentage_columns(df)

In [153]:
# Convert the '%'-valued text columns to floats; the function edits `df` in place and returns it.
df = process_percentage_columns(df)

The following columns ending with % were modified:
- Renamed: Dividend -> Dividend %
- Renamed: Payout Ratio -> Payout Ratio %
- Renamed: EPS this Y -> EPS this Y %
- Renamed: EPS next Y -> EPS next Y %
- Renamed: EPS past 5Y -> EPS past 5Y %
- Renamed: EPS next 5Y -> EPS next 5Y %
- Renamed: Sales past 5Y -> Sales past 5Y %
- Renamed: Sales Q/Q -> Sales Q/Q %
- Renamed: EPS Q/Q -> EPS Q/Q %
- Renamed: EPS YoY TTM -> EPS YoY TTM %
- Renamed: Sales YoY TTM -> Sales YoY TTM %
- Renamed: EPS Surprise -> EPS Surprise %
- Renamed: Revenue Surprise -> Revenue Surprise %
- Kept as is: Float % (already contains %)
- Renamed: Insider Own -> Insider Own %
- Renamed: Insider Trans -> Insider Trans %
- Renamed: Inst Own -> Inst Own %
- Renamed: Inst Trans -> Inst Trans %
- Renamed: Short Float -> Short Float %
- Renamed: ROA -> ROA %
- Renamed: ROE -> ROE %
- Renamed: ROI -> ROI %
- Renamed: Gross M -> Gross M %
- Renamed: Oper M -> Oper M %
- Renamed: Profit M -> Profit M %
- Renamed: Perf Week -

In [154]:
# List of columns to concatenate
# columns_to_concat = ["Sector", "Industry", "Single Category", "Asset Type"]
columns_to_concat = ["Sector", "Industry"]

# '-' is treated as "no value" in this dataset; blank it out in the source
# columns so it is skipped when the parts are joined below.
for col in columns_to_concat:
    df[col] = df[col].replace('-', '')

# Build 'Info' as the comma-separated, non-empty parts of each row.
# Missing values (None/NaN) are dropped explicitly: stringifying them first
# would leak the literal text 'nan'/'None' into the result. Building the list
# from itertuples also works on an empty frame, where a row-wise apply would
# return a DataFrame instead of a Series.
df['Info'] = [
    ', '.join(str(part) for part in parts if pd.notna(part) and str(part) != '')
    for parts in df[columns_to_concat].itertuples(index=False, name=None)
]

In [155]:
# Print the current frame as plain text, rendered under the pandas display options set earlier.
print(df)

      No. Ticker                     Company    Index                  Sector                 Industry      Country Exchange Market Cap     P/E Fwd P/E    PEG   P/S   P/B     P/C  P/FCF Book/sh Cash/sh  Dividend % Dividend TTM Dividend Ex Date  Payout Ratio %    EPS EPS next Q  EPS this Y %  EPS next Y %  EPS past 5Y %  EPS next 5Y %  Sales past 5Y %  Sales Q/Q %  EPS Q/Q %  EPS YoY TTM %  Sales YoY TTM %   Sales     Income  EPS Surprise %  Revenue Surprise % Outstanding    Float  Float %  Insider Own %  Insider Trans %  Inst Own %  Inst Trans %  Short Float % Short Ratio Short Interest  ROA %  ROE %  ROI % Curr R Quick R LTDebt/Eq Debt/Eq  Gross M %  Oper M %  Profit M %  Perf Week %  Perf Month %  Perf Quart %  Perf Half %  Perf Year %  Perf YTD %  Beta    ATR  Volatility W %  Volatility M %  SMA20 %  SMA50 %  SMA200 %  50D High %  50D Low %  52W High %  52W Low %        52W Range  All-Time High %  All-Time Low %    RSI  Earnings    IPO Date Optionable Shortable Employees  Change fro

In [156]:
# df_sorted = df.sort_values(by="AUM, M", ascending=False)
# NOTE: 'AUM' is still raw text at this point (e.g. '1.5B', '250M', '-'), so
# sorting the column directly compares strings character by character
# ('9M' would rank above '10B'). Sort on a numeric key instead so that
# "descending" really means largest first; unparseable entries sort last.
def _aum_sort_key(column):
    """Return a numeric sort key for `column`; unparseable entries become NaN."""
    scale = {'T': 1_000_000, 'B': 1_000, 'M': 1, 'K': 0.001}  # suffix -> millions

    def to_number(raw):
        if isinstance(raw, (int, float)):
            return float(raw)  # already numeric (NaN stays NaN)
        text = str(raw).strip().upper()
        suffix = text[-1:]
        if suffix not in scale:
            return float('nan')
        try:
            return float(text[:-1]) * scale[suffix]
        except ValueError:
            return float('nan')

    return column.map(to_number)

df_sorted = df.sort_values(by="AUM", ascending=False, key=_aum_sort_key)
print(df_sorted)

      No. Ticker                               Company    Index                  Sector                 Industry      Country Exchange Market Cap     P/E Fwd P/E   PEG    P/S   P/B     P/C   P/FCF Book/sh Cash/sh  Dividend % Dividend TTM Dividend Ex Date  Payout Ratio %    EPS EPS next Q  EPS this Y %  EPS next Y %  EPS past 5Y %  EPS next 5Y %  Sales past 5Y %  Sales Q/Q %  EPS Q/Q %  EPS YoY TTM %  Sales YoY TTM %    Sales    Income  EPS Surprise %  Revenue Surprise % Outstanding    Float  Float %  Insider Own %  Insider Trans %  Inst Own %  Inst Trans %  Short Float % Short Ratio Short Interest  ROA %  ROE %  ROI % Curr R Quick R LTDebt/Eq Debt/Eq  Gross M %  Oper M %  Profit M %  Perf Week %  Perf Month %  Perf Quart %  Perf Half %  Perf Year %  Perf YTD %  Beta    ATR  Volatility W %  Volatility M %  SMA20 %  SMA50 %  SMA200 %  50D High %  50D Low %  52W High %  52W Low %        52W Range  All-Time High %  All-Time Low %    RSI  Earnings    IPO Date Optionable Shortable Employees 

In [157]:
# df_sorted = df.sort_values(by="AUM, M", ascending=False)
# # df_sorted = df.sort_values(by="AUM", ascending=False)
# print(df_sorted)

In [158]:
# Print the current frame again as plain text for inspection.
print(df)

      No. Ticker                     Company    Index                  Sector                 Industry      Country Exchange Market Cap     P/E Fwd P/E    PEG   P/S   P/B     P/C  P/FCF Book/sh Cash/sh  Dividend % Dividend TTM Dividend Ex Date  Payout Ratio %    EPS EPS next Q  EPS this Y %  EPS next Y %  EPS past 5Y %  EPS next 5Y %  Sales past 5Y %  Sales Q/Q %  EPS Q/Q %  EPS YoY TTM %  Sales YoY TTM %   Sales     Income  EPS Surprise %  Revenue Surprise % Outstanding    Float  Float %  Insider Own %  Insider Trans %  Inst Own %  Inst Trans %  Short Float % Short Ratio Short Interest  ROA %  ROE %  ROI % Curr R Quick R LTDebt/Eq Debt/Eq  Gross M %  Oper M %  Profit M %  Perf Week %  Perf Month %  Perf Quart %  Perf Half %  Perf Year %  Perf YTD %  Beta    ATR  Volatility W %  Volatility M %  SMA20 %  SMA50 %  SMA200 %  50D High %  50D Low %  52W High %  52W Low %        52W Range  All-Time High %  All-Time Low %    RSI  Earnings    IPO Date Optionable Shortable Employees  Change fro

In [159]:
import pandas as pd
import numpy as np
import re # Keep this if using the regex-based check from previous step

# --- Assume df is your DataFrame from the previous step ---
# --- Re-run identification or use the provided list ---

# Option 1: Re-run the identification (Recommended for robustness)
# Suffixes that mark an abbreviated number (e.g. '10.5M', '2B', '500K')
abbreviation_suffixes = ('B', 'M', 'K', 'T')

def check_numeric_abbreviation(series):
    """
    Tell whether a Series holds at least one abbreviated number.

    A value counts when, after trimming whitespace, it ends (case-insensitively)
    with one of `abbreviation_suffixes` and the text before that suffix parses
    as a number (e.g. '10.5M', '2B', '500K'). Missing values are ignored.
    Any error raised while inspecting the Series is treated as "no match".
    """
    found = False
    try:
        text = series.dropna().astype(str).str.strip()
        if not text.empty:
            has_suffix = text.str.upper().str.endswith(abbreviation_suffixes)
            if has_suffix.any():
                # Drop the one-character suffix and test what is left.
                stems = text[has_suffix].str[:-1].str.strip()
                parsed = pd.to_numeric(stems, errors='coerce')
                found = parsed.notna().any()
    except Exception:
        found = False
    return found

# Run the detector on every column, then keep the labels it flagged.
is_numeric_abbreviation_col = df.apply(check_numeric_abbreviation, axis=0)
columns_to_convert = [
    label for label, flagged in is_numeric_abbreviation_col.items() if flagged
]

# Option 2: Use the list you provided (if you are certain it's correct)
# columns_to_convert = [
#     'Market Cap', 'Sales', 'Income', 'Outstanding', 'Float',
#     'Short Interest', 'Avg Volume', 'AUM', 'Flows 1M', 'Flows 3M', 'Flows YTD'
# ]
# Note: 'Short Interest' might often be a percentage, double-check if it truly belongs here.
# Let's assume the dynamically generated list from Option 1 is more accurate for the example.

print(f"Columns identified for conversion: {columns_to_convert}")

# --- Conversion Logic ---

# Define multipliers to convert to Millions
multipliers = {
    'T': 1_000_000, # Trillion to Million
    'B': 1_000,     # Billion to Million
    'M': 1,         # Million to Million
    'K': 0.001      # Thousand to Million (1/1000)
}

def convert_to_millions(value):
    """
    Convert a string with a T/B/M/K suffix to a float expressed in millions.

    The value is stringified, trimmed and upper-cased, and thousands
    separators (',') are removed, so ' 1,234.5m ' is read as 1234.5 million.

    Args:
        value: Raw cell value, typically a string such as '1.5B' or '250K'.

    Returns:
        float: The amount in millions, or NaN when the value is missing, is
        empty, has no recognized suffix (plain numbers are deliberately NOT
        converted), or has a non-numeric part before the suffix.
    """
    if pd.isna(value):
        return np.nan

    value_str = str(value).strip().upper().replace(',', '')
    suffix = value_str[-1:]  # '' for an empty string, which is never a key

    if suffix not in multipliers:
        # No recognized suffix (T, B, M, K): only suffixed values are converted.
        return np.nan

    try:
        return float(value_str[:-1]) * multipliers[suffix]
    except ValueError:
        # The part before the suffix wasn't a valid number (e.g. 'abcM', 'M').
        return np.nan

# --- Apply Conversion and Rename ---

new_column_names = {}  # old label -> "<label>, M", applied in one rename below
print("\nConverting columns to Millions:")

for col in columns_to_convert:
    if col not in df.columns:
        # Defensive: a flagged column is no longer present in the frame.
        print(f"- Warning: Column '{col}' not found in DataFrame. Skipping.")
        continue

    print(f"- Processing: {col}")

    # Replace the raw text with its value in millions (NaN where unparseable).
    df[col] = df[col].apply(convert_to_millions)

    # Record the new label; the rename itself happens once, after the loop.
    new_name = f"{col}, M"
    new_column_names[col] = new_name
    print(f"  ...Converted values and prepared rename to: {new_name}")


# Rename columns in one go (more efficient)
df.rename(columns=new_column_names, inplace=True)

print("\nConversion Complete. DataFrame head after conversion:")
print(df.head())

# Optionally, check dtypes
print("\nData types after conversion:")
print(df.dtypes)

Columns identified for conversion: ['Market Cap', 'Sales', 'Income', 'Outstanding', 'Float', 'Short Interest', 'Avg Volume', 'AUM', 'Flows 1M', 'Flows 3M', 'Flows YTD']

Converting columns to Millions:
- Processing: Market Cap
  ...Converted values and prepared rename to: Market Cap, M
- Processing: Sales
  ...Converted values and prepared rename to: Sales, M
- Processing: Income
  ...Converted values and prepared rename to: Income, M
- Processing: Outstanding
  ...Converted values and prepared rename to: Outstanding, M
- Processing: Float
  ...Converted values and prepared rename to: Float, M
- Processing: Short Interest
  ...Converted values and prepared rename to: Short Interest, M
- Processing: Avg Volume
  ...Converted values and prepared rename to: Avg Volume, M
- Processing: AUM
  ...Converted values and prepared rename to: AUM, M
- Processing: Flows 1M
  ...Converted values and prepared rename to: Flows 1M, M
- Processing: Flows 3M
  ...Converted values and prepared rename to: 

In [160]:
df.dtypes.head(120)

No.                    object
Ticker                 object
Company                object
Index                  object
Sector                 object
Industry               object
Country                object
Exchange               object
Market Cap, M         float64
P/E                    object
Fwd P/E                object
PEG                    object
P/S                    object
P/B                    object
P/C                    object
P/FCF                  object
Book/sh                object
Cash/sh                object
Dividend %            float64
Dividend TTM           object
Dividend Ex Date       object
Payout Ratio %        float64
EPS                    object
EPS next Q             object
EPS this Y %          float64
EPS next Y %          float64
EPS past 5Y %         float64
EPS next 5Y %         float64
Sales past 5Y %       float64
Sales Q/Q %           float64
EPS Q/Q %             float64
EPS YoY TTM %         float64
Sales YoY TTM %       float64
Sales, M  

In [None]:
# # Replace '-' with empty string in both columns

# df['MktCap AUM'] = df['Market Cap'].replace('-', '') + df['AUM'].replace('-', '')

In [None]:
# # Concatenate the columns 'Market Cap' and 'AUM'
# # Replace '-' with empty string in both columns

# df['MktCap AUM'] = df['Market Cap'].replace('-', '') + df['AUM'].replace('-', '')

In [None]:
# Create the "in millions" columns used by the final selection.
# The abbreviation-conversion cell renames 'Market Cap' -> 'Market Cap, M' and
# 'Avg Volume' -> 'Avg Volume, M' (already numeric), so the raw labels no
# longer exist once that cell has run. Reuse the converted column when it is
# present and only fall back to converting the raw text otherwise.
def _millions_column(frame, raw_name):
    """Return `raw_name` expressed in millions, preferring the pre-converted column."""
    converted_name = f"{raw_name}, M"
    if converted_name in frame.columns:
        return frame[converted_name]
    return frame[raw_name].apply(convert_to_millions)

df['MktCap, M'] = _millions_column(df, 'Market Cap')
df['Avg Volume, M'] = _millions_column(df, 'Avg Volume')
# 'Volume' is a plain count written with thousands separators (handled by the
# comma strip below); scale it to millions.
df['Volume, M'] = (
    pd.to_numeric(df['Volume'].str.replace(',', '', regex=False), errors='coerce')
    / 1_000_000
)

In [None]:
# Columns that hold plain numbers stored as text. Strip '$' and thousands
# separators, then parse; anything still unparseable becomes NaN.
numeric_text_cols = ['Beta', 'ATR', 'RSI', 'Rel Volume', 'Price', 'Fwd P/E', 'P/S', 'P/B', 'P/FCF', 'Debt/Eq']

for col in numeric_text_cols:
    cleaned = (
        df[col].astype(str)                 # also lets the cell be re-run on already-numeric columns
        .str.replace('$', '', regex=False)  # literal '$', never the regex end-anchor
        .str.replace(',', '', regex=False)
    )
    # Convert invalid values to NaN
    df[col] = pd.to_numeric(cleaned, errors='coerce')

# Verify the conversion for every column handled above
for col in numeric_text_cols:
    print(f"{col}: {df[col].dtype}")

In [None]:
# Print the frame's summary (index range, column count, dtypes, memory usage).
df.info()

In [None]:
# Echo the column index so every current column label is visible.
df.columns

In [None]:
# Show the first rows of the frame as the cell's output.
df.head()

In [None]:
# Columns kept in the final output, assembled group by group (order matters:
# it becomes the column order of the saved frame).
my_cols = (
    # identification and size
    ['Ticker', 'Company', 'Info', 'MktCap, M', 'Beta']
    # momentum and performance
    + ['RSI', 'Perf YTD %', 'Perf Week %', 'Perf Month %', 'Perf Quart %', 'Perf Half %', 'Perf Year %']
    # distance from moving averages
    + ['SMA20 %', 'SMA50 %', 'SMA200 %']
    # distance from highs and lows
    + ['50D High %', '50D Low %', '52W High %', '52W Low %', 'All-Time High %', 'All-Time Low %']
    # volatility
    + ['ATR', 'Volatility W %', 'Volatility M %']
    # volume
    + ['Volume, M', 'Avg Volume, M', 'Rel Volume']
    # price and yield
    + ['Price', 'Change %', 'Dividend %']
    # valuation and fundamentals
    + ['Fwd P/E', 'P/S', 'P/B', 'P/FCF', 'ROE %', 'Debt/Eq', 'Oper M %']
)

In [None]:
# Keep only the curated columns, index by ticker, and order by market cap
# (largest first; rows without a market cap go to the bottom).
new_df = (
    df[my_cols]
    .set_index('Ticker')
    .sort_values(by='MktCap, M', ascending=False, na_position='last')
)

# info() prints its own summary and returns None, so it must not be passed to
# display() (that would render a stray "None"); show the frame separately.
new_df.info()
display(new_df)

In [None]:
# Persist the curated frame to `dest_path` as zstd-compressed parquet (pyarrow engine).
# Earlier pickle-based save, kept for reference:
# new_df.to_pickle(dest_path)
new_df.to_parquet(path=dest_path, engine='pyarrow', compression='zstd')
print(f'save new_df to {dest_path}')

# # To load it later:
# loaded_df = pd.read_parquet(dest_path, engine='pyarrow')