### Process data in finviz download

1.  Concatenate columns `["Sector", "Industry", "Single Category", "Asset Type"]` into an `Info` column.
2.  Create a `"MktCap AUM"` column by combining the `"Market Cap"` and `"AUM"` columns.
3.  Process columns with values that end in K, M, B, T:
    *   Convert to numeric in units of millions.
    *   Add the suffix `, M` to their column names.
4.  Process columns with values that end in `%`:
    *   Convert to numeric.
    *   Add the suffix ` %` to their column names.
5.  Sort by `"MktCap AUM, M"` in descending order.
6.  Set `Ticker` as the index.
7.  Add a `Rank` column, with the largest `"MktCap AUM, M"` value ranked as 1.

In [1]:
import sys
from pathlib import Path
import pandas as pd
import os
from IPython.display import display, Markdown  # Assuming you use these for display

# Set pandas display options to show more columns and rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', 200)       # Show up to 200 rows before pandas truncates the output
pd.set_option('display.width', 2500)  # Very wide line width so wide frames are not wrapped

# Notebook cell
%load_ext autoreload
%autoreload 2

# Resolve the project root: the parent directory when the notebook runs from a
# folder named 'notebooks', otherwise the current working directory itself.
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Add the src directory to the Python path. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
SRC_DIR = str(ROOT_DIR / 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# --- Execute the processor ---
import utils



Python will look in these locations:
['C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\python311.zip', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\DLLs', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\Lib', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv', '', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\win32', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\Pythonwin', 'c:\\Users\\ping\\Files_win10\\python\\py311\\stocks\\src']


In [2]:
# process_files.py
from config import DATE_STR, DOWNLOAD_DIR, DEST_DIR
from pathlib import Path  # Better path handling


# ###########################
# DATE_STR = '2025-04-25'  # Example date string, replace with your actual date string
# ###########################


# Build paths: the raw download to read and the processed file to write,
# both named after DATE_STR.
source_path = Path(DOWNLOAD_DIR).joinpath(f'df_finviz_{DATE_STR}_stocks_etfs.parquet')
dest_path = Path(DEST_DIR).joinpath(f'{DATE_STR}_df_finviz_stocks_etfs.parquet')

# Echo both paths so a wrong date or directory is visible before any I/O.
for _label, _path in (("source_path", source_path), ("dest_path", dest_path)):
    print(f"{_label}: {_path}")


source_path: C:\Users\ping\Downloads\df_finviz_2025-06-13_stocks_etfs.parquet
dest_path: ..\data\2025-06-13_df_finviz_stocks_etfs.parquet


In [3]:
# Load the raw finviz download.
df = pd.read_parquet(source_path, engine='pyarrow')
# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in display() only added a stray "None" to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Columns: 111 entries, No. to Tags
dtypes: object(111)
memory usage: 1.3+ MB


None

In [4]:
# Preview the first rows of the freshly loaded frame.
print(df.head())

   No. Ticker                                    Company Index     Sector              Industry Country Exchange Market Cap P/E Fwd P/E PEG P/S P/B P/C P/FCF Book/sh Cash/sh Dividend Dividend TTM Dividend Ex Date Payout Ratio EPS EPS next Q EPS this Y EPS next Y EPS past 5Y EPS next 5Y Sales past 5Y Sales Q/Q EPS Q/Q EPS YoY TTM Sales YoY TTM Sales Income EPS Surprise Revenue Surprise Outstanding Float Float % Insider Own Insider Trans Inst Own Inst Trans Short Float Short Ratio Short Interest ROA ROE ROIC Curr R Quick R LTDebt/Eq Debt/Eq Gross M Oper M Profit M Perf Week Perf Month Perf Quart Perf Half Perf Year Perf YTD  Beta   ATR Volatility W Volatility M   SMA20  SMA50 SMA200 50D High 50D Low 52W High 52W Low       52W Range All-Time High All-Time Low    RSI Earnings   IPO Date Optionable Shortable Employees Change from Open     Gap Recom Avg Volume Rel Volume     Volume Target Price Prev Close    Open    High     Low   Price  Change                              Single Category   

In [5]:
# Preview the rows that come first when ordering by the raw "AUM" column.
# NOTE(review): "AUM" has not been converted to numbers at this point, so this
# ordering is presumably lexicographic rather than by fund size — confirm.
_df_etfs = df.sort_values(by="AUM", ascending=False)
print(_df_etfs.head(10))

     No. Ticker                                        Company Index     Sector              Industry Country Exchange Market Cap P/E Fwd P/E PEG P/S P/B P/C P/FCF Book/sh Cash/sh Dividend Dividend TTM Dividend Ex Date Payout Ratio EPS EPS next Q EPS this Y EPS next Y EPS past 5Y EPS next 5Y Sales past 5Y Sales Q/Q EPS Q/Q EPS YoY TTM Sales YoY TTM Sales Income EPS Surprise Revenue Surprise Outstanding Float Float % Insider Own Insider Trans Inst Own Inst Trans Short Float Short Ratio Short Interest ROA ROE ROIC Curr R Quick R LTDebt/Eq Debt/Eq Gross M Oper M Profit M Perf Week Perf Month Perf Quart Perf Half Perf Year Perf YTD  Beta   ATR Volatility W Volatility M   SMA20   SMA50  SMA200 50D High 50D Low 52W High 52W Low        52W Range All-Time High All-Time Low    RSI Earnings    IPO Date Optionable Shortable Employees Change from Open     Gap Recom Avg Volume Rel Volume      Volume Target Price Prev Close    Open    High     Low   Price  Change                                   Si

In [6]:
# Preview the rows that come first when ordering by the raw "Market Cap" column.
# NOTE(review): "Market Cap" has not been converted to numbers at this point, so
# this ordering is presumably lexicographic rather than by company size — confirm.
_df_stocks = df.sort_values(by="Market Cap", ascending=False)
print(_df_stocks.head(10))

      No. Ticker                                  Company         Index             Sector                          Industry         Country Exchange Market Cap     P/E Fwd P/E    PEG    P/S    P/B     P/C  P/FCF Book/sh Cash/sh Dividend Dividend TTM Dividend Ex Date Payout Ratio    EPS EPS next Q EPS this Y EPS next Y EPS past 5Y EPS next 5Y Sales past 5Y Sales Q/Q  EPS Q/Q EPS YoY TTM Sales YoY TTM   Sales   Income EPS Surprise Revenue Surprise Outstanding    Float Float % Insider Own Insider Trans Inst Own Inst Trans Short Float Short Ratio Short Interest     ROA     ROE    ROIC Curr R Quick R LTDebt/Eq Debt/Eq Gross M  Oper M Profit M Perf Week Perf Month Perf Quart Perf Half Perf Year Perf YTD  Beta    ATR Volatility W Volatility M   SMA20   SMA50  SMA200 50D High 50D Low 52W High  52W Low        52W Range All-Time High All-Time Low    RSI  Earnings   IPO Date Optionable Shortable Employees Change from Open     Gap Recom Avg Volume Rel Volume     Volume Target Price Prev Close    

In [7]:
# List of columns to concatenate into the descriptive "Info" column
columns_to_concat = ["Sector", "Industry", "Single Category", "Asset Type"]

# Replace '-' with empty string in specified columns (exact-match replacement,
# values that merely contain a dash are left alone)
for col in columns_to_concat:
    df[col] = df[col].replace('-', '')

# Join the non-empty parts with ', '. Missing values are blanked first: a bare
# astype(str) turns them into the truthy strings 'nan'/'None', which
# filter(None, ...) keeps and which would then leak into Info.
df['Info'] = (
    df[columns_to_concat]
    .fillna('')
    .astype(str)
    .apply(lambda row: ', '.join(filter(None, row)), axis=1)
)

In [8]:
# Combine 'Market Cap' and 'AUM' into one size column.
# '-' and missing values are both treated as "no value" (empty string).
market_cap_text = df['Market Cap'].replace('-', '').fillna('')
aum_text = df['AUM'].replace('-', '').fillna('')

# Take Market Cap where it is present and fall back to AUM otherwise. Plain
# string '+' produced NaN whenever either side was null, and an unparseable
# value such as '1.5B2.0B' whenever both sides were populated.
df['MktCap AUM'] = market_cap_text.where(market_cap_text != '', aum_text)

In [9]:
# Print the frame as it stands after the 'Info' and 'MktCap AUM' columns were added.
print(df)

       No. Ticker                                    Company Index             Sector                   Industry Country Exchange Market Cap     P/E Fwd P/E    PEG       P/S   P/B     P/C     P/FCF Book/sh Cash/sh Dividend Dividend TTM Dividend Ex Date Payout Ratio    EPS EPS next Q EPS this Y EPS next Y EPS past 5Y EPS next 5Y Sales past 5Y Sales Q/Q  EPS Q/Q EPS YoY TTM Sales YoY TTM    Sales     Income EPS Surprise Revenue Surprise Outstanding    Float Float % Insider Own Insider Trans Inst Own Inst Trans Short Float Short Ratio Short Interest      ROA       ROE      ROIC Curr R Quick R LTDebt/Eq Debt/Eq     Gross M       Oper M     Profit M Perf Week Perf Month Perf Quart Perf Half Perf Year Perf YTD  Beta   ATR Volatility W Volatility M    SMA20    SMA50   SMA200 50D High 50D Low 52W High 52W Low       52W Range All-Time High All-Time Low    RSI  Earnings    IPO Date Optionable Shortable Employees Change from Open     Gap Recom Avg Volume Rel Volume      Volume Target Price Prev C

In [10]:
import pandas as pd
import numpy as np
import re # Keep this if using the regex-based check from previous step

# --- Assume df is your DataFrame from the previous step ---
# --- Re-run identification or use the provided list ---

# Option 1: Re-run the identification (Recommended for robustness)
# Define the suffixes we are looking for abbreviation
abbreviation_suffixes = ('B', 'M', 'K', 'T')

def check_numeric_abbreviation(series):
    """
    Report whether a Series holds at least one abbreviated number.

    A value counts when, after trimming whitespace, it ends in one of
    ``abbreviation_suffixes`` (compared case-insensitively) and the text in
    front of that final character parses as a number, e.g. '10.5M', '2B',
    '500K'. Null entries are ignored. Any exception raised while checking is
    caught and reported as False.
    """
    try:
        text = series.dropna().astype(str).str.strip()
        if text.empty:
            return False

        has_suffix = text.str.upper().str.endswith(abbreviation_suffixes)
        if not has_suffix.any():
            return False

        # Drop the one-character suffix and check whether the remainder is numeric.
        leading_part = text[has_suffix].str[:-1].str.strip()
        parsed = pd.to_numeric(leading_part, errors='coerce')
        return parsed.notna().any()
    except Exception:
        return False

# Run the detector over every column and keep the names of the flagged ones.
is_numeric_abbreviation_col = df.apply(check_numeric_abbreviation, axis=0)
columns_to_convert = [
    column_name
    for column_name, is_flagged in is_numeric_abbreviation_col.items()
    if is_flagged
]

# Option 2: Use the list you provided (if you are certain it's correct)
# columns_to_convert = [
#     'Market Cap', 'Sales', 'Income', 'Outstanding', 'Float',
#     'Short Interest', 'Avg Volume', 'AUM', 'Flows 1M', 'Flows 3M', 'Flows YTD'
# ]
# Note: 'Short Interest' might often be a percentage, double-check if it truly belongs here.
# The dynamically generated list from Option 1 is the one used below.

print(f"Columns identified for conversion: {columns_to_convert}")

# --- Conversion Logic ---

# Define multipliers to convert to Millions
multipliers = {
    'T': 1_000_000, # Trillion to Million
    'B': 1_000,     # Billion to Million
    'M': 1,         # Million to Million
    'K': 0.001      # Thousand to Million (1/1000)
}

def convert_to_millions(value):
    """
    Convert a string with a T/B/M/K suffix to a float expressed in millions.

    The value is stringified, trimmed and upper-cased, so the suffix is matched
    case-insensitively. Comma thousands separators in the numeric part are
    removed before parsing (the notebook strips commas from its other numeric
    columns as well).

    Args:
        value: The raw cell value, e.g. '1.5B', '250K', '-', None.

    Returns:
        float: The value in millions, or NaN when the input is null, empty,
        has no recognised suffix, or the part before the suffix is not a number.
    """
    if pd.isna(value):
        return np.nan

    value_str = str(value).strip().upper()
    if not value_str:
        return np.nan

    suffix = value_str[-1]
    if suffix not in multipliers:
        # No recognised suffix (T, B, M, K): plain numbers and placeholders
        # such as '-' are deliberately not converted.
        return np.nan

    number_part = value_str[:-1].replace(',', '').strip()
    try:
        return float(number_part) * multipliers[suffix]
    except ValueError:
        # The part before the suffix was not a valid number (e.g. a word ending in 'M').
        return np.nan

# --- Apply Conversion and Rename ---

# Maps each converted column's old name to its new ", M"-suffixed name.
new_column_names = {}
print("\nConverting columns to Millions:")

for col in columns_to_convert:
    if col not in df.columns:
        # Presumably only reachable when columns_to_convert is a hand-written list.
        print(f"- Warning: Column '{col}' not found in DataFrame. Skipping.")
        continue

    print(f"- Processing: {col}")

    # Convert the values in place; the rename is deferred so column lookups
    # inside this loop keep working with the original names.
    df[col] = df[col].apply(convert_to_millions)

    new_name = f"{col}, M"
    new_column_names[col] = new_name
    print(f"  ...Converted values and prepared rename to: {new_name}")


# Rename columns in one go (more efficient)
df.rename(columns=new_column_names, inplace=True)

print("\nConversion Complete. DataFrame head after conversion:")
print(df.head())

# Optionally, check dtypes
print("\nData types after conversion:")
print(df.dtypes)

Columns identified for conversion: ['Market Cap', 'Sales', 'Income', 'Outstanding', 'Float', 'Short Interest', 'Avg Volume', 'AUM', 'Flows 1M', 'Flows 3M', 'Flows YTD', 'MktCap AUM']

Converting columns to Millions:
- Processing: Market Cap
  ...Converted values and prepared rename to: Market Cap, M
- Processing: Sales
  ...Converted values and prepared rename to: Sales, M
- Processing: Income
  ...Converted values and prepared rename to: Income, M
- Processing: Outstanding
  ...Converted values and prepared rename to: Outstanding, M
- Processing: Float
  ...Converted values and prepared rename to: Float, M
- Processing: Short Interest
  ...Converted values and prepared rename to: Short Interest, M
- Processing: Avg Volume
  ...Converted values and prepared rename to: Avg Volume, M
- Processing: AUM
  ...Converted values and prepared rename to: AUM, M
- Processing: Flows 1M
  ...Converted values and prepared rename to: Flows 1M, M
- Processing: Flows 3M
  ...Converted values and prepar

In [11]:
# Print every column's dtype after the conversion to millions.
print(f'df.dtypes: {df.dtypes}')

df.dtypes: No.                   object
Ticker                object
Company               object
Index                 object
Sector                object
Industry              object
Country               object
Exchange              object
Market Cap, M        float64
P/E                   object
Fwd P/E               object
PEG                   object
P/S                   object
P/B                   object
P/C                   object
P/FCF                 object
Book/sh               object
Cash/sh               object
Dividend              object
Dividend TTM          object
Dividend Ex Date      object
Payout Ratio          object
EPS                   object
EPS next Q            object
EPS this Y            object
EPS next Y            object
EPS past 5Y           object
EPS next 5Y           object
Sales past 5Y         object
Sales Q/Q             object
EPS Q/Q               object
EPS YoY TTM           object
Sales YoY TTM         object
Sales, M             float64
Inc

In [12]:
import pandas as pd
import numpy as np

def process_percentage_columns(df):
    """
    Convert every percentage-formatted column of ``df`` to numeric, in place.

    A column qualifies when it has ``object`` dtype and at least one of its
    string values ends with '%' after trimming whitespace. For each such
    column the values are trimmed, a lone '-' is treated as missing, the
    trailing '%' is removed and the result is converted with ``pd.to_numeric``
    (anything that still fails to parse becomes NaN). The column is renamed to
    ``"<name> %"`` unless its name already contains '%'. The affected columns
    are printed.

    Args:
        df (pd.DataFrame): The DataFrame to process. It is modified in place.

    Returns:
        pd.DataFrame: The same ``df`` object that was passed in, so both
        ``df = process_percentage_columns(df)`` and a bare call work.
    """

    def _has_percent_values(series):
        # Only genuine strings are inspected. Going through the .str accessor
        # here raised AttributeError for object columns holding no strings.
        return bool(
            series.map(lambda v: isinstance(v, str) and v.strip().endswith('%')).any()
        )

    # Identify columns where values END WITH '%'
    percent_cols = [
        col for col in df.columns
        if df[col].dtype == 'object' and _has_percent_values(df[col])
    ]

    # If no percentage columns are found, return the DataFrame untouched
    if not percent_cols:
        print("No percentage columns found to modify.")
        return df

    print("The following columns ending with % were modified:")

    # Renames are collected and applied once after the loop instead of
    # rebuilding the column index for every single column.
    renames = {}
    for col in percent_cols:
        # Clean data: (1) Strip whitespace, (2) Handle '-', (3) Remove trailing %
        cleaned_series = (
            df[col].str.strip()
            .replace('-', np.nan)  # Exact match only, so negatives like '-5%' survive
            .str.replace(r'%$', '', regex=True)  # Remove only ENDING %
        )

        # Convert to numeric (coerce invalid values to NaN)
        df[col] = pd.to_numeric(cleaned_series, errors='coerce')

        if "%" not in col:
            new_col_name = f"{col} %"
            renames[col] = new_col_name
            print(f"- Renamed: {col} -> {new_col_name}")
        else:
            # The name already advertises the unit; leave it alone.
            print(f"- Kept as is: {col} (already contains %)")

    if renames:
        df.rename(columns=renames, inplace=True)

    return df

# Example Usage:
# Assuming you have a DataFrame called 'df'
# df = process_percentage_columns(df)

In [13]:
# Run the percentage conversion and rebind df to the frame the function returns.
df = process_percentage_columns(df)
# Print the dtypes after the conversion.
print(f'df.dtypes: {df.dtypes}')

The following columns ending with % were modified:
- Renamed: Dividend -> Dividend %
- Renamed: Payout Ratio -> Payout Ratio %
- Renamed: EPS this Y -> EPS this Y %
- Renamed: EPS next Y -> EPS next Y %
- Renamed: EPS past 5Y -> EPS past 5Y %
- Renamed: EPS next 5Y -> EPS next 5Y %
- Renamed: Sales past 5Y -> Sales past 5Y %
- Renamed: Sales Q/Q -> Sales Q/Q %
- Renamed: EPS Q/Q -> EPS Q/Q %
- Renamed: EPS YoY TTM -> EPS YoY TTM %
- Renamed: Sales YoY TTM -> Sales YoY TTM %
- Renamed: EPS Surprise -> EPS Surprise %
- Renamed: Revenue Surprise -> Revenue Surprise %
- Kept as is: Float % (already contains %)
- Renamed: Insider Own -> Insider Own %
- Renamed: Insider Trans -> Insider Trans %
- Renamed: Inst Own -> Inst Own %
- Renamed: Inst Trans -> Inst Trans %
- Renamed: Short Float -> Short Float %
- Renamed: ROA -> ROA %
- Renamed: ROE -> ROE %
- Renamed: ROIC -> ROIC %
- Renamed: Gross M -> Gross M %
- Renamed: Oper M -> Oper M %
- Renamed: Profit M -> Profit M %
- Renamed: Perf Week

In [14]:
# Names of the columns that the following cell passes through pd.to_numeric.
numeric_columns = [
    "No.",
    "P/E", "Fwd P/E", "PEG", "P/S", "P/B", "P/C", "P/FCF",
    "Book/sh", "Cash/sh",
    "Dividend TTM",
    "EPS", "EPS next Q",
    "Short Ratio",
    "Curr R", "Quick R", "LTDebt/Eq", "Debt/Eq",
    "Beta", "ATR", "RSI",
    "Employees", "Recom",
    "Rel Volume", "Volume",
    "Target Price", "Prev Close", "Open", "High", "Low", "Price",
    "Holdings",
]

In [15]:
# Convert columns with numeric values to float/int
for col in numeric_columns:
    # Going through astype(str) lets the cell be re-run after the columns are
    # already numeric (the .str accessor fails on numeric dtypes).
    # regex=False makes '$' a literal dollar sign regardless of the pandas
    # version's default, instead of an end-of-string anchor.
    cleaned_values = (
        df[col]
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    # Anything that still does not parse (e.g. '-', 'nan', 'None') becomes NaN
    df[col] = pd.to_numeric(cleaned_values, errors='coerce')

# Verify the conversion
for col in numeric_columns:
    print(f"{col}: {df[col].dtype}")

No.: int64
P/E: float64
Fwd P/E: float64
PEG: float64
P/S: float64
P/B: float64
P/C: float64
P/FCF: float64
Book/sh: float64
Cash/sh: float64
Dividend TTM: float64
EPS: float64
EPS next Q: float64
Short Ratio: float64
Curr R: float64
Quick R: float64
LTDebt/Eq: float64
Debt/Eq: float64
Beta: float64
ATR: float64
RSI: float64
Employees: float64
Recom: float64
Rel Volume: float64
Volume: int64
Target Price: float64
Prev Close: float64
Open: float64
High: float64
Low: float64
Price: float64
Holdings: float64


In [16]:
# Preview the first rows after the numeric conversions.
print(df.head())

   No. Ticker                                    Company Index     Sector              Industry Country Exchange  Market Cap, M  P/E  Fwd P/E  PEG  P/S  P/B  P/C  P/FCF  Book/sh  Cash/sh  Dividend %  Dividend TTM Dividend Ex Date  Payout Ratio %  EPS  EPS next Q  EPS this Y %  EPS next Y %  EPS past 5Y %  EPS next 5Y %  Sales past 5Y %  Sales Q/Q %  EPS Q/Q %  EPS YoY TTM %  Sales YoY TTM %  Sales, M  Income, M  EPS Surprise %  Revenue Surprise %  Outstanding, M  Float, M  Float %  Insider Own %  Insider Trans %  Inst Own %  Inst Trans %  Short Float %  Short Ratio  Short Interest, M  ROA %  ROE %  ROIC %  Curr R  Quick R  LTDebt/Eq  Debt/Eq  Gross M %  Oper M %  Profit M %  Perf Week %  Perf Month %  Perf Quart %  Perf Half %  Perf Year %  Perf YTD %  Beta   ATR  Volatility W %  Volatility M %  SMA20 %  SMA50 %  SMA200 %  50D High %  50D Low %  52W High %  52W Low %       52W Range  All-Time High %  All-Time Low %    RSI Earnings   IPO Date Optionable Shortable  Employees  Change from

In [17]:
# Show the ten rows with the largest "AUM, M" values.
df_etfs = df.sort_values(by="AUM, M", ascending=False)
print(df_etfs.head(10))

     No. Ticker                              Company Index     Sector              Industry Country Exchange  Market Cap, M  P/E  Fwd P/E  PEG  P/S  P/B  P/C  P/FCF  Book/sh  Cash/sh  Dividend %  Dividend TTM Dividend Ex Date  Payout Ratio %  EPS  EPS next Q  EPS this Y %  EPS next Y %  EPS past 5Y %  EPS next 5Y %  Sales past 5Y %  Sales Q/Q %  EPS Q/Q %  EPS YoY TTM %  Sales YoY TTM %  Sales, M  Income, M  EPS Surprise %  Revenue Surprise %  Outstanding, M  Float, M  Float %  Insider Own %  Insider Trans %  Inst Own %  Inst Trans %  Short Float %  Short Ratio  Short Interest, M  ROA %  ROE %  ROIC %  Curr R  Quick R  LTDebt/Eq  Debt/Eq  Gross M %  Oper M %  Profit M %  Perf Week %  Perf Month %  Perf Quart %  Perf Half %  Perf Year %  Perf YTD %  Beta   ATR  Volatility W %  Volatility M %  SMA20 %  SMA50 %  SMA200 %  50D High %  50D Low %  52W High %  52W Low %        52W Range  All-Time High %  All-Time Low %    RSI Earnings    IPO Date Optionable Shortable  Employees  Change from O

In [18]:
# Print every column's dtype after all conversions.
print(f"df.dtypes:\n{df.dtypes}")

df.dtypes:
No.                     int64
Ticker                 object
Company                object
Index                  object
Sector                 object
Industry               object
Country                object
Exchange               object
Market Cap, M         float64
P/E                   float64
Fwd P/E               float64
PEG                   float64
P/S                   float64
P/B                   float64
P/C                   float64
P/FCF                 float64
Book/sh               float64
Cash/sh               float64
Dividend %            float64
Dividend TTM          float64
Dividend Ex Date       object
Payout Ratio %        float64
EPS                   float64
EPS next Q            float64
EPS this Y %          float64
EPS next Y %          float64
EPS past 5Y %         float64
EPS next 5Y %         float64
Sales past 5Y %       float64
Sales Q/Q %           float64
EPS Q/Q %             float64
EPS YoY TTM %         float64
Sales YoY TTM %       float64

In [19]:
# DataFrame.info() prints the summary itself and returns None; calling it
# directly avoids the stray "None" that print(df.info()) appended.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Columns: 113 entries, No. to MktCap AUM, M
dtypes: float64(94), int64(2), object(17)
memory usage: 1.3+ MB
None


In [20]:
import pandas as pd

# Order the frame by combined size and key it by ticker.

print("Original DataFrame head:")
print(df.head())
print("-" * 30)

print("DataFrame Columns:", df.columns.tolist())
print("-" * 30)

try:
    # 1. Sort the DataFrame by 'MktCap AUM, M' in descending order, in place.
    print("Sorting DataFrame by 'MktCap AUM, M'...")
    df.sort_values(by='MktCap AUM, M', ascending=False, inplace=True)
    print("DataFrame sorted.")
    print("-" * 30)

    # 2. Set 'Ticker' as the index, if the column exists (on a re-run of this
    #    cell it has already been moved into the index).
    if 'Ticker' in df.columns:
        df.set_index('Ticker', inplace=True)
        print("'Ticker' column successfully set as index.")
    else:
        print("Warning: 'Ticker' column not found. Skipping setting it as index.")
    print("-" * 30)

    # 3. Display the head of the modified DataFrame.
    print("Modified DataFrame head (first 20 rows):")
    print(df.head(20))

except KeyError as e:
    print(f"\nKeyError: The column {e} was not found in the DataFrame.")
    print("Please ensure the column name used for sorting ('MktCap AUM, M') exists and is spelled correctly.")
    print("Available columns:", df.columns.tolist())
    # Re-raise after the hint: swallowing the error would let the later cells
    # run on an unsorted/unindexed frame and write a wrong file. Other
    # exceptions are no longer caught; Jupyter shows their traceback itself.
    raise

Original DataFrame head:
   No. Ticker                                    Company Index     Sector              Industry Country Exchange  Market Cap, M  P/E  Fwd P/E  PEG  P/S  P/B  P/C  P/FCF  Book/sh  Cash/sh  Dividend %  Dividend TTM Dividend Ex Date  Payout Ratio %  EPS  EPS next Q  EPS this Y %  EPS next Y %  EPS past 5Y %  EPS next 5Y %  Sales past 5Y %  Sales Q/Q %  EPS Q/Q %  EPS YoY TTM %  Sales YoY TTM %  Sales, M  Income, M  EPS Surprise %  Revenue Surprise %  Outstanding, M  Float, M  Float %  Insider Own %  Insider Trans %  Inst Own %  Inst Trans %  Short Float %  Short Ratio  Short Interest, M  ROA %  ROE %  ROIC %  Curr R  Quick R  LTDebt/Eq  Debt/Eq  Gross M %  Oper M %  Profit M %  Perf Week %  Perf Month %  Perf Quart %  Perf Half %  Perf Year %  Perf YTD %  Beta   ATR  Volatility W %  Volatility M %  SMA20 %  SMA50 %  SMA200 %  50D High %  50D Low %  52W High %  52W Low %       52W Range  All-Time High %  All-Time Low %    RSI Earnings   IPO Date Optionable Shortabl

In [21]:
import pandas as pd

# Rank the rows by size: order by "MktCap AUM, M" from largest to smallest and
# number the rows 1..N in that order. assign() returns a new frame, so `df`
# itself is left without a "Rank" column.
df_sorted = (
    df.sort_values(by="MktCap AUM, M", ascending=False)
    .assign(Rank=lambda frame: range(1, len(frame) + 1))
)

# Show both ends of the ranking
print(f'df_sorted.head(3):\n{df_sorted.head(3)}')
print(f'\ndf_sorted.tail(3):\n{df_sorted.tail(3)}')




df_sorted.head(3):
        No.                Company               Index      Sector                   Industry Country Exchange  Market Cap, M    P/E  Fwd P/E   PEG    P/S    P/B    P/C  P/FCF  Book/sh  Cash/sh  Dividend %  Dividend TTM Dividend Ex Date  Payout Ratio %    EPS  EPS next Q  EPS this Y %  EPS next Y %  EPS past 5Y %  EPS next 5Y %  Sales past 5Y %  Sales Q/Q %  EPS Q/Q %  EPS YoY TTM %  Sales YoY TTM %  Sales, M  Income, M  EPS Surprise %  Revenue Surprise %  Outstanding, M  Float, M  Float %  Insider Own %  Insider Trans %  Inst Own %  Inst Trans %  Short Float %  Short Ratio  Short Interest, M  ROA %   ROE %  ROIC %  Curr R  Quick R  LTDebt/Eq  Debt/Eq  Gross M %  Oper M %  Profit M %  Perf Week %  Perf Month %  Perf Quart %  Perf Half %  Perf Year %  Perf YTD %  Beta   ATR  Volatility W %  Volatility M %  SMA20 %  SMA50 %  SMA200 %  50D High %  50D Low %  52W High %  52W Low %        52W Range  All-Time High %  All-Time Low %    RSI  Earnings    IPO Date Optionable S

In [22]:
# DataFrame.info() prints the summary itself and returns None; calling it
# directly avoids the stray "None" that print(df_sorted.info()) appended.
df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1560 entries, MSFT to VTHR
Columns: 113 entries, No. to Rank
dtypes: float64(94), int64(3), object(16)
memory usage: 1.4+ MB
None


In [23]:
# Write the ranked frame to dest_path as zstd-compressed Parquet using the pyarrow engine.
df_sorted.to_parquet(dest_path, engine='pyarrow', compression='zstd')
print(f'save df_sorted to {dest_path}')

# Read the file straight back and print its first 20 rows as a check on what was written.
loaded_df = pd.read_parquet(dest_path, engine='pyarrow')
print(f'loaded_df.head(20):\n{loaded_df.head(20)}')

save df_sorted to ..\data\2025-06-13_df_finviz_stocks_etfs.parquet
loaded_df.head(20):
        No.                                 Company               Index                  Sector                        Industry Country Exchange  Market Cap, M     P/E  Fwd P/E    PEG    P/S    P/B     P/C   P/FCF    Book/sh    Cash/sh  Dividend %  Dividend TTM Dividend Ex Date  Payout Ratio %       EPS  EPS next Q  EPS this Y %  EPS next Y %  EPS past 5Y %  EPS next 5Y %  Sales past 5Y %  Sales Q/Q %  EPS Q/Q %  EPS YoY TTM %  Sales YoY TTM %  Sales, M  Income, M  EPS Surprise %  Revenue Surprise %  Outstanding, M  Float, M  Float %  Insider Own %  Insider Trans %  Inst Own %  Inst Trans %  Short Float %  Short Ratio  Short Interest, M  ROA %   ROE %  ROIC %  Curr R  Quick R  LTDebt/Eq  Debt/Eq  Gross M %  Oper M %  Profit M %  Perf Week %  Perf Month %  Perf Quart %  Perf Half %  Perf Year %  Perf YTD %  Beta       ATR  Volatility W %  Volatility M %  SMA20 %  SMA50 %  SMA200 %  50D High %  50D Low