#### Process data in finviz download
#### Concatenate columns ["Sector", "Industry", "Single Category", "Asset Type"] to Info column
#### Create "MktCap AUM" column by combining "Market Cap" and "AUM" columns
#### Process columns whose values end in K, M, B, or T
- convert to numeric in units of millions
- add suffix ", M" to their column names
#### Process columns whose values end in %
- convert to numeric
- add suffix " %" to their column names
#### Sort by "MktCap AUM, M" in descending order
#### Set Ticker as index
#### Add Rank column, with the largest "MktCap AUM, M" ranked as 1

 

In [5]:
import sys
from pathlib import Path
import pandas as pd
import os
from IPython.display import display, Markdown  # Assuming you use these for display

# Set pandas display options to show more columns and rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_rows', 200)       # Show up to 200 rows
pd.set_option('display.width', 2500)  # Allow very wide output lines

# Notebook cell
%load_ext autoreload
%autoreload 2

# Resolve the project root: the parent folder when the notebook runs from a
# directory named "notebooks", otherwise the current working directory itself.
NOTEBOOK_DIR = Path.cwd()
ROOT_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebooks' else NOTEBOOK_DIR

# Make <root>/src importable. Only add it once: re-running this cell would
# otherwise append the same entry to sys.path on every execution.
SRC_DIR = str(ROOT_DIR / 'src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Verify path
print(f"Python will look in these locations:\n{sys.path}")


# --- Execute the processor ---
import utils



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Python will look in these locations:
['C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\python311.zip', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\DLLs', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9\\Lib', 'C:\\Users\\ping\\.pyenv\\pyenv-win\\versions\\3.11.9', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv', '', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\win32', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\Pythonwin', 'c:\\Users\\ping\\Files_win10\\python\\py311\\stocks\\src', 'c:\\Users\\ping\\Files_win10\\python\\py311\\.venv\\Lib\\site-packages\\setuptools\\_vendor', 'c:\\Users\\ping\\Files_win10\\python\\py311\\stocks\\src']


In [6]:
# process_files.py
from config import DATE_STR, DOWNLOAD_DIR, DEST_DIR
from pathlib import Path  # Better path handling


# ###########################
# DATE_STR = '2025-04-25'  # Example date string, replace with your actual date string
# ###########################


# Input file in DOWNLOAD_DIR and output file in DEST_DIR, both named after DATE_STR
source_path = Path(DOWNLOAD_DIR).joinpath(f'df_finviz_{DATE_STR}_stocks_etfs.parquet')
dest_path = Path(DEST_DIR).joinpath(f'{DATE_STR}_df_finviz_stocks_etfs.parquet')

# Echo both locations so a wrong date or folder is spotted immediately
print(f"source_path: {source_path}", f"dest_path: {dest_path}", sep="\n")


ImportError: cannot import name 'DATE_STR' from 'config' (c:\Users\ping\Files_win10\python\py311\stocks\notebooks\config.py)

In [None]:
# Load the downloaded data from source_path
df = pd.read_parquet(source_path, engine='pyarrow')

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only added a stray "None" to the cell output.
df.info()

In [None]:
# Quick look at the first rows of the loaded data
print(df.head())

In [None]:
# Preview the ten rows with the largest "AUM" values.
# NOTE(review): at this point "AUM" presumably still holds the raw text from
# the download (the conversion to "AUM, M" happens in a later cell), so this
# ordering is likely lexicographic rather than numeric — TODO confirm.
_df_etfs = df.sort_values(by="AUM", ascending=False)
print(_df_etfs.head(10))

In [None]:
# Preview the ten rows with the largest "Market Cap" values.
# NOTE(review): "Market Cap" presumably still holds the raw text from the
# download here (the conversion to "Market Cap, M" happens in a later cell),
# so this ordering is likely lexicographic rather than numeric — TODO confirm.
_df_stocks = df.sort_values(by="Market Cap", ascending=False)
print(_df_stocks.head(10))

In [None]:
# Columns whose values are merged into the single descriptive "Info" column
columns_to_concat = ["Sector", "Industry", "Single Category", "Asset Type"]

# '-' is the placeholder for "no value": blank it out so it is skipped below
for col in columns_to_concat:
    df[col] = df[col].replace('-', '')

# Join the non-empty parts of each row with ', '. Missing values (None/NaN)
# are dropped before the str() conversion; otherwise they would be rendered
# as the literal text 'nan'/'None' and end up inside Info.
df['Info'] = df[columns_to_concat].apply(
    lambda row: ', '.join(
        filter(None, (str(value) for value in row if pd.notna(value)))
    ),
    axis=1,
)

In [None]:
# Build a single size column from 'Market Cap' and 'AUM'.
# '-' placeholders and missing values both count as "no value" (empty string).
_market_cap = df['Market Cap'].replace('-', '').fillna('')
_aum = df['AUM'].replace('-', '').fillna('')

# Take 'Market Cap' when it is present, otherwise fall back to 'AUM'. Plain
# string concatenation gives the same result when only one side is filled,
# but produces NaN when either side is missing and an unparsable value such
# as '2.5B300M' when both sides are filled.
df['MktCap AUM'] = _market_cap.where(_market_cap != '', _aum)

In [None]:
# Show the frame after the "Info" and "MktCap AUM" columns were added
print(df)

In [None]:
import pandas as pd
import numpy as np
import re # Keep this if using the regex-based check from previous step

# --- Assume df is your DataFrame from the previous step ---
# --- Re-run identification or use the provided list ---

# Option 1: Re-run the identification (Recommended for robustness)
# Trailing letters that mark an abbreviated number
abbreviation_suffixes = ('B', 'M', 'K', 'T')

def check_numeric_abbreviation(series):
    """
    Tell whether a Series holds at least one abbreviated number.

    A value counts when its text form (nulls dropped, whitespace trimmed)
    ends, case-insensitively, with one of ``abbreviation_suffixes`` and the
    text before that final character parses with ``pd.to_numeric``
    (e.g. '10.5M', '2B', '500K').

    Returns a falsy result for empty Series, when no value qualifies, or
    when any exception is raised while checking.
    """
    try:
        text = series.dropna().astype(str).str.strip()
        if text.empty:
            return False

        # Keep only the values carrying one of the suffix letters
        candidates = text[text.str.upper().str.endswith(abbreviation_suffixes)]
        if candidates.empty:
            return False

        # Everything before the suffix must be a number for at least one value
        leading_part = candidates.str[:-1].str.strip()
        return pd.to_numeric(leading_part, errors='coerce').notna().any()
    except Exception:
        return False

# One True/False flag per column: does it hold abbreviated numbers?
is_numeric_abbreviation_col = df.apply(check_numeric_abbreviation, axis=0)
columns_to_convert = [
    column for column, flagged in is_numeric_abbreviation_col.items() if flagged
]

# Option 2: Use a fixed list instead (only if you are certain it is correct)
# columns_to_convert = [
#     'Market Cap', 'Sales', 'Income', 'Outstanding', 'Float',
#     'Short Interest', 'Avg Volume', 'AUM', 'Flows 1M', 'Flows 3M', 'Flows YTD'
# ]
# Note: 'Short Interest' might often be a percentage, double-check if it truly belongs here.
# The dynamically generated list from Option 1 is the one used below.

print(f"Columns identified for conversion: {columns_to_convert}")

# --- Conversion Logic ---

# Factors that rescale each magnitude suffix to units of millions
multipliers = {
    'T': 1_000_000, # Trillion to Million
    'B': 1_000,     # Billion to Million
    'M': 1,         # Million to Million
    'K': 0.001      # Thousand to Million (1/1000)
}

def convert_to_millions(value):
    """Convert an abbreviated number such as '1.2B' to a float in millions.

    Args:
        value: Scalar to convert. It is turned into text, trimmed and
            upper-cased, so ' 3m ' is read like '3M'.

    Returns:
        float: The number before the suffix multiplied by the suffix's entry
        in ``multipliers``. ``np.nan`` is returned when ``value`` is missing
        or blank, when it does not end in T, B, M or K, or when the text
        before the suffix is not a valid number.
    """
    if pd.isna(value):
        return np.nan

    value_str = str(value).strip().upper()
    if not value_str:
        return np.nan

    suffix = value_str[-1]
    if suffix not in multipliers:
        # Values without a recognised suffix (including plain numbers such as
        # '5000000') are deliberately not converted.
        return np.nan

    try:
        return float(value_str[:-1]) * multipliers[suffix]
    except ValueError:
        # The part before the suffix wasn't a valid number
        return np.nan

# --- Apply Conversion and Rename ---

# Maps each converted column to its new name carrying the ", M" unit suffix
new_column_names = {}
print("\nConverting columns to Millions:")

for col in columns_to_convert:
    # Guard against columns that were dropped or renamed after identification
    if col not in df.columns:
        print(f"- Warning: Column '{col}' not found in DataFrame. Skipping.")
        continue

    print(f"- Processing: {col}")

    # Replace the text values with floats expressed in millions
    df[col] = df[col].apply(convert_to_millions)

    # Prepare new name
    new_name = f"{col}, M"
    new_column_names[col] = new_name
    print(f"  ...Converted values and prepared rename to: {new_name}")


# Rename after the loop so the lookups inside it still use the original names
df.rename(columns=new_column_names, inplace=True)

print("\nConversion Complete. DataFrame head after conversion:")
print(df.head())

# Optionally, check dtypes
print("\nData types after conversion:")
print(df.dtypes)

In [None]:
# List every column's dtype after the K/M/B/T conversion
print(f'df.dtypes: {df.dtypes}')

In [None]:
import pandas as pd
import numpy as np

def process_percentage_columns(df):
    """
    Convert percentage-formatted text columns of ``df`` to numbers.

    A column qualifies when it has ``object`` dtype and at least one of its
    values, after trimming whitespace, ends with '%'. For every such column
    the values are stripped, '-' entries become NaN, the trailing '%' is
    removed and the result goes through ``pd.to_numeric`` (anything that
    cannot be parsed becomes NaN). Unless its name already contains '%', the
    column is renamed by appending " %". One line is printed per column.

    Args:
        df (pd.DataFrame): Frame to process; it is modified in place.

    Returns:
        pd.DataFrame: The same ``df`` object, untouched when no column
                      qualifies.
    """

    def _holds_percent_text(name):
        # Only object columns can carry '12.3%'-style text
        column = df[name]
        if column.dtype != 'object':
            return False
        return column.str.strip().str.endswith('%', na=False).any()

    percent_cols = list(filter(_holds_percent_text, df.columns))

    # Nothing to do: report it and hand the frame back unchanged
    if not percent_cols:
        print("No percentage columns found to modify.")
        return df

    print("The following columns ending with % were modified:")

    for col in percent_cols:
        # Trim, turn the '-' placeholder into NaN, then drop the final '%'
        trimmed = df[col].str.strip()
        without_placeholder = trimmed.replace('-', np.nan)
        number_text = without_placeholder.str.replace(r'%$', '', regex=True)

        # Whatever still fails to parse ends up as NaN
        df[col] = pd.to_numeric(number_text, errors='coerce')

        # Names that already mention '%' are left alone
        if "%" in col:
            print(f"- Kept as is: {col} (already contains %)")
            continue

        new_col_name = f"{col} %"
        df.rename(columns={col: new_col_name}, inplace=True)
        print(f"- Renamed: {col} -> {new_col_name}")

    return df

# Example Usage:
# Assuming you have a DataFrame called 'df'
# df = process_percentage_columns(df)

In [None]:
# Convert the '%' text columns to numbers, then list the resulting dtypes
df = process_percentage_columns(df)
print(f'df.dtypes: {df.dtypes}')

In [None]:
# Columns that a later cell strips of '$' and ',' and converts with
# pd.to_numeric.
numeric_columns = [
    "No.",
    "P/E",
    "Fwd P/E",
    "PEG",
    "P/S",
    "P/B",
    "P/C",
    "P/FCF",
    "Book/sh",
    "Cash/sh",
    "Dividend TTM",
    "EPS",
    "EPS next Q",
    "Short Ratio",
    "Curr R",
    "Quick R",
    "LTDebt/Eq",
    "Debt/Eq",
    "Beta",
    "ATR",
    "RSI",
    "Employees",
    "Recom",
    "Rel Volume",
    "Volume",
    "Target Price",
    "Prev Close",
    "Open",
    "High",
    "Low",
    "Price",
    "Holdings",
]

In [None]:
# Convert the columns listed in numeric_columns to a numeric dtype
for col in numeric_columns:
    # astype(str) keeps this cell re-runnable: once a column is numeric the
    # .str accessor would raise. regex=False makes '$' a literal character on
    # every pandas version; read as a regular expression it is the
    # end-of-string anchor and would strip nothing.
    _cleaned = (
        df[col].astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    # Values that still cannot be parsed become NaN
    df[col] = pd.to_numeric(_cleaned, errors='coerce')

# Verify the conversion
for col in numeric_columns:
    print(f"{col}: {df[col].dtype}")

In [None]:
# Inspect the first rows after the numeric conversions
print(df.head())

In [None]:
# Preview the ten rows with the largest "AUM, M" values
df_etfs = df.sort_values(by="AUM, M", ascending=False)
print(df_etfs.head(10))

In [None]:
# List every column's dtype, starting on its own line
print(f"df.dtypes:\n{df.dtypes}")

In [None]:
# print(df.describe())
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
df.info()

In [None]:
import pandas as pd

# Assuming 'df' is your DataFrame

# Snapshot of the frame and its columns before it is reordered
print("Original DataFrame head:")
print(df.head())
print("-" * 30)

print("DataFrame Columns:", df.columns.tolist())
print("-" * 30)

try:
    # Step 1: order the rows from largest to smallest 'MktCap AUM, M',
    # modifying df itself rather than creating a copy.
    print(f"Sorting DataFrame by 'MktCap AUM, M'...")
    df.sort_values(by='MktCap AUM, M', ascending=False, inplace=True)
    print("DataFrame sorted.")
    print("-" * 30)

    # Step 2: promote 'Ticker' to the index; only warn when the column is
    # missing (for example because it already is the index).
    if 'Ticker' not in df.columns:
        print("Warning: 'Ticker' column not found. Skipping setting it as index.")
    else:
        df.set_index('Ticker', inplace=True)
        print("'Ticker' column successfully set as index.")
    print("-" * 30)

    # Step 3: show the result
    print("Modified DataFrame head (first 20 rows):")
    print(df.head(20))

except KeyError as e:
    # Reported instead of raised so the cell output lists the available columns
    print(f"\nKeyError: The column {e} was not found in the DataFrame.")
    print("Please ensure the column name used for sorting ('MktCap AUM, M') exists and is spelled correctly.")
    print("Available columns:", df.columns.tolist())
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")
    import traceback
    traceback.print_exc()

In [None]:
import pandas as pd

# Requires the DataFrame 'df' built by the cells above.

# Order the rows from largest to smallest "MktCap AUM, M" and number them
# 1..N in that order. assign() returns a new DataFrame, so 'df' itself gets
# no "Rank" column.
df_sorted = df.sort_values(by="MktCap AUM, M", ascending=False).assign(
    Rank=range(1, len(df) + 1)
)

# Show both ends of the ranking
print(f'df_sorted.head(3):\n{df_sorted.head(3)}')
print(f'\ndf_sorted.tail(3):\n{df_sorted.tail(3)}')




In [None]:
# print(df.describe())
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
df_sorted.info()

In [None]:
# Create the destination folder first so that writing does not fail when it
# does not exist yet.
Path(dest_path).parent.mkdir(parents=True, exist_ok=True)

# Write with the PyArrow engine and zstd compression
df_sorted.to_parquet(dest_path, engine='pyarrow', compression='zstd')
print(f'save df_sorted to {dest_path}')

# Read the file back to confirm that it loads
loaded_df = pd.read_parquet(dest_path, engine='pyarrow')
print(f'loaded_df.head(20):\n{loaded_df.head(20)}')