In [3]:
import requests
from bs4 import BeautifulSoup
# import time
# import random  # For a bit of randomness in the sleep time
import pandas as pd

# CSS selector handed to download_yahoo_finance_table() to locate the table
# element on each fetched page (used there via BeautifulSoup's select_one).
selector = '.styled-table-new'

def download_yahoo_finance_table(url, selector, timeout=30):
    """
    Fetch a web page and convert one HTML table on it into a DataFrame.

    Despite the name, nothing in this function is specific to Yahoo Finance:
    it requests ``url`` with a browser-like User-Agent, finds the first
    element matching the CSS ``selector``, and turns its ``<tr>`` rows into a
    DataFrame. No rate limiting is done here; callers must pace their own
    requests.

    Args:
        url (str): Address of the page that contains the table.
        selector (str): CSS selector identifying the table element.
        timeout (float or tuple): Seconds to wait for the server, passed
            straight to ``requests.get``. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        pandas.DataFrame or None: One row per ``<tr>`` that contains ``<td>``
        cells, with column names taken from the ``<th>`` cells of the first
        row. ``None`` is returned (after printing a message) when the request
        fails, the selector matches nothing, the table has no rows or no data
        cells, or any other error occurs while parsing.
    """
    try:
        # Add a User-Agent header to mimic a browser
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}  # Example User-Agent
        # A timeout error is a RequestException, so it is reported by the
        # first handler below like any other network failure.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

        soup = BeautifulSoup(response.content, 'html.parser')
        table_body = soup.select_one(selector)

        if table_body is None:
            print(f"Error: Table body not found using selector: {selector}")
            return None

        rows = table_body.find_all('tr')
        if not rows:
            print("Error: No rows found in the table.")
            return None

        # Column names come from the <th> cells of the first row.
        headers_list = [th.text.strip() for th in rows[0].find_all('th')]

        # Rows without <td> cells (such as the header row) yield an empty
        # list and are skipped.
        data = []
        for row in rows:
            row_data = [cell.text.strip() for cell in row.find_all('td')]
            if row_data:
                data.append(row_data)

        if not data:
            print("Error: No data found in the table rows.")
            return None

        # A mismatch between header count and cell count raises here and is
        # reported by the generic handler below.
        return pd.DataFrame(data, columns=headers_list)

    except requests.exceptions.RequestException as e:
        print(f"Error during request: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

In [8]:
import random

def add_random_column(columns_str, max_column=130, max_new=3):
    """
    Add a few random, not-yet-present column ids to a comma-separated list.

    Args:
        columns_str (str): Comma-separated integer ids, e.g. ``"0,1,2"``.
            An empty string or ``None`` is treated as "no columns". Blank
            tokens (as produced by a trailing comma) are ignored.
        max_column (int): Largest id that may be added; candidates are
            ``0..max_column`` inclusive. Defaults to 130.
        max_new (int): Upper bound on how many ids are added per call; the
            actual count is drawn uniformly from ``1..max_new`` (capped by
            the number of unused ids). Defaults to 3.

    Returns:
        str: The original ids plus the newly drawn ones, sorted ascending and
        joined with commas. If no unused id exists (or ``max_new`` is below
        1) ``columns_str`` is returned unchanged.

    Raises:
        ValueError: If a non-blank token is not a valid integer.
    """
    # Skip blank tokens so inputs such as "1,2," do not fail in int().
    tokens = columns_str.split(',') if columns_str else []
    columns = [int(token) for token in tokens if token.strip()]

    # Ids in 0..max_column that are not already requested.
    available_nums = set(range(max_column + 1)) - set(columns)
    if not available_nums or max_new < 1:
        # Nothing can be added: hand the input back untouched.
        return columns_str

    # Draw between 1 and max_new ids, never more than are available.
    k = random.randint(1, min(max_new, len(available_nums)))

    # Sample from a sorted list so results are reproducible under a seed
    # (random.sample does not accept a set on current Python versions).
    columns.extend(random.sample(sorted(available_nums), k))
    columns.sort()

    return ','.join(map(str, columns))

# Base screener query (sort key: -marketcap); the column-id list and a row
# offset suffix are appended to it when the request URLs are assembled.
url_mktcap = (
    'https://finviz.com/screener.ashx'
    '?v=152&ft=4&o=-marketcap&c='
)

# Comma-separated column ids sent in the `c=` query parameter, in the exact
# order they are requested.
url_columns = ','.join(
    str(column_id)
    for column_id in (
        0, 1, 2, 3, 4, 6, 14, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 57, 58, 59, 67, 65, 66, 103, 100, 109, 120, 121, 122,
    )
)

# Row-offset suffixes '&r=1', '&r=21', ..., '&r=981' (50 entries, step 20).
url_mktcap_rows = [f'&r={first_row}' for first_row in range(1, 982, 20)]
# shuffled_url_etfs_rows = random.sample(url_etfs_rows, len(url_etfs_rows))  # Returns a new shuffled list

# random_columns = add_random_column(url_columns)
# print(f"len: {len(random_columns)}, added_random_columns: {random_columns}")    

In [None]:
# One request URL per row offset: base query + column ids + '&r=' suffix.
# (add_random_column(url_columns) could stand in for url_columns here to vary
# the requested column set per URL.)
urls_mktcap = [
    ''.join((url_mktcap, url_columns, row_suffix))
    for row_suffix in url_mktcap_rows
]

print(f'len(urls_mktcap): {len(urls_mktcap)}')
print(urls_mktcap[:3])  # Preview the first three URLs

len(urls_mktcap): 50
['https://finviz.com/screener.ashx?v=152&ft=4&o=-marketcap&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=1', 'https://finviz.com/screener.ashx?v=152&ft=4&o=-marketcap&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=21', 'https://finviz.com/screener.ashx?v=152&ft=4&o=-marketcap&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=41']


In [10]:
# Base screener query for the ETF listing (sort key:
# -e.assetsundermanagement); column ids and a row offset are appended later.
url_etfs = (
    'https://finviz.com/screener.ashx'
    '?v=152&ft=4&o=-e.assetsundermanagement&c='
)

# Row-offset suffixes '&r=1', '&r=21', ..., '&r=381' (20 entries, step 20).
url_etfs_rows = ['&r=%d' % first_row for first_row in range(1, 382, 20)]
# shuffled_url_etfs_rows = random.sample(url_etfs_rows, len(url_etfs_rows))  # Returns a new shuffled list

In [12]:
# One request URL per row offset: ETF base query + column ids + '&r=' suffix.
urls_etfs = [
    ''.join((url_etfs, url_columns, row_suffix))
    for row_suffix in url_etfs_rows
]

print(f'len(urls_etfs): {len(urls_etfs)}')
print(urls_etfs[:3])  # Preview the first three URLs

len(urls_etfs): 20
['https://finviz.com/screener.ashx?v=152&ft=4&o=-e.assetsundermanagement&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=1', 'https://finviz.com/screener.ashx?v=152&ft=4&o=-e.assetsundermanagement&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=21', 'https://finviz.com/screener.ashx?v=152&ft=4&o=-e.assetsundermanagement&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=41']


In [None]:
# Pool both URL lists, then draw a full-length random sample: this yields a
# shuffled copy while `urls` itself keeps its original order.
urls = [*urls_mktcap, *urls_etfs]
shuffled_urls = random.sample(urls, k=len(urls))

# print(f'len(shuffled_urls): {len(shuffled_urls)}')
# print(shuffled_urls[3:6])  # Print the length of the list of url)  

len(shuffled_urls): 70
['https://finviz.com/screener.ashx?v=152&ft=4&o=-marketcap&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=641', 'https://finviz.com/screener.ashx?v=152&ft=4&o=-marketcap&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=141', 'https://finviz.com/screener.ashx?v=152&ft=4&o=-marketcap&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=801']


In [39]:
# Report the pool size, then list the slice [15:18] of the shuffled URLs,
# numbering the entries from 0 within the slice.
print(f'len(shuffled_urls): {len(shuffled_urls)}')
preview_urls = shuffled_urls[15:18]
for position in range(len(preview_urls)):
    print(position, preview_urls[position])

len(shuffled_urls): 70
0 https://finviz.com/screener.ashx?v=152&ft=4&o=-e.assetsundermanagement&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=101
1 https://finviz.com/screener.ashx?v=152&ft=4&o=-marketcap&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=401
2 https://finviz.com/screener.ashx?v=152&ft=4&o=-marketcap&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=21


In [None]:
import pandas as pd
import time
import random

# Scrape a slice of the shuffled URLs and stack the resulting pages into `df`.
urls_to_download = shuffled_urls[15:18]  # Adjust the range as needed
total_urls_to_download = len(urls_to_download)

# Columns kept from every downloaded page, in output order.
columns_list = ['No.', 'Ticker', 'Company', 'Sector', 'Industry', 'Market Cap',
       'Dividend', 'Perf Week', 'Perf Month', 'Perf Quart', 'Perf Half',
       'Perf Year', 'Perf YTD', 'Beta', 'ATR', 'Volatility W', 'Volatility M',
       'SMA20', 'SMA50', 'SMA200', '52W High', '52W Low', 'RSI', 'Volume',
       'Price', 'Change', 'Single Category', 'Asset Type', 'AUM', 'Return% 1Y',
       'Return% 3Y', 'Return% 5Y',]

# Pages are collected here and concatenated once after the loop; growing a
# DataFrame with pd.concat on every iteration re-copies all earlier rows.
# If the loop is interrupted, the pages fetched so far remain in this list.
page_frames = []
processed_count = 0

for url in urls_to_download:
    # Pause before every request so the server is not hit in rapid succession.
    delay_seconds = random.uniform(2, 4.5)  # Sleep between 2 and 4.5 seconds
    processed_count += 1
    print(f"Downloading {url}. Sleeping for {delay_seconds:.2f} seconds.  Processed {processed_count} / {total_urls_to_download} urls")
    time.sleep(delay_seconds)

    df_temp = download_yahoo_finance_table(url, selector)

    if df_temp is None:
        print(f"Failed to download data for {url}")
        continue

    # Selecting absent columns would raise KeyError and abort the whole run;
    # report the page and move on instead.
    missing_for_page = [col for col in columns_list if col not in df_temp.columns]
    if missing_for_page:
        print(f"Skipping {url}: expected columns not found: {missing_for_page}")
        continue

    df_temp_filtered = df_temp[columns_list]
    page_frames.append(df_temp_filtered)

# ignore_index=True discards each page's own row labels and assigns a fresh
# 0..n-1 index. With nothing downloaded, `df` is an empty DataFrame.
df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

Downloading https://finviz.com/screener.ashx?v=152&ft=4&o=-e.assetsundermanagement&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=101. Sleeping for 3.68 seconds.  Processed 1 / 3 urls
Downloading https://finviz.com/screener.ashx?v=152&ft=4&o=-marketcap&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=401. Sleeping for 2.33 seconds.  Processed 2 / 3 urls
Downloading https://finviz.com/screener.ashx?v=152&ft=4&o=-marketcap&c=0,1,2,3,4,6,14,42,43,44,45,46,47,48,49,50,51,52,53,54,57,58,59,67,65,66,103,100,109,120,121,122&r=21. Sleeping for 3.56 seconds.  Processed 3 / 3 urls


In [45]:
df

Unnamed: 0,No.,Ticker,Company,Sector,Industry,Market Cap,Dividend,Perf Week,Perf Month,Perf Quart,...,RSI,Volume,Price,Change,Single Category,Asset Type,AUM,Return% 1Y,Return% 3Y,Return% 5Y
0,101,IUSG,iShares Core S&P U.S.Growth ETF,Financial,Exchange Traded Fund,-,0.64%,-3.82%,-12.12%,-10.81%,...,29.44,1389272,126.86,-1.81%,US Equities - US Style,Equities (Stocks),19.32B,12.98%,11.56%,15.95%
1,102,ACWI,iShares MSCI ACWI ETF,Financial,Exchange Traded Fund,-,1.74%,-2.98%,-5.86%,-6.24%,...,33.55,3329427,115.16,-1.14%,Global or ExUS Equities - Broad / Regional,Equities (Stocks),18.71B,8.70%,9.73%,11.77%
2,103,USFR,WisdomTree Floating Rate Treasury Fund,Financial,Exchange Traded Fund,-,4.95%,0.06%,0.00%,0.12%,...,52.56,4677728,50.41,0.00%,Bonds - Treasury & Government,Bonds,18.19B,4.83%,4.31%,2.64%
3,104,FBND,Fidelity Total Bond ETF,Financial,Exchange Traded Fund,-,4.58%,0.00%,1.06%,-0.87%,...,52.91,1471700,45.59,0.24%,Bonds - Broad Market,Bonds,18.15B,4.08%,0.29%,0.18%
4,105,IYW,iShares U.S. Technology ETF,Financial,Exchange Traded Fund,-,0.23%,-3.95%,-12.91%,-13.71%,...,31.48,636921,141.64,-2.09%,US Equities - Industry Sector,Equities (Stocks),18.07B,8.56%,16.28%,21.37%
5,106,VBK,Vanguard Small Cap Growth ETF,Financial,Exchange Traded Fund,-,0.61%,-4.60%,-13.90%,-16.99%,...,23.66,448126,247.97,-2.18%,US Equities - US Style,Equities (Stocks),17.67B,-1.03%,4.08%,7.51%
6,107,IDEV,iShares Core MSCI International Developed Mark...,Financial,Exchange Traded Fund,-,3.09%,-1.68%,0.85%,0.95%,...,50.6,3271210,68.98,-0.75%,Global or ExUS Equities - Broad / Regional,Equities (Stocks),17.55B,6.19%,8.73%,8.72%
7,108,XLU,Utilities Select Sector SPDR ETF,Financial,Exchange Traded Fund,-,2.90%,1.95%,-2.56%,-1.69%,...,45.19,8397778,77.36,0.25%,US Equities - Industry Sector,Equities (Stocks),17.31B,23.87%,5.38%,5.82%
8,109,FNDX,Schwab Fundamental U.S. Large Company ETF,Financial,Exchange Traded Fund,-,1.81%,-3.44%,-6.57%,-6.91%,...,28.02,2656851,23.03,-0.86%,US Equities - Factor & Thematic,Equities (Stocks),17.06B,8.20%,9.73%,15.40%
9,110,VOE,Vanguard Mid-Cap Value ETF,Financial,Exchange Traded Fund,-,2.18%,-2.84%,-5.09%,-8.68%,...,30.56,340247,156.61,-0.63%,US Equities - US Style,Equities (Stocks),17.05B,5.49%,5.91%,11.07%


In [None]:
import pandas as pd
import time
import random

# Scrape a slice of the shuffled URLs and stack the resulting pages into `df`.
urls_to_download = shuffled_urls[14:17]  # Adjust the range as needed
total_urls_to_download = len(urls_to_download)

# Columns kept from every downloaded page, in output order.
columns_list = ['No.', 'Ticker', 'Company', 'Sector', 'Industry', 'Market Cap',
       'Dividend', 'Perf Week', 'Perf Month', 'Perf Quart', 'Perf Half',
       'Perf Year', 'Perf YTD', 'Beta', 'ATR', 'Volatility W', 'Volatility M',
       'SMA20', 'SMA50', 'SMA200', '52W High', '52W Low', 'RSI', 'Volume',
       'Price', 'Change', 'Single Category', 'Asset Type', 'AUM', 'Return% 1Y',
       'Return% 3Y', 'Return% 5Y',]

# Pages are collected here and concatenated once after the loop. If the loop
# is interrupted, the pages fetched so far remain available in this list.
page_frames = []
processed_count = 0

for url in urls_to_download:
    # Pause before every request so the server is not hit in rapid succession.
    delay_seconds = random.uniform(2, 4.5)  # Sleep between 2 and 4.5 seconds
    processed_count += 1
    print(f"Downloading {url}. Sleeping for {delay_seconds:.2f} seconds.  Processed {processed_count} / {total_urls_to_download} urls")
    time.sleep(delay_seconds)

    df_temp = download_yahoo_finance_table(url, selector)

    if df_temp is None:
        print(f"Failed to download data for {url}")
        continue

    # --- Check and Clean Column Names ---
    # Strip whitespace BEFORE selecting: selection matches names exactly, so
    # stripping afterwards could never rescue a padded column name.
    df_page = df_temp.copy()
    df_page.columns = df_page.columns.str.strip()

    # Selecting absent columns would raise KeyError and abort the whole run;
    # report the page and move on instead.
    missing_for_page = [col for col in columns_list if col not in df_page.columns]
    if missing_for_page:
        print(f"Skipping {url}: expected columns not found: {missing_for_page}")
        continue

    df_temp_filtered = df_page[columns_list].copy()

    # If the page carried repeated column names, keep the first occurrence
    # as-is and suffix later ones with _1, _2, ... so names are unique.
    if df_temp_filtered.columns.duplicated().any():
        print(f"WARNING: Duplicate columns found in df_temp_filtered for {url}!")
        occurrences = {}
        unique_names = []
        for name in df_temp_filtered.columns:
            seen_before = occurrences.get(name, 0)
            unique_names.append(name if seen_before == 0 else f"{name}_{seen_before}")
            occurrences[name] = seen_before + 1
        df_temp_filtered.columns = unique_names

    # Every page shares the same columns, so pages are stacked row-wise as
    # they are. Restricting a page to columns "not yet in df" would leave it
    # with no columns at all and append only NaN rows.
    page_frames.append(df_temp_filtered)

# ignore_index=True discards each page's own row labels and assigns a fresh
# 0..n-1 index. With nothing downloaded, `df` is an empty DataFrame.
df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

In [15]:
# Sanity-check df's columns against columns_list, then put them in that order.
actual_names = df.columns.tolist()

print("Current columns in df:")
print(sorted(actual_names))
print("\nExpected columns from columns_list:")
print(sorted(columns_list))

# Set differences in both directions: what is absent and what is unexpected.
missing_columns = set(columns_list).difference(actual_names)
extra_columns = set(actual_names).difference(columns_list)

print("\nMissing columns (in columns_list but not in df):")
print(missing_columns)
print("\nExtra columns (in df but not in columns_list):")
print(extra_columns)

# Keep only the expected columns that actually exist, in columns_list order.
common_columns = [name for name in columns_list if name in df.columns]
df = df.loc[:, common_columns]

print("\nFinal columns in df after reordering:")
print(df.columns.tolist())

Current columns in df:
['52W High', '52W Low', 'ATR', 'AUM', 'Asset Type', 'Beta', 'Change', 'Company', 'Dividend', 'Industry', 'Market Cap', 'No.', 'Perf Half', 'Perf Month', 'Perf Quart', 'Perf Week', 'Perf YTD', 'Perf Year', 'Price', 'RSI', 'Return% 1Y', 'Return% 3Y', 'Return% 5Y', 'SMA20', 'SMA200', 'SMA50', 'Sector', 'Single Category', 'Ticker', 'Volatility M', 'Volatility W', 'Volume']

Expected columns from columns_list:
['52W High', '52W Low', 'ATR', 'AUM', 'Asset Type', 'Beta', 'Change', 'Company', 'Dividend', 'Industry', 'Market Cap', 'No.', 'Perf Half', 'Perf Month', 'Perf Quart', 'Perf Week', 'Perf YTD', 'Perf Year', 'Price', 'RSI', 'Return% 1Y', 'Return% 3Y', 'Return% 5Y', 'SMA20', 'SMA200', 'SMA50', 'Sector', 'Single Category', 'Ticker', 'Volatility M', 'Volatility W', 'Volume']

Missing columns (in columns_list but not in df):
set()

Extra columns (in df but not in columns_list):
set()

Final columns in df after reordering:
['No.', 'Ticker', 'Company', 'Sector', 'Industr

In [None]:
# Bare last expression: the notebook renders `df` as this cell's output.
df

In [None]:
# Number of column names in columns_list.
len(columns_list)

In [None]:
# Print pandas' summary of `df` (DataFrame.info writes its report to stdout).
df.info()

In [18]:
# Bare last expression: the notebook renders `df` as this cell's output.
df

Unnamed: 0,No.,Ticker,Company,Sector,Industry,Market Cap,Dividend,Perf Week,Perf Month,Perf Quart,...,RSI,Volume,Price,Change,Single Category,Asset Type,AUM,Return% 1Y,Return% 3Y,Return% 5Y
0,861,ERJ,Embraer S.A. ADR,Industrials,Aerospace & Defense,9.37B,0.66%,-1.64%,22.04%,37.84%,...,69.76,1649849,51.0,0.79%,-,-,-,-,-,-
1,862,CUBE,CubeSmart,Real Estate,REIT - Industrial,9.33B,5.17%,-3.62%,-3.73%,-14.40%,...,40.28,2267566,40.73,-2.04%,-,-,-,-,-,-
2,863,HESM,Hess Midstream LP,Energy,Oil & Gas Midstream,9.28B,7.28%,2.34%,3.33%,12.97%,...,52.32,791866,40.68,0.10%,-,-,-,-,-,-
3,864,PAAS,Pan American Silver Corp,Basic Materials,Gold,9.27B,1.82%,4.41%,5.31%,9.92%,...,59.06,5528653,25.59,1.35%,-,-,-,-,-,-
4,865,MTZ,Mastec Inc,Industrials,Engineering & Construction,9.24B,-,-1.94%,-16.97%,-12.91%,...,38.72,912935,116.53,-1.16%,-,-,-,-,-,-
5,866,UWMC,UWM Holdings Corporation,Financial,Mortgage Finance,9.24B,6.92%,-7.22%,-7.81%,-7.96%,...,38.83,3902743,5.78,-1.03%,-,-,-,-,-,-
6,867,PRI,Primerica Inc,Financial,Insurance - Life,9.20B,1.48%,-2.27%,-4.63%,-3.64%,...,42.61,111130,276.7,-0.31%,-,-,-,-,-,-
7,868,CNM,Core & Main Inc,Industrials,Industrial Distribution,9.19B,-,-3.00%,-15.63%,-14.04%,...,29.05,2181384,46.27,-2.65%,-,-,-,-,-,-
8,869,BROS,Dutch Bros Inc,Consumer Cyclical,Restaurants,9.19B,-,-7.98%,-7.14%,11.94%,...,38.09,3155809,59.61,-4.33%,-,-,-,-,-,-
9,870,AGNC,AGNC Investment Corp,Real Estate,REIT - Mortgage,9.19B,14.34%,-0.99%,-1.38%,3.72%,...,44.00,19963444,10.04,-0.50%,-,-,-,-,-,-


In [None]:
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# the calls in print() only appended a stray "None None None" line.
# Call each one directly: combined frame, last filtered page, last raw page.
df.info()
df_temp_filtered.info()
df_temp.info()

In [None]:
# Bare last expression: the notebook renders `df` as this cell's output.
df