## Filtering out ADRs

Unfortunately the isAdr column on the API is garbage. No matter, we can just locally remove all the rows that contain a period in the ticker name. Might I be excluding some good stocks? who knows lol

In [10]:
import pandas as pd
import os

def filter_universal_symbols():
    """
    Filters out symbols containing '.' from universal_symbols.csv
    and saves the result to adrs_removed.csv.
    """
    input_path = 'Symbol Lists/universal_symbols.csv'
    output_path = 'Symbol Lists/adrs_removed.csv'

    # Ensure the output directory exists
    output_dir = os.path.dirname(output_path)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Read the data
    try:
        df = pd.read_csv(input_path)
    except FileNotFoundError:
        print(f"Error: Input file not found at {input_path}")
        return

    # Tickers with a . in them are ADRs.
    filtered_df = df[~df['symbol'].str.contains("\.", na=False)]

    # Record the output
    filtered_df.to_csv(output_path, index=False)
    print(f"Filtered data saved to {output_path}")
    print(f"Original count: {len(df)}")
    print(f"Filtered count: {len(filtered_df)}")
    print(f"Rows removed: {len(df) - len(filtered_df)}")

if __name__ == '__main__':
    filter_universal_symbols()

Filtered data saved to Symbol Lists/adrs_removed.csv
Original count: 85850
Filtered count: 36054
Rows removed: 49796


  filtered_df = df[~df['symbol'].str.contains("\.", na=False)]


## Getting Company Profile Data

Getting some relevant columns point in time for the tickers.


In [11]:
import pandas as pd
import requests
import os
from dotenv import load_dotenv
from datetime import date
import time
from tqdm.notebook import tqdm

# Load environment variables from .env file
load_dotenv()

# Get API key
api_key = os.getenv('API_KEY')

if not api_key:
    print("API_KEY not found in .env file. Please create a .env file with your API key.")
else:
    # Read the filtered symbols
    input_path = 'Symbol Lists/adrs_removed.csv'
    try:
        symbols_df = pd.read_csv(input_path)
        symbols = symbols_df['symbol'].tolist()

        profile_data = []
        failed_symbols = []
        
        # Using tqdm for a progress bar
        for symbol in tqdm(symbols, desc="Fetching Company Profiles"):
            url = f"https://financialmodelingprep.com/api/v3/profile/{symbol}?apikey={api_key}"
            
            retries = 3
            backoff_factor = 0.5
            
            for i in range(retries):
                try:
                    response = requests.get(url, timeout=20)
                    
                    # Check for rate limit response specifically
                    if response.status_code == 429:
                        # If we get a rate limit error, wait and retry
                        retry_after = int(response.headers.get("Retry-After", 1))
                        print(f"Rate limit hit for {symbol}. Retrying in {retry_after} seconds...")
                        time.sleep(retry_after)
                        continue # try again
                        
                    response.raise_for_status() # Raise an exception for other bad status codes (4xx or 5xx)
                    data = response.json()

                    if data:
                        profile = data[0]
                        current_date = date.today().strftime("%Y-%m-%d")
                        profile_info = {
                            'symbol': profile.get('symbol'),
                            'companyName': profile.get('companyName'),
                            'currency': profile.get('currency'),
                            'exchange': profile.get('exchangeShortName'),
                            'industry': profile.get('industry'),
                            'sector': profile.get('sector'),
                            'country': profile.get('country'),
                            'ipoDate': profile.get('ipoDate'),
                            'isEtf': profile.get('isEtf'),
                            'isActivelyTrading': profile.get('isActivelyTrading'),
                            'isAdr': profile.get('isAdr'),
                            'isFund': profile.get('isFund'),
                            'Current Date': current_date
                        }
                        profile_data.append(profile_info)
                    break # Success, break retry loop
                except requests.exceptions.RequestException as e:
                    if i < retries - 1:
                        sleep_time = backoff_factor * (2 ** i)
                        print(f"Request for {symbol} failed with {e}. Retrying in {sleep_time:.2f} seconds...")
                        time.sleep(sleep_time)
                    else:
                        print(f"Request for {symbol} failed after {retries} retries. Skipping.")
                        failed_symbols.append({'symbol': symbol, 'error': str(e)})
            
            # A small delay to be respectful to the API provider.
            time.sleep(0.05) 

        # Create DataFrame and save to CSV
        if profile_data:
            output_df = pd.DataFrame(profile_data)
            output_path = 'Symbol Lists/company_profiles.csv'
            try:
                output_df.to_csv(output_path, index=False)
                print(f"Company profile data saved to {output_path}")
                print(f"Successfully fetched profiles for {len(output_df)} out of {len(symbols)} symbols.")
            except PermissionError:
                print(f"Error: Could not save to {output_path}. Please make sure the file is not open in another program and you have write permissions.")
        else:
            print("No profile data was fetched.")
            
        if failed_symbols:
            failed_df = pd.DataFrame(failed_symbols)
            failed_output_path = 'Symbol Lists/failed_to_get_profile.csv'
            failed_df.to_csv(failed_output_path, index=False)
            print(f"Failed to fetch {len(failed_symbols)} symbols. Details saved to {failed_output_path}")


    except FileNotFoundError:
        print(f"Error: Input file not found at {input_path}")



Fetching Company Profiles:   0%|          | 0/36054 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Filtering out non-tradable

Now we will filter the company profiles to get a clean list of symbols for further analysis. We want to exclude ADRs, ETFs, Funds, and OTC stocks.


In [12]:
import pandas as pd

def filter_company_profiles():
    """
    Filters the company profiles based on isAdr, isEtf, isFund, and exchange.
    Saves the filtered symbols to a new CSV file.
    """
    input_path = 'Symbol lists/company_profiles.csv'
    output_path = 'Symbol Lists/removing_nonstocks_symbols.csv'

    try:
        df = pd.read_csv(input_path)

        # Apply the filters
        filtered_df = df[
            (df['isAdr'] == False) &
            (df['isEtf'] == False) &
            (df['isFund'] == False) &
            (df['exchange'] != 'OTC')
        ]
        
        # Select only the symbol column to save
        symbols_to_save = filtered_df[['symbol']]

        # Save the filtered symbols to a new CSV file
        symbols_to_save.to_csv(output_path, index=False)

        print(f"Filtered symbols saved to {output_path}")
        print(f"Original profiles count: {len(df)}")
        print(f"Filtered symbols count: {len(symbols_to_save)}")
        print(f"Symbols removed: {len(df) - len(symbols_to_save)}")

    except FileNotFoundError:
        print(f"Error: Input file not found at {input_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Run the function
filter_company_profiles()



Filtered symbols saved to Symbol Lists/removing_nonstocks_symbols.csv
Original profiles count: 36033
Filtered symbols count: 11178
Symbols removed: 24855


## Identifying Delisted Symbols

To ensure our symbol list is comprehensive, we need to account for symbols that have changed over time. The following script will use the Symbol Changes API to find any old symbols that have been replaced by the symbols in our `removing_nonstocks_symbols.csv` list. This will give us a list of delisted symbols that correspond to our active tickers.


In [None]:
import pandas as pd
import requests
import os
from dotenv import load_dotenv

def find_matching_old_symbols():
    """
    Finds old symbols from the symbol change API where the new symbol
    matches a symbol in our filtered list.
    """
    load_dotenv()
    api_key = os.getenv('API_KEY')
    
    if not api_key:
        print("API_KEY not found in .env file.")
        return

    filtered_symbols_path = 'Symbol Lists/removing_nonstocks_symbols.csv'
    output_path = 'Symbol Lists/changed_symbols.csv'

    try:
        # Read the filtered symbols into a set for fast lookup
        filtered_df = pd.read_csv(filtered_symbols_path)
        filtered_symbols_set = set(filtered_df['symbol'])
        
        # Call the symbol change API with a large limit to fetch all changes
        # The API defaults to a small limit (e.g., 100), so we must override it.
        url = f"https://financialmodelingprep.com/stable/symbol-change?limit=200000&apikey={api_key}"
        
        print("Fetching symbol changes from the API...")
        response = requests.get(url)
        response.raise_for_status()
        symbol_changes = response.json()
        print(f"Found {len(symbol_changes)} total symbol changes.")

        # Find oldSymbols where the newSymbol is in our list
        matching_old_symbols = []
        for change in symbol_changes:
            new_symbol = change.get('newSymbol')
            if new_symbol in filtered_symbols_set:
                old_symbol = change.get('oldSymbol')
                if old_symbol:
                    matching_old_symbols.append({'symbol': old_symbol})
        
        if matching_old_symbols:
            # Save the list of old symbols to a new CSV
            output_df = pd.DataFrame(matching_old_symbols)
            # Remove duplicates
            output_df.drop_duplicates(inplace=True)
            output_df.to_csv(output_path, index=False)
            
            print(f"Found {len(output_df)} matching changed symbols.")
            print(f"Saved the list to {output_path}")
        else:
            print("No matching old symbols were found.")

    except FileNotFoundError:
        print(f"Error: Input file not found at {filtered_symbols_path}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while calling the API: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Run the function
find_matching_old_symbols()



Fetching symbol changes from the API...
Found 4937 total symbol changes.
Found 2563 matching changed symbols.
Saved the list to ../output/changed_symbols.csv


## Combining Symbol Lists

Finally, we'll combine our list of currently active symbols (`removing_nonstocks_symbols.csv`) with our list of historical symbols (`changed_symbols.csv`) to create a complete, deduplicated list of all symbols we need to fetch data for.


In [13]:
import pandas as pd

def combine_and_deduplicate_symbols():
    """
    Combines filtered_symbols.csv and changed_symbols.csv, removes duplicates,
    and saves the result to fetchable_symbols.csv.
    """
    filtered_path = 'Symbol Lists/removing_nonstocks_symbols.csv'
    changed_path = 'Symbol Lists/changed_symbols.csv'
    output_path = 'Symbol Lists/fetchable_symbols.csv'

    try:
        filtered_df = pd.read_csv(filtered_path)
        changed_df = pd.read_csv(changed_path)

        # Combine the two dataframes
        combined_df = pd.concat([filtered_df, changed_df], ignore_index=True)
        
        original_count = len(combined_df)
        
        # Drop duplicates based on the 'symbol' column
        deduplicated_df = combined_df.drop_duplicates(subset=['symbol'])
        
        final_count = len(deduplicated_df)

        # Save the final list to a new CSV
        deduplicated_df.to_csv(output_path, index=False)
        
        print(f"Combined {len(filtered_df)} symbols from '{filtered_path.split('/')[-1]}' and {len(changed_df)} symbols from '{changed_path.split('/')[-1]}'.")
        print(f"Total symbols before deduplication: {original_count}")
        print(f"Removed {original_count - final_count} duplicate symbols.")
        print(f"Final count of fetchable symbols: {final_count}")
        print(f"Saved the final list to {output_path}")

    except FileNotFoundError as e:
        print(f"Error: A required file was not found. Please ensure both 'filtered_symbols.csv' and 'changed_symbols.csv' are in the 'output' directory.")
        print(f"Missing file details: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Run the function
combine_and_deduplicate_symbols()



Combined 11178 symbols from 'removing_nonstocks_symbols.csv' and 2563 symbols from 'changed_symbols.csv'.
Total symbols before deduplication: 13741
Removed 941 duplicate symbols.
Final count of fetchable symbols: 12800
Saved the final list to Symbol Lists/fetchable_symbols.csv
