## Step 1: Import Libraries

In [1]:
%run ../make_clean_names.py

In [2]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import logging
import concurrent.futures
from typing import Dict, List
from concurrent.futures import ThreadPoolExecutor
import polars as pl

from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('institutional_holders.log'),
        logging.StreamHandler()
    ]
)

# Create logger instance
logger = logging.getLogger(__name__)

def create_session():
    session = requests.Session()
    retries = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504]
    )
    session.mount('https://', HTTPAdapter(max_retries=retries, pool_maxsize=10))
    return session

# Get API key from environment variables
FMP_API_KEY = os.getenv('FMP_API_KEY')
if not FMP_API_KEY:
    raise ValueError("FMP_API_KEY not found in environment variables")

## Step 2: Import Symbols

In [3]:
def load_symbols(file_path: str) -> List[str]:
    """Load symbols from a text file"""
    try:
        with open(file_path, 'r') as f:
            symbols = [line.strip() for line in f if line.strip()]
        print(f"Loaded {len(symbols)} symbols from {file_path}")
        return symbols
    except Exception as e:
        print(f"Error loading symbols: {str(e)}")
        return []

symbols_file = '../tickers.txt'
symbols = load_symbols(symbols_file)

if symbols:
    print("Symbols:", symbols)
else:
    print("No symbols loaded.")

Loaded 61 symbols from ../tickers.txt
Symbols: ['AAPL', 'MSFT', 'GOOGL', 'GOOG', 'META', 'NVDA', 'AVGO', 'ORCL', 'CRM', 'ACN', 'ADBE', 'CSCO', 'INTC', 'NFLX', 'DIS', 'CMCSA', 'VZ', 'T', 'AMZN', 'TSLA', 'HD', 'MCD', 'NKE', 'SBUX', 'TGT', 'LOW', 'WMT', 'PG', 'KO', 'PEP', 'COST', 'BRK-B', 'JPM', 'BAC', 'WFC', 'GS', 'MS', 'BLK', 'UNH', 'JNJ', 'PFE', 'ABBV', 'MRK', 'LLY', 'CAT', 'BA', 'HON', 'UPS', 'RTX', 'GE', 'XOM', 'CVX', 'COP', 'SLB', 'LIN', 'APD', 'ECL', 'PLD', 'AMT', 'CCI', 'OSW']


## Step 3: AAPL Institutional Holders
- **Reference**: https://site.financialmodelingprep.com/developer/docs/institutional-holders-api

In [4]:
def fetch_data(api_key: str, session: requests.Session) -> Dict:
    """Fetch historical forex data for a single pair with pair identifier"""
    url = f"https://financialmodelingprep.com/api/v3/institutional-holder/AAPL"
    params = {
        "apikey": api_key
    }
        
    try:
        response = session.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        return data
    except Exception as e:
        return None

# Create a session
session = create_session()

# Execute fetching
data = fetch_data(FMP_API_KEY, session)

# Convert to Polars DataFrame if data exists
df = pl.DataFrame(data)

## Step 4: Sort and Rank Top 10 Largest Transactions

In [5]:
# Get top 10 holders by change
top_10_holders = df.sort("change", descending=True).head(10)

# Get bottom 10 holders by change
bottom_10_holders = df.sort("change", descending=False).head(10)

# Concatenate top and bottom holders
combined_holders = pl.concat([top_10_holders, bottom_10_holders])

# Extract 'holder' column to a list
holder_list = combined_holders["holder"].to_list()

# Create a rank dictionary
rank_dict = {holder: rank for rank, holder in enumerate(holder_list)}

# Add rank column with specified return_dtype
df = df.with_columns([
    pl.col("holder")
    .map_elements(lambda x: rank_dict.get(x, float('inf')), return_dtype=pl.Float64)
    .alias("rank")
])

# Sort the DataFrame by rank
sorted_df = df.sort("rank")

print("Sorted DataFrame by rank:")
print(sorted_df)

Sorted DataFrame by rank:
shape: (5_075, 5)
┌─────────────────────────────────┬────────────┬──────────────┬────────────┬──────┐
│ holder                          ┆ shares     ┆ dateReported ┆ change     ┆ rank │
│ ---                             ┆ ---        ┆ ---          ┆ ---        ┆ ---  │
│ str                             ┆ i64        ┆ str          ┆ i64        ┆ f64  │
╞═════════════════════════════════╪════════════╪══════════════╪════════════╪══════╡
│ BlackRock Funding, Inc. /DE     ┆ 1093618174 ┆ 2024-09-30   ┆ 1093618174 ┆ 0.0  │
│ NORGES BANK                     ┆ 177534454  ┆ 2024-06-30   ┆ 177534454  ┆ 1.0  │
│ STATE STREET CORP               ┆ 584010284  ┆ 2024-09-30   ┆ 55935105   ┆ 2.0  │
│ VANGUARD GROUP INC              ┆ 1346616669 ┆ 2024-09-30   ┆ 21646442   ┆ 3.0  │
│ GEODE CAPITAL MANAGEMENT, LLC   ┆ 333857500  ┆ 2024-09-30   ┆ 20483787   ┆ 4.0  │
│ …                               ┆ …          ┆ …            ┆ …          ┆ …    │
│ RBC Capital Markets, LLC      

## Step 5: CIK Mapper By name API
- **Reference**: https://site.financialmodelingprep.com/developer/docs/cik-mapper-by-name-insider-trading

In [6]:
holders = combined_holders["holder"].str.strip_chars().unique().sort().to_list()
holders

['APG Asset Management N.V.',
 'BANK OF AMERICA CORP /DE/',
 'BERKSHIRE HATHAWAY INC',
 'BlackRock Funding, Inc. /DE',
 'BlackRock Inc.',
 'FIRST REPUBLIC INVESTMENT MANAGEMENT, INC.',
 'FMR LLC',
 'FRANKLIN RESOURCES INC',
 'GEODE CAPITAL MANAGEMENT, LLC',
 'GQG Partners LLC',
 'H&H International Investment, LLC',
 'JANE STREET GROUP, LLC',
 'Legal & General Group Plc',
 'MILLENNIUM MANAGEMENT LLC',
 'NORGES BANK',
 'PUBLIC EMPLOYEES RETIREMENT SYSTEM OF OHIO',
 'STATE STREET CORP',
 'Strategic Financial Concepts, LLC',
 'VANGUARD GROUP INC',
 'WELLINGTON MANAGEMENT GROUP LLP']

In [7]:
def fetch_cik(holder_name: str, api_key: str, session: requests.Session) -> Dict:
    """Fetch CIK data for a single holder name"""
    url = f"https://financialmodelingprep.com/api/v4/mapper-cik-name"
    params = {
        "name": holder_name,
        "page": 0,
        "apikey": api_key
    }
    try:
        response = session.get(url, params=params, timeout=10)
        response.raise_for_status()
        logger.info(f"Successfully fetched CIK data for {holder_name}")
        return response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error fetching {holder_name}: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error fetching {holder_name}: {str(e)}")
        return None

def fetch_all_cik(holder_names: List[str], api_key: str, session: requests.Session) -> List[Dict]:
    """Fetch CIK data for multiple holder names in parallel"""
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(fetch_cik, holder, api_key, session)
            for holder in holder_names
        ]
        results = [
            f.result() for f in concurrent.futures.as_completed(futures)
        ]
    
    return [r[0] for r in results if r and isinstance(r, list)]

# Create a session
session = create_session()

# Execute fetching
cik_data = fetch_all_cik(holders, FMP_API_KEY, session)

# Convert to Polars DataFrame if data exists
df_cik = pl.DataFrame(cik_data)

2025-01-27 10:05:01,899 - __main__ - INFO - Successfully fetched CIK data for APG Asset Management N.V.


2025-01-27 10:05:01,900 - __main__ - INFO - Successfully fetched CIK data for GQG Partners LLC


2025-01-27 10:05:01,900 - __main__ - INFO - Successfully fetched CIK data for GEODE CAPITAL MANAGEMENT, LLC


2025-01-27 10:05:01,902 - __main__ - INFO - Successfully fetched CIK data for FRANKLIN RESOURCES INC


2025-01-27 10:05:01,913 - __main__ - INFO - Successfully fetched CIK data for BlackRock Funding, Inc. /DE


2025-01-27 10:05:01,915 - __main__ - INFO - Successfully fetched CIK data for BlackRock Inc.


2025-01-27 10:05:01,949 - __main__ - INFO - Successfully fetched CIK data for BANK OF AMERICA CORP /DE/


2025-01-27 10:05:02,007 - __main__ - INFO - Successfully fetched CIK data for FIRST REPUBLIC INVESTMENT MANAGEMENT, INC.


2025-01-27 10:05:04,219 - __main__ - INFO - Successfully fetched CIK data for NORGES BANK


2025-01-27 10:05:04,219 - __main__ - INFO - Successfully fetched CIK data for H&H International Investment, LLC


2025-01-27 10:05:04,222 - __main__ - INFO - Successfully fetched CIK data for JANE STREET GROUP, LLC


2025-01-27 10:05:04,250 - __main__ - INFO - Successfully fetched CIK data for Legal & General Group Plc


2025-01-27 10:05:04,252 - __main__ - INFO - Successfully fetched CIK data for PUBLIC EMPLOYEES RETIREMENT SYSTEM OF OHIO


2025-01-27 10:05:04,319 - __main__ - INFO - Successfully fetched CIK data for Strategic Financial Concepts, LLC


2025-01-27 10:05:04,412 - __main__ - INFO - Successfully fetched CIK data for MILLENNIUM MANAGEMENT LLC


2025-01-27 10:05:04,414 - __main__ - INFO - Successfully fetched CIK data for STATE STREET CORP


2025-01-27 10:05:06,433 - __main__ - INFO - Successfully fetched CIK data for BERKSHIRE HATHAWAY INC


2025-01-27 10:05:06,528 - __main__ - INFO - Successfully fetched CIK data for WELLINGTON MANAGEMENT GROUP LLP


2025-01-27 10:05:06,557 - __main__ - INFO - Successfully fetched CIK data for VANGUARD GROUP INC


2025-01-27 10:05:06,839 - __main__ - INFO - Successfully fetched CIK data for FMR LLC


In [8]:
df_cik

reportingCik,reportingName
str,str
"""0000038777""","""FRANKLIN RESOURCES INC"""
"""0001364742""","""BlackRock Inc."""
"""0000070858""","""BANK OF AMERICA CORP /DE/"""
"""0001273087""","""MILLENNIUM MANAGEMENT LLC"""
"""0001067983""","""BERKSHIRE HATHAWAY INC"""
"""0000315066""","""FMR LLC"""


## Step 6: Merge DataFrames

In [9]:
df = combined_holders.join(
    df_cik,
    left_on="holder",
    right_on="reportingName",
    how="inner"
)

## Step 7: Clean Column Names

In [10]:
df = make_clean_names(df)
df = df.sort("change", descending=True)

df

holder,shares,date_reported,change,reporting_cik
str,i64,str,i64,str
"""FMR LLC""",363859362,"""2024-09-30""",18224005,"""0000315066"""
"""BlackRock Inc.""",1050215752,"""2024-06-30""",9692346,"""0001364742"""
"""FRANKLIN RESOURCES INC""",46376594,"""2024-09-30""",7273386,"""0000038777"""
"""MILLENNIUM MANAGEMENT LLC""",1224060,"""2024-09-30""",-11585042,"""0001273087"""
"""BANK OF AMERICA CORP /DE/""",80893112,"""2024-09-30""",-41754942,"""0000070858"""
"""BERKSHIRE HATHAWAY INC""",300000000,"""2024-09-30""",-100000000,"""0001067983"""


## Step 8: Write Polars to Parquet

In [11]:
# Define the output directory
output_dir = "../../../data/finance"

# Write the processed DataFrame to a Parquet file
df.write_parquet(f'{output_dir}/aapl_institutional_holders_change.parquet')

## Step 9: Read Parquet (Validate)

In [12]:
# Validate the output by reading the Parquet file and displaying the first few rows
pl.scan_parquet(f'{output_dir}/aapl_institutional_holders_change.parquet').collect()

holder,shares,date_reported,change,reporting_cik
str,i64,str,i64,str
"""FMR LLC""",363859362,"""2024-09-30""",18224005,"""0000315066"""
"""BlackRock Inc.""",1050215752,"""2024-06-30""",9692346,"""0001364742"""
"""FRANKLIN RESOURCES INC""",46376594,"""2024-09-30""",7273386,"""0000038777"""
"""MILLENNIUM MANAGEMENT LLC""",1224060,"""2024-09-30""",-11585042,"""0001273087"""
"""BANK OF AMERICA CORP /DE/""",80893112,"""2024-09-30""",-41754942,"""0000070858"""
"""BERKSHIRE HATHAWAY INC""",300000000,"""2024-09-30""",-100000000,"""0001067983"""
