# 13F InfoTable with YearQuarter
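This notebook reads Excel workbooks from `data/extracted_13F_HR/blackrock`, loads only the information-table sheet of each workbook (`InfoTable`, falling back to `Information Table` when that sheet cannot be read), and adds a `YearQuarter` column (e.g., `2024Q3`) derived from the `YYYYMMDD` date embedded in the filename.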


In [None]:
from pathlib import Path
import pandas as pd
import re

# Folder (relative to the working directory) searched for 13F-HR workbooks.
DATA_DIR = Path('data/extracted_13F_HR/blackrock')

# Sanity check before loading anything: does the folder exist, and what do
# the first three workbook names (in sorted order) look like?
print('Data dir exists:', DATA_DIR.exists())
print(
    'Sample files:',
    [workbook.name for workbook in sorted(DATA_DIR.glob('*.xlsx'))[:3]],
)


In [None]:
def year_quarter_from_filename(name: str):
    """Derive a ``YYYYQn`` label from a ``YYYYMMDD`` date embedded in *name*.

    Successive non-overlapping runs of eight digits are examined from left to
    right, and the first one whose month field is a real calendar month
    (1-12) is used. Runs with an impossible month are skipped rather than
    turned into labels such as ``Q0`` or ``Q5``, so a leading numeric id does
    not hide a date that appears later in the name.

    Args:
        name: File name (or any string) expected to contain a date.

    Returns:
        A label such as ``'2024Q3'``, or ``None`` when no eight-digit run with
        a valid month is found.
    """
    for m in re.finditer(r'(\d{4})(\d{2})(\d{2})', name):
        month = int(m.group(2))
        if not 1 <= month <= 12:
            # Eight digits that cannot be a date (e.g. part of an id number).
            continue
        year = int(m.group(1))
        # Months 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3, 10-12 -> Q4.
        q = (month - 1) // 3 + 1
        return f'{year}Q{q}'
    return None

# Load the information-table sheet of every workbook, tag each row with the
# quarter taken from the workbook's file name, and stack everything into one
# DataFrame.
files = sorted(DATA_DIR.glob('*.xlsx'))
frames = []
for f in files:
    yq = year_quarter_from_filename(f.name)
    df = None
    failure = None
    # Try the usual sheet name first, then the alternative spelling. Any
    # exception from a read moves on to the next candidate.
    for sheet in ('InfoTable', 'Information Table'):
        try:
            df = pd.read_excel(f, sheet_name=sheet, engine='openpyxl')
            break
        except Exception as exc:
            failure = exc
    else:
        # Neither sheet could be read: report the last error and move on to
        # the next workbook instead of aborting the whole run.
        print(f'Skipping {f.name}: {failure}')
        continue
    df['YearQuarter'] = yq
    frames.append(df)

if frames:
    result = pd.concat(frames, ignore_index=True)
else:
    # Nothing was loaded; fall back to an empty frame.
    result = pd.DataFrame()
print('Rows:', len(result))
result.head()


In [None]:
# Derive a `class_category` column from `class_title` on the merged frame.
from src.data_transformation import apply_class_category_column

# The helper's return value is discarded and the new column is read from
# `result` immediately afterwards, so it presumably adds the column in place
# -- confirm against src.data_transformation.
apply_class_category_column(result, in_column='class_title', out_column='class_category')

# Ten most frequent categories, as a plain dict for compact printing.
top_categories = result['class_category'].value_counts().head(10).to_dict()
print('Added class_category column. Top categories:', top_categories)
result[['class_title','class_category']].head()


In [None]:
# Build JSON mapping of unique class_title -> definition
import json

# A whole-word series keyword, an optional period, then the identifier token.
# The word boundaries keep the keyword from matching inside longer words
# (e.g. "SERVICES") or from splitting a bare "SERIES" into "SER" + "IES".
_SERIES_RE = re.compile(r'\b(?:SERIES|SER|SR)\b\.?\s*([A-Z0-9]+)', re.I)

# Tokens that follow "SR" when it abbreviates "senior" (debt) rather than
# "series"; a keyword followed by one of these is not a series marker.
_NOT_A_SERIES_ID = frozenset({
    'NT', 'NTS', 'NOTE', 'NOTES',
    'DEB', 'DEBS', 'DEBENTURE', 'DEBENTURES',
    'BOND', 'BONDS',
    'SUB', 'SECD', 'UNSECD',
})


def parse_series(title):
    """Extract the series identifier from a class title.

    A series marker is the stand-alone word ``SERIES``, ``SER`` or ``SR``
    (case-insensitive, optionally followed by a period) followed by an
    alphanumeric identifier. Markers whose "identifier" is really a debt
    descriptor (``SR NT`` = senior notes) are skipped and the scan continues
    with the rest of the title.

    Args:
        title: Class title string.

    Returns:
        The identifier in upper case (e.g. ``'A'``, ``'2020'``), or ``None``
        when the title carries no series marker.
    """
    for m in _SERIES_RE.finditer(title):
        ident = m.group(1).upper()
        if ident not in _NOT_A_SERIES_ID:
            return ident
    return None

# A whole-word class keyword, an optional period, then one stand-alone letter.
# The boundaries stop "CL" from matching inside words such as "CLOSED", stop
# "CLASS" from being re-read as "CL" + "A", and the trailing lookahead rejects
# the first letter of a longer word.
_CLASS_RE = re.compile(r'\b(?:CLASS|CL)\b\.?\s*([A-Z])(?![A-Z])', re.I)


def parse_class(title):
    """Extract the share-class letter from a class title.

    A class marker is the stand-alone word ``CLASS`` or ``CL``
    (case-insensitive, optionally followed by a period) followed by a single
    letter that is not itself the start of a longer word.

    Args:
        title: Class title string.

    Returns:
        The class letter in upper case (e.g. ``'A'``), or ``None`` when the
        title carries no class marker.
    """
    m = _CLASS_RE.search(title)
    return m.group(1).upper() if m else None

# Keyword tables used by pick_base(). Entries are upper-case words or
# space-separated phrases that are matched as whole words / whole phrases.
# Only the keys of COMMON_TOKENS take part in matching.
COMMON_TOKENS = {
    'COM': 'Common Stock',
    'COMMON': 'Common Stock',
    'ORD': 'Ordinary Shares (non-U.S. common)',
    'ORDINARY': 'Ordinary Shares (non-U.S. common)',
    'CAP STK': 'Capital Stock (equity)',
}
PREF_TOKENS = {'PFD','PREF','PREFERRED','DEP SHS','DEPOSITARY SHS'}
DEBT_TOKENS = {'NOTE','NOTES','NT','BOND','DEB','DEBENTURE','SR NT','SNR NT','SENIOR NOTES'}
DR_TOKENS = {'ADR','ADS','DEPOSITARY','DEP SHS','DEPOSITARY SHS','GDR','CDI'}
UNIT_TOKENS = {'UNIT','UNITS'}
WARRANT_TOKENS = {'WARRANT','WTS'}
RIGHT_TOKENS = {'RIGHT','RIGHTS','RT','RTS'}
ETF_TOKENS = {'ETF','ETN','INDEX','TRACKER'}
TRUST_TOKENS = {'TR','TRUST','BEN INT','BENEFICIAL','FUND'}
LP_TOKENS = {'LP','L P'}
REIT_TOKENS = {'REIT'}


def pick_base(title):
    """Classify a class title into a broad security-type description.

    The title is upper-cased and every run of non-alphanumeric characters is
    collapsed to a single space; each keyword table is then tested for a
    whole-word (or whole-phrase) hit. Tables are checked in a fixed order and
    the first hit wins: ETF/ETN, warrant, unit, right, depositary receipt,
    debt, preferred, trust/fund, REIT, limited partnership, common/ordinary.

    Multi-word entries such as ``'CAP STK'``, ``'DEP SHS'``, ``'BEN INT'`` and
    ``'L P'`` are matched as phrases, so they behave like their single-word
    neighbours instead of being unreachable.

    Args:
        title: Class title string.

    Returns:
        A human-readable description of the security type, or
        ``'Security (unspecified class)'`` when no table matches.
    """
    t = title.upper()
    # ' A B C ' form: padding with spaces turns substring containment into a
    # whole-word / whole-phrase test for both 'COM' and 'CAP STK'.
    padded = ' ' + ' '.join(w for w in re.split(r'[^A-Z0-9]+', t) if w) + ' '

    def has_any(tokens):
        return any(f' {tok} ' in padded for tok in tokens)

    if has_any(ETF_TOKENS):
        return 'Exchange-Traded Fund/Note'
    # '*W EXP' is checked on the raw title because '*' is stripped by the
    # normalisation above.
    if t.startswith('*W EXP') or has_any(WARRANT_TOKENS):
        return 'Warrant (right to buy equity before expiry)'
    if has_any(UNIT_TOKENS):
        return 'Unit (bundle of securities, often share + warrant)'
    # Checked after units so that a unit that merely includes a right is
    # still reported as a unit.
    if has_any(RIGHT_TOKENS):
        return 'Right (entitlement to acquire securities, usually before expiry)'
    if has_any(DR_TOKENS):
        return 'Depositary Receipts/Shares (US-traded representation of foreign equity)'
    if has_any(DEBT_TOKENS):
        return 'Corporate Debt Security (notes/debentures/bonds)'
    if has_any(PREF_TOKENS):
        return 'Preferred Stock (may be depositary shares)'
    if has_any(TRUST_TOKENS):
        return 'Trust/Fund/Beneficial Interest units'
    if has_any(REIT_TOKENS):
        return 'REIT equity/units'
    if has_any(LP_TOKENS):
        return 'Limited Partnership units'
    if has_any(COMMON_TOKENS):
        return 'Common/Ordinary Equity'
    return 'Security (unspecified class)'

def _qualifier_tags(title):
    """Return the qualifier phrases (convertible, cumulative, ...) for a title.

    Matching is done on whole tokens, or on token prefixes for abbreviations,
    instead of raw substrings, so words that merely contain a keyword
    ("RENEWABLE", "DEMAND", "CVR") do not trigger a tag.
    """
    tokens = [w for w in re.split(r'[^A-Z0-9]+', title.upper()) if w]
    padded = ' ' + ' '.join(tokens) + ' '

    def starts(prefix):
        return any(tok.startswith(prefix) for tok in tokens)

    tags = []
    # CV / CVT / CONV / CONVERTIBLE.
    if 'CV' in tokens or 'CVT' in tokens or starts('CONV'):
        tags.append('Convertible')
    # MAND / MANDATORY / MANDATORILY.
    if starts('MAND'):
        tags.append('Mandatory convertible')
    # "Non-cumulative" needs NON directly in front of a CUM... token (or the
    # fused NONCUM... spelling); an unrelated NON elsewhere in the title, as
    # in "CUM PFD NON VTG", leaves the title cumulative.
    non_cumulative = starts('NONCUM') or any(
        a == 'NON' and b.startswith('CUM') for a, b in zip(tokens, tokens[1:])
    )
    if non_cumulative:
        tags.append('Non-cumulative')
    elif starts('CUM'):
        tags.append('Cumulative')
    # PERP / PERPETUAL.
    if starts('PERP'):
        tags.append('Perpetual')
    if 'NEW' in tokens:
        tags.append('New issue / post corporate action')
    if ' DEP SHS ' in padded or ' DEPOSITARY SHS ' in padded:
        tags.append('Depositary shares (fractional interest)')
    return tags


def describe(title):
    """Build a comma-separated, human-readable definition for a class title.

    The result starts with the base security type from ``pick_base``, followed
    by ``Class X`` and ``Series Y`` when ``parse_class`` / ``parse_series``
    find them, and then any qualifier tags detected in the title
    (convertible, mandatory convertible, cumulative or non-cumulative,
    perpetual, new issue, depositary shares), always in that order.

    Args:
        title: Class title string.

    Returns:
        The parts joined with ``', '``.
    """
    parts = [pick_base(title)]
    cls = parse_class(title)
    if cls:
        parts.append(f'Class {cls}')
    ser = parse_series(title)
    if ser:
        parts.append(f'Series {ser}')
    parts.extend(_qualifier_tags(title))
    return ', '.join(parts)

# Distinct, whitespace-trimmed class titles (nulls dropped), sorted so the
# output file is stable from run to run.
unique_titles = sorted({str(v).strip() for v in result['class_title'].dropna()})
mapping = {u: describe(u) for u in unique_titles}

out_dir = Path('data/metadata')
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / 'class_title_definitions.json'
# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 rather than left to the platform default
# (which can fail or produce non-UTF-8 JSON on some systems).
with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(mapping, f, indent=2, ensure_ascii=False)
print('Wrote', out_path, 'entries:', len(mapping))
list(mapping.items())[:10]
