# 00: Data Exploration & Schema Discovery

## Purpose
This notebook explores the raw CSV files (`holdings.csv` and `trades.csv`) to understand their structure, identify key columns, detect data quality issues, and generate schema documentation.

## Key Features
- Loads and examines CSV files
- Identifies fund identifiers, date columns, and P&L columns
- Performs data quality checks (missing values, duplicate rows, unique fund names)
- Generates `schema_info.json` and `column_mappings.json` for downstream use

## Column Mapping Strategy
**IMPORTANT**: CSV files use different column names than the database:
- CSV: `PL_YTD`, `PL_MTD`, `PL_QTD`, `PL_DTD` (uppercase with underscores)
- Database: `plytd`, `plmtd`, `plqtd`, `pldtd` (lowercase, no underscores)
- CSV: `MV_Base`, `MV_Local`, `PortfolioName` (mixed case; the `MV_*` names contain underscores)
- Database: `mvbase`, `mvlocal`, `portfolioname` (lowercase, no underscores)

This notebook documents the CSV structure and records the corresponding database column names in `column_mappings.json`. Applying that mapping to the data happens in `01_data_ingestion.ipynb`.

In [1]:
# Cell 1: Imports and Setup
import pandas as pd
import json
from pathlib import Path
from datetime import datetime, timezone

# Project layout: the working directory is expected to sit one level below
# the project root, with the raw CSV files in <root>/data.
project_root = Path.cwd().parent
data_dir = project_root / "data"

print("✅ Imports loaded", f"   Data directory: {data_dir}", sep="\n")

✅ Imports loaded
   Data directory: /Users/suren/Desktop/untitled folder/loop-task/data


In [2]:
# Cell 2: Load CSV Files
# Both raw extracts are read from the data directory.
holdings_file = data_dir / "holdings.csv"
trades_file = data_dir / "trades.csv"

# Holdings: read, then report (rows, columns) straight from the frame's shape
holdings_df = pd.read_csv(holdings_file)
print(f"✅ Holdings loaded: {holdings_df.shape[0]} rows, {holdings_df.shape[1]} columns")

# Trades: same treatment
trades_df = pd.read_csv(trades_file)
print(f"✅ Trades loaded: {trades_df.shape[0]} rows, {trades_df.shape[1]} columns")

# Show the raw CSV headers of each file, each preceded by a blank line
holdings_headers = list(holdings_df.columns)
trades_headers = list(trades_df.columns)
print(f"\nHoldings columns: {holdings_headers}")
print(f"\nTrades columns: {trades_headers}")

✅ Holdings loaded: 1022 rows, 25 columns
✅ Trades loaded: 649 rows, 31 columns

Holdings columns: ['AsOfDate', 'OpenDate', 'CloseDate', 'ShortName', 'PortfolioName', 'StrategyRefShortName', 'Strategy1RefShortName', 'Strategy2RefShortName', 'CustodianName', 'DirectionName', 'SecurityId', 'SecurityTypeName', 'SecName', 'StartQty', 'Qty', 'StartPrice', 'Price', 'StartFXRate', 'FXRate', 'MV_Local', 'MV_Base', 'PL_DTD', 'PL_QTD', 'PL_MTD', 'PL_YTD']

Trades columns: ['id', 'RevisionId', 'AllocationId', 'TradeTypeName', 'SecurityId', 'SecurityType', 'Name', 'Ticker', 'CUSIP', 'ISIN', 'TradeDate', 'SettleDate', 'Quantity', 'Price', 'TradeFXRate', 'Principal', 'Interest', 'TotalCash', 'AllocationQTY', 'AllocationPrincipal', 'AllocationInterest', 'AllocationFees', 'AllocationCash', 'PortfolioName', 'CustodianName', 'StrategyName', 'Strategy1Name', 'Strategy2Name', 'Counterparty', 'AllocationRule', 'IsCustomAllocation']


In [3]:
# Cell 3: Identify Key Columns

def identify_fund_column(df: pd.DataFrame) -> str:
    """Return the name of the column that identifies the fund.

    Known spellings are tried in priority order and the first one present in
    ``df.columns`` is returned. Returns ``None`` when none of them is present.
    """
    known_names = ("PortfolioName", "portfolioname", "Fund", "fund")
    return next((name for name in known_names if name in df.columns), None)

def identify_date_column(df: pd.DataFrame) -> str:
    """Return the name of the primary date column.

    Known spellings are tried in priority order (as-of date first, then trade
    date, then a generic date) and the first one present in ``df.columns`` is
    returned. Returns ``None`` when none of them is present.
    """
    known_names = ("AsOfDate", "asofdate", "TradeDate", "tradedate", "Date", "date")
    return next((name for name in known_names if name in df.columns), None)

def identify_pnl_columns(df: pd.DataFrame) -> list:
    """Collect the P&L columns of ``df``, preserving column order.

    A column qualifies when its lower-cased name contains one of the
    year/month/quarter/day-to-date P&L tokens, written with or without an
    underscore (e.g. ``PL_YTD`` or ``plytd``).
    """
    pnl_tokens = (
        "pl_ytd", "plytd",
        "pl_mtd", "plmtd",
        "pl_qtd", "plqtd",
        "pl_dtd", "pldtd",
    )
    return [
        name
        for name in df.columns
        if any(token in name.lower() for token in pnl_tokens)
    ]

# Identify columns: run the three detectors against each dataset
holdings_fund_col, holdings_date_col, holdings_pnl_cols = (
    identify_fund_column(holdings_df),
    identify_date_column(holdings_df),
    identify_pnl_columns(holdings_df),
)
trades_fund_col, trades_date_col, trades_pnl_cols = (
    identify_fund_column(trades_df),
    identify_date_column(trades_df),
    identify_pnl_columns(trades_df),
)

# Report what was detected, holdings first
print(
    "Holdings:",
    f"  Fund column: {holdings_fund_col}",
    f"  Date column: {holdings_date_col}",
    f"  P&L columns: {holdings_pnl_cols}",
    sep="\n",
)
print(
    "\nTrades:",
    f"  Fund column: {trades_fund_col}",
    f"  Date column: {trades_date_col}",
    f"  P&L columns: {trades_pnl_cols}",
    sep="\n",
)

Holdings:
  Fund column: PortfolioName
  Date column: AsOfDate
  P&L columns: ['PL_DTD', 'PL_QTD', 'PL_MTD', 'PL_YTD']

Trades:
  Fund column: PortfolioName
  Date column: TradeDate
  P&L columns: []


In [4]:
# Cell 4: Data Quality Checks

def check_missing_data(df: pd.DataFrame, name: str):
    """Print every column of ``df`` that contains missing values.

    Args:
        df: Frame to inspect.
        name: Label used in the printed header (e.g. "Holdings").

    Prints a header line followed by one ``column: count (percent%)`` line per
    column that has at least one missing value; complete columns are omitted.
    Returns nothing.
    """
    print(f"\n{name} - Missing Data:")
    total_rows = len(df)
    for col in df.columns:
        missing = df[col].isna().sum()
        if missing > 0:
            # The share is only computed once something is known to be missing,
            # which implies total_rows > 0 -- an empty frame therefore never
            # triggers a division by zero.
            pct = (missing / total_rows) * 100
            print(f"  {col}: {missing} ({pct:.1f}%)")

def check_duplicates(df: pd.DataFrame, name: str):
    """Print how many rows of ``df`` are flagged by ``DataFrame.duplicated()``."""
    repeated_mask = df.duplicated()
    print(f"\n{name} - Duplicates: {repeated_mask.sum()} rows")

def check_unique_funds(df: pd.DataFrame, fund_col: str, name: str):
    """Print the number of distinct funds and a sample of up to ten of them.

    Prints nothing when ``fund_col`` is empty/``None`` or is not a column of ``df``.
    """
    if not fund_col or fund_col not in df.columns:
        return

    distinct = df[fund_col].unique()
    print(f"\n{name} - Unique Funds: {len(distinct)}")
    print(f"  Sample: {list(distinct[:10])}")

# Run checks: the same three checks for each dataset, holdings first
for _frame, _fund_col, _label in (
    (holdings_df, holdings_fund_col, "Holdings"),
    (trades_df, trades_fund_col, "Trades"),
):
    check_missing_data(_frame, _label)
    check_duplicates(_frame, _label)
    check_unique_funds(_frame, _fund_col, _label)


Holdings - Missing Data:
  CloseDate: 1006 (98.4%)
  MV_Local: 16 (1.6%)
  MV_Base: 16 (1.6%)

Holdings - Duplicates: 2 rows

Holdings - Unique Funds: 19
  Sample: ['Garfield', 'Heather', 'MNC Investment Fund', 'Northpoint 401K', 'CoYold 1', 'Opium Holdings Partners', 'Ytum', 'Platpot', 'Hi Yield', 'Warren Lee IG']

Trades - Missing Data:
  Ticker: 448 (69.0%)
  CUSIP: 151 (23.3%)
  ISIN: 125 (19.3%)
  TradeFXRate: 649 (100.0%)
  AllocationRule: 2 (0.3%)

Trades - Duplicates: 0 rows

Trades - Unique Funds: 16
  Sample: ['HoldCo 1', 'HoldCo 3', 'HoldCo 11', 'HoldCo 7', 'Redfield Accu-Fund', 'UNC Investment Fund', 'ClientA', 'Leatherwood Trust MA', 'Platpot Fund', 'Optimum Holdings Partners']


In [5]:
# Cell 5: Generate Schema Info

# Create schema_info.json (CSV column names - for reference)

def _describe_frame(df: pd.DataFrame, fund_col, date_col, pnl_cols: list) -> dict:
    """Summarise one dataset as a JSON-serialisable dict.

    Args:
        df: The loaded CSV data.
        fund_col: Name of the fund column, or None if none was identified.
        date_col: Name of the date column, or None if none was identified.
        pnl_cols: Names of the P&L columns (may be empty).

    Returns:
        Dict with the column names, key columns, shape, dtypes (as strings),
        the number of distinct funds and a sample of up to ten fund values.
    """
    return {
        "columns": list(df.columns),
        "fund_column": fund_col,
        "date_column": date_col,
        "pnl_columns": pnl_cols,
        "shape": list(df.shape),
        "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
        "unique_funds": int(df[fund_col].nunique()) if fund_col else 0,
        # dropna() keeps the sample consistent with nunique() (which ignores
        # missing values) and keeps NaN out of the JSON; tolist() converts
        # numpy scalars to plain Python values that json.dump can serialise.
        "sample_funds": df[fund_col].dropna().unique()[:10].tolist() if fund_col else [],
    }

schema_info = {
    "holdings": _describe_frame(holdings_df, holdings_fund_col, holdings_date_col, holdings_pnl_cols),
    "trades": _describe_frame(trades_df, trades_fund_col, trades_date_col, trades_pnl_cols),
}

# Save schema_info.json (create the target folder first if it does not exist yet)
schema_info_path = project_root / "notebook2" / "schema_info.json"
schema_info_path.parent.mkdir(parents=True, exist_ok=True)
with open(schema_info_path, "w", encoding="utf-8") as f:
    json.dump(schema_info, f, indent=2)

print(f"✅ schema_info.json saved to {schema_info_path}")
print(f"   Holdings: {len(schema_info['holdings']['columns'])} columns")
print(f"   Trades: {len(schema_info['trades']['columns'])} columns")

✅ schema_info.json saved to /Users/suren/Desktop/untitled folder/loop-task/notebook2/schema_info.json
   Holdings: 25 columns
   Trades: 31 columns


In [6]:
# Cell 6: Generate Column Mappings (CSV → Database)

# CRITICAL: This mapping converts CSV column names to database column names.
# P&L and market-value columns drop their underscores (PL_YTD → plytd, MV_Base → mvbase);
# every other column is only lowercased.

def create_column_mapping(csv_columns: list) -> dict:
    """Build a ``{csv_name: database_name}`` dict for the given CSV headers.

    P&L and market-value columns are mapped to their underscore-free database
    names (``PL_YTD`` -> ``plytd``, ``MV_Base`` -> ``mvbase``); every other
    column is simply lower-cased.
    """
    # (token looked for inside the lower-cased CSV name, database name).
    # Rules are checked top to bottom and the first hit wins.
    special_cases = (
        ("pl_ytd", "plytd"),
        ("pl_mtd", "plmtd"),
        ("pl_qtd", "plqtd"),
        ("pl_dtd", "pldtd"),
        ("mv_base", "mvbase"),
        ("mv_local", "mvlocal"),
    )

    def to_db_name(csv_name):
        lowered = csv_name.lower()
        for token, db_name in special_cases:
            # Underscore form may appear anywhere; compact form must match exactly
            if token in lowered or lowered == db_name:
                return db_name
        return lowered

    return {csv_name: to_db_name(csv_name) for csv_name in csv_columns}

# Create mappings (CSV column name -> database column name)
holdings_mapping = create_column_mapping(holdings_df.columns)
trades_mapping = create_column_mapping(trades_df.columns)

# Create column_mappings.json (database column names, in CSV column order)
column_mappings = {
    "holdings": {
        "columns": [holdings_mapping[col] for col in holdings_df.columns],
        "fund_column": holdings_mapping.get(holdings_fund_col, "portfolioname"),
        "date_column": holdings_mapping.get(holdings_date_col, "asofdate")
    },
    "trades": {
        "columns": [trades_mapping[col] for col in trades_df.columns],
        "fund_column": trades_mapping.get(trades_fund_col, "portfolioname"),
        "date_column": trades_mapping.get(trades_date_col, "tradedate")
    }
}

# Different CSV headers can collapse to the same database name (e.g. after
# lowercasing). Surface that here instead of letting it pass silently.
for _table, _spec in column_mappings.items():
    _clashes = sorted({c for c in _spec["columns"] if _spec["columns"].count(c) > 1})
    if _clashes:
        print(f"⚠️  {_table}: several CSV columns map to the same database column: {_clashes}")

# Save column_mappings.json (create the target folder first if it does not exist yet)
column_mappings_path = project_root / "notebook2" / "column_mappings.json"
column_mappings_path.parent.mkdir(parents=True, exist_ok=True)
with open(column_mappings_path, "w", encoding="utf-8") as f:
    json.dump(column_mappings, f, indent=2)

print(f"✅ column_mappings.json saved to {column_mappings_path}")
print(f"   Holdings: {len(column_mappings['holdings']['columns'])} columns")
print(f"   Trades: {len(column_mappings['trades']['columns'])} columns")
print("\n   Key mappings:")
if 'plytd' in column_mappings['holdings']['columns']:
    print("     PL_YTD → plytd")
if 'mvbase' in column_mappings['holdings']['columns']:
    print("     MV_Base → mvbase")

✅ column_mappings.json saved to /Users/suren/Desktop/untitled folder/loop-task/notebook2/column_mappings.json
   Holdings: 25 columns
   Trades: 31 columns

   Key mappings:
     PL_YTD → plytd
     MV_Base → mvbase
