In [8]:
import pandas as pd
import os
from pathlib import Path

DATA_DIR = Path(os.getcwd()).parent.parent / "data"

In [None]:
pre_sep2024_transactions_raw_df = pd.read_csv(DATA_DIR / "vanguard_raw_pre_sep2024_transactions.csv")
pre_sep2024_prices_raw_df = pd.read_csv(DATA_DIR / "vanguard_raw_pre_sep2024_prices.csv")
post_sep2024_raw_df = pd.read_csv(DATA_DIR / "vanguard_raw_post_sep2024.csv")

### Dataframe Schema Information

#### Vanguard Pre Sep 2024 Transactions DataFrame
- **Columns**: ['Subcategory', 'Total World', 'Large Cap', 'Mid Cap', 'Small Cap', 'International', 'Emerging Markets', 'Inflation Protected', 'Tax Exempt', 'International.1', 'Gold', 'Real Estate']
- **Structure**: 
  - Row 0: Asset tickers (VT, VOO, VO, VB, VXUS, VWO, VTIP, VTEB, BNDX, AAAU, VNQ)
  - Row 1: Subcategory totals
  - Rows 2+: Date-indexed transaction data with share quantities

#### Vanguard Pre Sep 2024 Prices DataFrame
- **Columns**: ['Category', 'Stocks', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Bonds', 'Unnamed: 8', 'Unnamed: 9', 'Gold', 'Real Estate']
- **Structure**:
  - Row 0: Category totals
  - Row 1: Subcategory labels
  - Row 2: Asset tickers (VT, VOO, VO, VB, VXUS, VWO, VTIP, VTEB, BNDX, AAAU, VNQ)
  - Row 3: Subcategory totals
  - Rows 4+: Date-indexed price data with dollar amounts

#### Vanguard Post Sep 2024 DafaFrame
- **Columns**: ['Account Number', 'Trade Date', 'Settlement Date', 'Transaction Type', 'Transaction Description', 'Investment Name', 'Symbol', 'Shares', 'Share Price', 'Principal Amount', 'Commissions and Fees', 'Net Amount', 'Accrued Interest', 'Account Type']

## Pre Sep 2024 - Data Loaded from Gsheets

In [None]:
# Extract tickers from the asset row (index 0 in transactions, index 2 in prices)
tx_tickers = pre_sep2024_transactions_raw_df.iloc[0, 1:].values  # Skip first column
price_tickers = pre_sep2024_prices_raw_df.iloc[2, 1:].values    # Skip first column

# Get the data starting from the actual transaction/price rows
tx_data = pre_sep2024_transactions_raw_df.iloc[2:].copy()  # Start from index 2
price_data = pre_sep2024_prices_raw_df.iloc[4:].copy()     # Start from index 4

tx_data = tx_data.reset_index(drop=True)
price_data = price_data.reset_index(drop=True)

In [11]:
display(tx_tickers)
price_tickers

array(['VT', 'VOO', 'VO', 'VB', 'VXUS', 'VWO', 'VTIP', 'VTEB', 'BNDX',
       'AAAU', 'VNQ'], dtype=object)

array(['VT', 'VOO', 'VO', 'VB', 'VXUS', 'VWO', 'VTIP', 'VTEB', 'BNDX',
       'AAAU', 'VNQ'], dtype=object)

In [12]:
# Create the combined data
pre_sep2024_data = []
for i in range(len(tx_data)):
    date = tx_data.iloc[i, 0]  # First column is date
    
    # Iterate through each ticker column
    for j, ticker in enumerate(tx_tickers):
        if pd.isna(ticker):  # Skip NaN tickers
            continue
            
        # Get shares from transactions (j+1 because we skip the date column)
        shares = tx_data.iloc[i, j+1]
        
        # Get price from prices dataframe (j+1 because we skip the date column)
        price_str = price_data.iloc[i, j+1] if i < len(price_data) else None
        
        # Only add row if both shares and price exist and are not NaN
        if not pd.isna(shares) and not pd.isna(price_str) and shares != 0:
            # Clean price string (remove $ and commas)
            if isinstance(price_str, str):
                price = float(price_str.replace('$', '').replace(',', ''))
            else:
                price = float(price_str) if not pd.isna(price_str) else None # type: ignore
                
            if price is not None:
                pre_sep2024_data.append({
                    'date': date,
                    'ticker': ticker,
                    'price': price,
                    'shares': shares
                })

In [13]:
# Create the final dataframe
pre_sep2024_df = pd.DataFrame(pre_sep2024_data)

pre_sep2024_df['date'] = pd.to_datetime(pre_sep2024_df['date'])
pre_sep2024_df = pre_sep2024_df.sort_values(['date', 'ticker']).reset_index(drop=True)

pre_sep2024_df.head(10)

Unnamed: 0,date,ticker,price,shares
0,2021-06-03,VB,223.6,1
1,2021-06-03,VOO,384.13,1
2,2021-06-03,VT,102.79,6
3,2021-06-03,VWO,54.61,1
4,2021-06-18,AAAU,17.65,5
5,2021-06-18,BNDX,57.1,1
6,2021-06-18,VNQ,102.99,1
7,2021-06-18,VO,232.39,1
8,2021-06-18,VTEB,55.33,2
9,2021-06-18,VTIP,52.11,3


## Post Sep 2024 - Data Loaded from Vanguard Export

In [None]:
post_sep2024_data = []

for _, row in post_sep2024_raw_df.iterrows():
    if row['Transaction Type'] == 'Buy' and not pd.isna(row['Shares']) and not pd.isna(row['Share Price']):
        post_sep2024_data.append({
            'date': pd.to_datetime(row['Trade Date']),
            'ticker': row['Symbol'],
            'price': row['Share Price'],
            'shares': row['Shares']
        })

post_sep2024_df = pd.DataFrame(post_sep2024_data)
post_sep2024_df = post_sep2024_df.sort_values(['date', 'ticker']).reset_index(drop=True)
post_sep2024_df.head(10)