# Holding Details

## Data Processing

In [None]:
import pandas as pd
import numpy as np
import io

In [None]:
from google.colab import files
uploaded = files.upload()

Saving Holding Details.csv to Holding Details.csv


In [None]:
df = pd.read_csv(io.BytesIO(uploaded['Holding Details.csv']))
print(df.head())

     PORTFOLIOCODE CURRENCYCODE   CURRENCY LANGUAGECODE  ISSUERNAME  \
0  ASTFIENHCRMODEL          USD  US Dollar        en-US         NaN   
1  ASTFIENHCRMODEL          USD  US Dollar        en-US         NaN   
2  ASTFIENHCRMODEL          USD  US Dollar        en-US         NaN   
3  ASTFIENHCRMODEL          USD  US Dollar        en-US         NaN   
4  ASTFIENHCRMODEL          USD  US Dollar        en-US         NaN   

         ISSUENAME ISSUEDISPLAYNAME    COSTBASIS  QUANTITY  \
0  WESTPAC BANKING  WESTPAC BANKING  179861.4000  180000.0   
1   ANGLO AMERICAN   ANGLO AMERICAN  318323.2000  320000.0   
2  COCA-COLA FEMSA  COCA-COLA FEMSA  182021.3322  174000.0   
3   ANHEUSER-BUSCH   ANHEUSER-BUSCH  261632.6140  250000.0   
4   BMW US CAP LLC   BMW US CAP LLC  174275.1155  175000.0   

   MARKETVALUEWITHOUTACCRUEDINCOME  ...  CUSTOMCLASSIFICATION3  \
0                        162253.80  ...                    NaN   
1                        319590.40  ...                    NaN   
2 

## Data Analyzing

In [None]:
import pandas as pd

# Load the data
df = pd.read_csv("Holding Details.csv")

# Step 1: Schema Summary - Understand non-null % and type for each column
schema_summary = pd.DataFrame({
    "Column Name": df.columns,
    "Non-Null Count": df.notnull().sum().values,
    "Total Rows": len(df),
    "Percent Filled": (df.notnull().sum().values / len(df) * 100).round(2),
    "Data Type": df.dtypes.values
}).sort_values(by="Percent Filled", ascending=False)

# Step 2: Focus on usable + missing critical fields
required_cols = [
    'PORTFOLIOCODE', 'CURRENCYCODE', 'CURRENCY', 'LANGUAGECODE',
    'ISSUEDISPLAYNAME', 'PORTFOLIOWEIGHT', 'PRICE',
    'PRIMARYSUBINDUSTRYNAME', 'PRIMARYSECTORNAME', 'HISTORYDATE'
]

missing_cols = [col for col in required_cols if col not in df.columns]
existing_required_df = schema_summary[schema_summary['Column Name'].isin(required_cols)]

# Step 3: Output recommendation message for pipeline planning
print("✅ Holdings Schema Analysis Complete.\n")
print("💡 Recommended Columns for Synthetic Pipeline:")
for col in required_cols:
    status = "✅ Available" if col in df.columns else "❌ Missing"
    print(f" - {col}: {status}")

# Step 4: Show summary table
existing_required_df.reset_index(drop=True, inplace=True)
existing_required_df

✅ Holdings Schema Analysis Complete.

💡 Recommended Columns for Synthetic Pipeline:
 - PORTFOLIOCODE: ✅ Available
 - CURRENCYCODE: ✅ Available
 - CURRENCY: ✅ Available
 - LANGUAGECODE: ✅ Available
 - ISSUEDISPLAYNAME: ✅ Available
 - PORTFOLIOWEIGHT: ✅ Available
 - PRICE: ✅ Available
 - PRIMARYSUBINDUSTRYNAME: ✅ Available
 - PRIMARYSECTORNAME: ✅ Available
 - HISTORYDATE: ✅ Available


Unnamed: 0,Column Name,Non-Null Count,Total Rows,Percent Filled,Data Type
0,PORTFOLIOCODE,100,100,100.0,object
1,CURRENCYCODE,100,100,100.0,object
2,CURRENCY,100,100,100.0,object
3,LANGUAGECODE,100,100,100.0,object
4,ISSUEDISPLAYNAME,100,100,100.0,object
5,HISTORYDATE,100,100,100.0,object
6,PRIMARYSECTORNAME,100,100,100.0,object
7,PRICE,99,100,99.0,float64
8,PORTFOLIOWEIGHT,0,100,0.0,float64
9,PRIMARYSUBINDUSTRYNAME,0,100,0.0,float64


## Data Generating

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

def generate_synthetic_holdings(n=10):
    np.random.seed(42)
    tickers = ["AAPL", "GOOG", "MSFT", "PLD", "AMT", "EQIX", "DLR", "SPG", "O", "PSA"]
    sectors = ["Technology", "Industrial", "Real Estate", "Telecom", "Health Care"]
    subindustries = ["Data Center REITs", "Retail REITs", "Industrial REITs", "Health Care REITs", "Telecom Tower REITs"]

    df_syn = pd.DataFrame({
        "PORTFOLIOCODE": ["SYNTHETICMODEL"] * n,
        "CURRENCYCODE": ["USD"] * n,
        "CURRENCY": ["US Dollar"] * n,
        "LANGUAGECODE": ["en-US"] * n,
        "TICKER": np.random.choice(tickers, n),
        "ISSUEDISPLAYNAME": [f"Synthetic Corp {i+1}" for i in range(n)],
        "PORTFOLIOWEIGHT": np.round(np.random.uniform(0.5, 5.0, n), 2),
        "PRICE": np.round(np.random.uniform(50, 500, n), 2),
        "PRIMARYSECTORNAME": np.random.choice(sectors, n),
        "PRIMARYSUBINDUSTRYNAME": np.random.choice(subindustries, n),
        "HISTORYDATE": [datetime.today().strftime('%Y-%m-%d')] * n
    })
    return df_syn

# Generate and preview synthetic data
synthetic_df = generate_synthetic_holdings(10)
synthetic_df.head()

Unnamed: 0,PORTFOLIOCODE,CURRENCYCODE,CURRENCY,LANGUAGECODE,TICKER,ISSUEDISPLAYNAME,PORTFOLIOWEIGHT,PRICE,PRIMARYSECTORNAME,PRIMARYSUBINDUSTRYNAME,HISTORYDATE
0,SYNTHETICMODEL,USD,US Dollar,en-US,DLR,Synthetic Corp 1,3.21,244.38,Technology,Retail REITs,2025-06-25
1,SYNTHETICMODEL,USD,US Dollar,en-US,PLD,Synthetic Corp 2,3.69,181.05,Real Estate,Retail REITs,2025-06-25
2,SYNTHETICMODEL,USD,US Dollar,en-US,SPG,Synthetic Corp 3,0.59,325.33,Health Care,Data Center REITs,2025-06-25
3,SYNTHETICMODEL,USD,US Dollar,en-US,AMT,Synthetic Corp 4,4.86,112.77,Real Estate,Retail REITs,2025-06-25
4,SYNTHETICMODEL,USD,US Dollar,en-US,DLR,Synthetic Corp 5,4.25,181.47,Health Care,Telecom Tower REITs,2025-06-25


## Data Validating

In [None]:
def validate_holdings_data(df):
    issues = {}
    required_cols = [
        'PORTFOLIOCODE', 'CURRENCYCODE', 'TICKER', 'ISSUEDISPLAYNAME',
        'PORTFOLIOWEIGHT', 'PRICE', 'PRIMARYSECTORNAME', 'PRIMARYSUBINDUSTRYNAME', 'HISTORYDATE'
    ]

    print("🔍 Starting Holdings Data Validation...\n")

    for col in required_cols:
        nulls = df[col].isnull().sum()
        if nulls > 0:
            issues[col] = f"❌ {nulls} missing values"
            print(f"[{col}] ❌ FAILED: {nulls} null entries found.")
        else:
            print(f"[{col}] ✅ PASSED: No missing values.")

    # Value range checks
    if not df['PORTFOLIOWEIGHT'].between(0, 100).all():
        issues['PORTFOLIOWEIGHT'] = "⚠️ Out-of-bounds values (expected 0–100)"
        print("[PORTFOLIOWEIGHT] ⚠️ WARNING: Some values fall outside the 0–100 range.")
    else:
        print("[PORTFOLIOWEIGHT] ✅ PASSED: All values within expected range.")

    if not df['PRICE'].between(0, 10000).all():
        issues['PRICE'] = "⚠️ Price range seems off"
        print("[PRICE] ⚠️ WARNING: Detected potential outlier prices.")
    else:
        print("[PRICE] ✅ PASSED: All prices within expected range.")

    print("\n📋 Validation Summary:")
    if issues:
        for k, v in issues.items():
            print(f" - {k}: {v}")
    else:
        print("✅ All checks passed. Data is ready for ingestion.")

    return issues

# Run the validator
validation_issues = validate_holdings_data(synthetic_df)

🔍 Starting Holdings Data Validation...

[PORTFOLIOCODE] ✅ PASSED: No missing values.
[CURRENCYCODE] ✅ PASSED: No missing values.
[TICKER] ✅ PASSED: No missing values.
[ISSUEDISPLAYNAME] ✅ PASSED: No missing values.
[PORTFOLIOWEIGHT] ✅ PASSED: No missing values.
[PRICE] ✅ PASSED: No missing values.
[PRIMARYSECTORNAME] ✅ PASSED: No missing values.
[PRIMARYSUBINDUSTRYNAME] ✅ PASSED: No missing values.
[HISTORYDATE] ✅ PASSED: No missing values.
[PORTFOLIOWEIGHT] ✅ PASSED: All values within expected range.
[PRICE] ✅ PASSED: All prices within expected range.

📋 Validation Summary:
✅ All checks passed. Data is ready for ingestion.


## Data Pipeline Plan (?)

In [None]:
pipeline_plan = {
    "input_source": "synthetic_df (DataFrame)",
    "transformations": [
        "Validate columns",
        "Convert HISTORYDATE to YYYY-MM-DD",
        "Ensure all required fields"
    ],
    "target_table": "AST_REALESTATE_DB.DBO.HOLDINGSDETAILS",
    "destination": "Snowflake (via snowflake.connector)",
    "notes": "Insert only required fields, others can be NULL"
}

import json
print("📦 Pipeline Plan:\n")
print(json.dumps(pipeline_plan, indent=4))

📦 Pipeline Plan:

{
    "input_source": "synthetic_df (DataFrame)",
    "transformations": [
        "Validate columns",
        "Convert HISTORYDATE to YYYY-MM-DD",
        "Ensure all required fields"
    ],
    "target_table": "AST_REALESTATE_DB.DBO.HOLDINGSDETAILS",
    "destination": "Snowflake (via snowflake.connector)",
    "notes": "Insert only required fields, others can be NULL"
}


## Upload to Snowflake (?)

In [None]:
# !pip install snowflake-connector-python
import snowflake.connector

# Replace with your actual Snowflake credentials
conn = snowflake.connector.connect(
    user='YOUR_USERNAME',
    password='YOUR_PASSWORD',
    account='YOUR_ACCOUNT_REGION',
    warehouse='YOUR_WAREHOUSE',
    database='AST_REALESTATE_DB',
    schema='DBO'
)

cursor = conn.cursor()

# Prepare insert statement
insert_sql = """
INSERT INTO HOLDINGSDETAILS (
    PORTFOLIOCODE, CURRENCYCODE, CURRENCY, LANGUAGECODE,
    TICKER, ISSUEDISPLAYNAME, PORTFOLIOWEIGHT, PRICE,
    PRIMARYSECTORNAME, PRIMARYSUBINDUSTRYNAME, HISTORYDATE
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# Convert DataFrame to tuples
data_tuples = list(synthetic_df.itertuples(index=False, name=None))

# Execute in batch
cursor.executemany(insert_sql, data_tuples)
conn.commit()
cursor.close()
conn.close()
print("✅ Data successfully uploaded to Snowflake.")

## Simulate Base Holding Data

In [4]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Number of simulated REITs
num_holdings = 10

# Generate weights that sum to ~1.00 using Dirichlet distribution
weights = np.round(np.random.dirichlet(np.ones(num_holdings), size=1)[0], 2)

# Build base synthetic dataset
base_data = pd.DataFrame({
    'Holding': [f'REIT_{i+1}' for i in range(num_holdings)],
    'Weight': weights,
    'Price': np.round(np.random.uniform(30, 120, num_holdings), 2),
    'Shares Held': np.random.randint(100000, 500000, num_holdings),
    'EPS_now': np.round(np.random.uniform(1.5, 5.0, num_holdings), 2),
    'EPS_5y_ago': np.round(np.random.uniform(1.0, 3.5, num_holdings), 2),
    'Book Value/Share': np.round(np.random.uniform(25, 60, num_holdings), 2),
})

# Preview
base_data.head()

Unnamed: 0,Holding,Weight,Price,Shares Held,EPS_now,EPS_5y_ago,Book Value/Share
0,REIT_1,0.05,31.85,256730,3.66,1.96,51.44
1,REIT_2,0.29,117.29,446622,2.84,1.04,39.88
2,REIT_3,0.13,104.92,484681,4.94,1.58,32.28
3,REIT_4,0.09,49.11,249503,3.13,1.6,44.87
4,REIT_5,0.02,46.36,230523,4.51,2.71,26.1


## Calculate Per-Holding Values

In [5]:
# Calculate derived financial metrics for each synthetic REIT
base_data['Market Value'] = base_data['Price'] * base_data['Shares Held']
base_data['P/E'] = base_data['Price'] / base_data['EPS_now']
base_data['P/B'] = base_data['Price'] / base_data['Book Value/Share']
base_data['ROE'] = base_data['EPS_now'] / base_data['Book Value/Share']
base_data['Earnings Growth Rate'] = ((base_data['EPS_now'] / base_data['EPS_5y_ago'])**(1/5)) - 1

# Show result
base_data.head()

Unnamed: 0,Holding,Weight,Price,Shares Held,EPS_now,EPS_5y_ago,Book Value/Share,Market Value,P/E,P/B,ROE,Earnings Growth Rate
0,REIT_1,0.05,31.85,256730,3.66,1.96,51.44,8176850.5,8.702186,0.619168,0.071151,0.133039
1,REIT_2,0.29,117.29,446622,2.84,1.04,39.88,52384294.38,41.299296,2.941073,0.071214,0.222523
2,REIT_3,0.13,104.92,484681,4.94,1.58,32.28,50852730.52,21.238866,3.25031,0.153036,0.25607
3,REIT_4,0.09,49.11,249503,3.13,1.6,44.87,12253092.33,15.690096,1.094495,0.069757,0.143628
4,REIT_5,0.02,46.36,230523,4.51,2.71,26.1,10687046.28,10.279379,1.776245,0.172797,0.107239
