# Mortgage Lending Data - Exploratory Analysis
**Table:** COCO_DEMO.PUBLIC.MORTGAGE_LENDING_DEMO_DATA

In [None]:
import snowflake.snowpark.functions as F
from snowflake.snowpark.context import get_active_session
import pandas as pd
import matplotlib.pyplot as plt

# Attach to the notebook's active Snowpark session and bind the demo table.
session = get_active_session()
df = session.table('COCO_DEMO.PUBLIC.MORTGAGE_LENDING_DEMO_DATA')

# Report the table size with thousands separators.
row_total = df.count()
print(f'Total rows: {row_total:,}')

## Data Sample & Summary Statistics

In [None]:
# Preview the first 10 rows as a pandas DataFrame; as the last expression it is rendered as the cell output.
df.limit(10).to_pandas()

In [None]:
# Summary statistics from Snowpark's describe(), converted to pandas so the notebook renders them as a table.
df.describe().to_pandas()

## Loan Type Distribution

In [None]:
# Count applications per loan type, then chart the aggregated result locally.
loan_type_counts = df.group_by('LOAN_TYPE_NAME').agg(F.count('*').alias('count'))
loan_type_dist = loan_type_counts.to_pandas()

# The aggregate column is read back under the name 'COUNT'.
loan_type_ax = loan_type_dist.plot(
    kind='bar',
    x='LOAN_TYPE_NAME',
    y='COUNT',
    title='Loan Type Distribution',
    legend=False,
)
loan_type_ax.set_xlabel('Loan Type')
loan_type_ax.set_ylabel('Count')
plt.tight_layout()
plt.show()

## Loan Purpose Distribution

In [None]:
# Count applications per loan purpose, then chart the aggregated result locally.
loan_purpose_counts = df.group_by('LOAN_PURPOSE_NAME').agg(F.count('*').alias('count'))
loan_purpose_dist = loan_purpose_counts.to_pandas()

# The aggregate column is read back under the name 'COUNT'.
loan_purpose_ax = loan_purpose_dist.plot(
    kind='bar',
    x='LOAN_PURPOSE_NAME',
    y='COUNT',
    title='Loan Purpose Distribution',
    legend=False,
)
loan_purpose_ax.set_xlabel('Loan Purpose')
loan_purpose_ax.set_ylabel('Count')
plt.tight_layout()
plt.show()

## Mortgage Response Analysis

In [None]:
# Share of applications per MORTGAGERESPONSE value, drawn as a pie chart.
response_dist = (
    df.group_by('MORTGAGERESPONSE')
    .agg(F.count('*').alias('count'))
    .to_pandas()
)

response_ax = response_dist.plot(
    kind='pie',
    y='COUNT',
    labels=response_dist['MORTGAGERESPONSE'],
    autopct='%1.1f%%',
    title='Mortgage Response Distribution',
)
# Blank out the y-axis label that the pie plot would otherwise show.
response_ax.set_ylabel('')
plt.tight_layout()
plt.show()

## Income vs Loan Amount

In [None]:
# Plot a 5,000-row sample rather than the full table.
scatter_columns = ['APPLICANT_INCOME_000S', 'LOAN_AMOUNT_000S', 'MORTGAGERESPONSE']
sample_df = df.sample(n=5000).select(*scatter_columns).to_pandas()

plt.figure(figsize=(10, 6))
# Points are coloured by the response value mapped through the 'coolwarm' colormap.
response_points = plt.scatter(
    sample_df['APPLICANT_INCOME_000S'],
    sample_df['LOAN_AMOUNT_000S'],
    c=sample_df['MORTGAGERESPONSE'],
    alpha=0.5,
    cmap='coolwarm',
)
plt.xlabel('Applicant Income (000s)')
plt.ylabel('Loan Amount (000s)')
plt.title('Income vs Loan Amount (colored by Response)')
plt.colorbar(response_points, label='Mortgage Response')
plt.tight_layout()
plt.show()

## Top Counties by Application Volume

In [None]:
# Rank counties by application count and keep the 15 largest.
county_counts = df.group_by('COUNTY_NAME').agg(F.count('*').alias('count'))
top_counties = county_counts.order_by(F.col('count').desc()).limit(15)
county_dist = top_counties.to_pandas()

county_fig, county_ax = plt.subplots(figsize=(12, 6))
county_ax.barh(county_dist['COUNTY_NAME'], county_dist['COUNT'])
county_ax.set_xlabel('Number of Applications')
county_ax.set_ylabel('County')
county_ax.set_title('Top 15 Counties by Application Volume')
# Flip the axis so the first (largest) row is drawn at the top.
county_ax.invert_yaxis()
plt.tight_layout()
plt.show()

## Loan Amount Distribution

In [None]:
# Pull the loan-amount column into pandas and bin it into a 50-bucket histogram.
loan_amounts = df.select('LOAN_AMOUNT_000S').to_pandas()

hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
hist_ax.hist(loan_amounts['LOAN_AMOUNT_000S'], bins=50, edgecolor='black')
hist_ax.set_xlabel('Loan Amount (000s)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Loan Amounts')
plt.tight_layout()
plt.show()

## Machine Learning: Mortgage Approval Prediction

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import numpy as np

# Columns used for modelling: five candidate features plus the target.
ml_columns = [
    'LOAN_TYPE_NAME',
    'LOAN_PURPOSE_NAME',
    'APPLICANT_INCOME_000S',
    'LOAN_AMOUNT_000S',
    'COUNTY_NAME',
    'MORTGAGERESPONSE',
]
ml_df = df.select(*ml_columns).to_pandas()

# Report size and per-column null counts before any cleaning.
print(f"Dataset shape: {ml_df.shape}")
print(f"Missing values:\n{ml_df.isna().sum()}")

In [None]:
# Keep only rows where every selected column is populated.
ml_df = ml_df.dropna()
print(f"After dropping nulls: {ml_df.shape}")

# One LabelEncoder per categorical column, each kept under its own name.
le_loan_type, le_loan_purpose, le_county = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Fit each encoder on its source column and store the integer codes alongside it.
for encoder, source_col, encoded_col in (
    (le_loan_type, 'LOAN_TYPE_NAME', 'LOAN_TYPE_ENC'),
    (le_loan_purpose, 'LOAN_PURPOSE_NAME', 'LOAN_PURPOSE_ENC'),
    (le_county, 'COUNTY_NAME', 'COUNTY_ENC'),
):
    ml_df[encoded_col] = encoder.fit_transform(ml_df[source_col])

# Model inputs: encoded categoricals plus the two numeric columns; the target is MORTGAGERESPONSE.
feature_cols = [
    'LOAN_TYPE_ENC',
    'LOAN_PURPOSE_ENC',
    'APPLICANT_INCOME_000S',
    'LOAN_AMOUNT_000S',
    'COUNTY_ENC',
]
X = ml_df[feature_cols]
y = ml_df['MORTGAGERESPONSE']

print(f"Features: {feature_cols}")
print(f"Target distribution:\n{y.value_counts(normalize=True)}")

In [None]:
# 80/20 split, stratified on the target so both parts keep the same class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

# Random forest baseline; fit() returns the estimator itself.
rf_settings = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42, 'n_jobs': -1}
model = RandomForestClassifier(**rf_settings).fit(X_train, y_train)
print("Model training complete!")

In [None]:
# Hard predictions plus the probability column at index 1 for ROC-AUC.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

rf_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {rf_accuracy:.4f}")

rf_roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC-AUC: {rf_roc_auc:.4f}")

rf_report = classification_report(y_test, y_pred)
print(f"\nClassification Report:\n{rf_report}")

In [None]:
# Pair each feature with the forest's importance score, most important first.
feature_importance = pd.DataFrame(
    {'Feature': feature_cols, 'Importance': model.feature_importances_}
).sort_values('Importance', ascending=False)

importance_fig, importance_ax = plt.subplots(figsize=(8, 5))
importance_ax.barh(feature_importance['Feature'], feature_importance['Importance'])
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Feature Importance - Mortgage Approval Model')
# Flip the axis so the top-ranked feature is drawn first.
importance_ax.invert_yaxis()
plt.tight_layout()
plt.show()

## Register Model in Snowflake

In [None]:
from snowflake.ml.registry import Registry

# Switch the session to the role, database and schema that hold the registry.
registry_location = {"database_name": "COCO_DEMO", "schema_name": "PUBLIC"}
session.use_role("AICOLLEGE")
session.use_database(registry_location["database_name"])
session.use_schema(registry_location["schema_name"])

reg = Registry(session=session, **registry_location)

# A handful of training rows is passed as sample input when logging the model.
sample_input = X_train.head(5)

# Log the random forest as version V1 of MORTGAGE_APPROVAL_MODEL.
rf_registration = {
    "model_name": "MORTGAGE_APPROVAL_MODEL",
    "version_name": "V1",
    "sample_input_data": sample_input,
    "conda_dependencies": ["scikit-learn"],
    "comment": "Random Forest classifier for mortgage approval prediction",
}
mv = reg.log_model(model, **rf_registration)

print(f"Model registered: {mv.model_name} version {mv.version_name}")

In [None]:
# List the functions exposed by the registered V1 model version.
print("Available functions:")
available_functions = mv.show_functions()
print(available_functions)

## XGBoost Model

In [None]:
import xgboost as xgb

# Gradient-boosted alternative trained on the same split as the random forest.
xgb_settings = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'random_state': 42,
    'n_jobs': -1,
    'eval_metric': 'logloss',
}
xgb_model = xgb.XGBClassifier(**xgb_settings)

xgb_model.fit(X_train, y_train)
print("XGBoost training complete!")

In [None]:
# Hard predictions plus the probability column at index 1 for ROC-AUC.
y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

print("=== XGBoost Results ===")

xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy: {xgb_accuracy:.4f}")

xgb_roc_auc = roc_auc_score(y_test, y_proba_xgb)
print(f"ROC-AUC: {xgb_roc_auc:.4f}")

xgb_report = classification_report(y_test, y_pred_xgb)
print(f"\n{xgb_report}")

In [None]:
# Side-by-side metrics for both classifiers on the shared test split.
print("=== Model Comparison ===")
comparison_rows = (
    ("Random Forest", y_pred, y_proba),
    ("XGBoost", y_pred_xgb, y_proba_xgb),
)
for model_label, predictions, probabilities in comparison_rows:
    accuracy = accuracy_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, probabilities)
    # Labels are padded to 13 characters so the two rows line up.
    print(f"{model_label:<13} - Accuracy: {accuracy:.4f}, ROC-AUC: {roc_auc:.4f}")

## Register XGBoost Model & Set as Default

In [None]:
# Log the XGBoost classifier as a second version under the same model name.
xgb_registration = {
    "model_name": "MORTGAGE_APPROVAL_MODEL",
    "version_name": "V2_XGBOOST",
    "sample_input_data": sample_input,
    "conda_dependencies": ["xgboost"],
    "comment": "XGBoost classifier for mortgage approval prediction",
}
mv_xgb = reg.log_model(xgb_model, **xgb_registration)

print(f"Model registered: {mv_xgb.model_name} version {mv_xgb.version_name}")

In [None]:
# Make the XGBoost version the default for MORTGAGE_APPROVAL_MODEL.
default_version_name = "V2_XGBOOST"
model_ref = reg.get_model("MORTGAGE_APPROVAL_MODEL")
model_ref.default = default_version_name

# Read the default back from the registry object to confirm the change.
print(f"Default version set to: {model_ref.default}")

## Model Explainability (SHAP)

In [None]:
import shap

# Compute SHAP values for the test split with a tree explainer on the XGBoost model.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test)

explained_count = len(shap_values)
print(f"SHAP values computed for {explained_count} samples")

In [None]:
# Default SHAP summary plot of the test-set SHAP values, labelled with feature_cols.
shap.summary_plot(shap_values, X_test, feature_names=feature_cols, show=True)

In [None]:
# Same SHAP values as a bar-style summary (plot_type='bar').
shap.summary_plot(shap_values, X_test, feature_names=feature_cols, plot_type='bar', show=True)

In [None]:
# Print the caption first so it appears above the figure it introduces.
print("\nExplanation for first test sample:")

# Build the explanation for the first test row. Supplying `data` gives the
# waterfall plot the row's feature values to show next to each feature name.
first_sample_explanation = shap.Explanation(
    values=shap_values[0],
    base_values=explainer.expected_value,
    data=X_test.iloc[0].values,
    feature_names=feature_cols,
)
shap.plots.waterfall(first_sample_explanation, show=True)

## SQL Inference with XGBoost Model

In [None]:
# Log the XGBoost model once more, this time with the WAREHOUSE target platform, for SQL inference.
from snowflake.ml.registry import Registry

reg = Registry(session=session, database_name="COCO_DEMO", schema_name="PUBLIC")

# sample_input is the encoded training sample:
# LOAN_TYPE_ENC, LOAN_PURPOSE_ENC, APPLICANT_INCOME_000S, LOAN_AMOUNT_000S, COUNTY_ENC
sql_version_settings = {
    "model": xgb_model,
    "model_name": "MORTGAGE_APPROVAL_MODEL",
    "version_name": "V3_SQL",
    "target_platforms": ["WAREHOUSE"],
    "sample_input_data": sample_input,
    "comment": "XGBoost model for SQL inference",
}

# NOTE(review): model_ref is rebound here to the value returned by log_model;
# this cell does not change the model's default version.
model_ref = reg.log_model(**sql_version_settings)

print(f"Model logged: {model_ref.model_name} version {model_ref.version_name}")

In [None]:
# Build a small synthetic scoring set whose value ranges are taken from X_test.
import pandas as pd
import numpy as np

print("X_test statistics:")
print(X_test.describe())

# Fixed seed: the draws below run in column order, so the output is repeatable.
np.random.seed(42)
n_samples = 20


def _random_codes(column, cap=None):
    """Draw n_samples integer codes in [0, max(column)], optionally capping the exclusive upper bound."""
    upper = X_test[column].max() + 1
    if cap is not None:
        upper = min(upper, cap)
    return np.random.randint(0, upper, n_samples)


def _random_amounts(column):
    """Draw n_samples values uniformly between the column's min and its 95th percentile, truncated to int."""
    low = X_test[column].min()
    high = X_test[column].quantile(0.95)
    return np.random.uniform(low, high, n_samples).astype(int)


# Dict entries are evaluated top to bottom, which fixes the order of the random draws.
synthetic_data = pd.DataFrame({
    'LOAN_TYPE_ENC': _random_codes('LOAN_TYPE_ENC'),
    'LOAN_PURPOSE_ENC': _random_codes('LOAN_PURPOSE_ENC'),
    'APPLICANT_INCOME_000S': _random_amounts('APPLICANT_INCOME_000S'),
    'LOAN_AMOUNT_000S': _random_amounts('LOAN_AMOUNT_000S'),
    # County codes are limited to values below 100.
    'COUNTY_ENC': _random_codes('COUNTY_ENC', cap=100),
})

print(f"\nSynthetic test data ({n_samples} samples):")
synthetic_data

In [None]:
# Upload synthetic data to Snowflake for SQL inference
from snowflake.snowpark.types import IntegerType, StructType, StructField

# Push the pandas frame to Snowflake, replacing any previous copy of the table.
synthetic_table_name = 'COCO_DEMO.PUBLIC.SYNTHETIC_MORTGAGE_TEST_DATA'
synthetic_sp_df = session.create_dataframe(synthetic_data)
synthetic_sp_df.write.mode('overwrite').save_as_table(synthetic_table_name)

print(f"Synthetic test data uploaded to {synthetic_table_name}")

In [None]:
%%sql -r synthetic_data_check
-- Read back every row of the uploaded synthetic table to confirm the write.
SELECT * FROM COCO_DEMO.PUBLIC.SYNTHETIC_MORTGAGE_TEST_DATA

In [None]:
%%sql -r sql_predictions
-- Score the synthetic rows with the V3_SQL version, which was logged with the
-- WAREHOUSE target platform for SQL inference. Calling the model name directly
-- would run the model's default version instead.
WITH MORTGAGE_MODEL_V3 AS MODEL COCO_DEMO.PUBLIC.MORTGAGE_APPROVAL_MODEL VERSION V3_SQL
SELECT 
    LOAN_TYPE_ENC,
    LOAN_PURPOSE_ENC,
    APPLICANT_INCOME_000S,
    LOAN_AMOUNT_000S,
    COUNTY_ENC,
    -- Arguments are passed in the same order as the columns of the training sample input.
    MORTGAGE_MODEL_V3!PREDICT(
        LOAN_TYPE_ENC, 
        LOAN_PURPOSE_ENC, 
        APPLICANT_INCOME_000S, 
        LOAN_AMOUNT_000S, 
        COUNTY_ENC
    ) AS PREDICTION
FROM COCO_DEMO.PUBLIC.SYNTHETIC_MORTGAGE_TEST_DATA

In [None]:
# Display SQL inference results
print("=== SQL Inference Results ===")
print(f"Total predictions: {len(sql_predictions)}")
print(f"\nPrediction distribution:")
print(sql_predictions['PREDICTION'].value_counts())
print(f"\nFull results:")
sql_predictions

## Cleanup Code
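Remove all objects created for this demo: the agent, the semantic view, and finally the entire COCO_DEMO database. **Warning:** dropping the database is destructive — it also removes the source table, the registered model versions, and the synthetic test table stored in COCO_DEMO.PUBLIC.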

In [None]:
%%sql -r drop_agent_result
-- Remove the demo agent; IF EXISTS keeps the statement from failing when it is already gone.
DROP AGENT IF EXISTS COCO_DEMO.PUBLIC.COCO_DEMO_AGENT

In [None]:
%%sql -r drop_semantic_view_result
-- Semantic views are dropped with DROP SEMANTIC VIEW; DROP VIEW targets regular views.
-- NOTE(review): the object name is spelled "MORTAGE"; it must match the name used at creation - confirm.
DROP SEMANTIC VIEW IF EXISTS COCO_DEMO.PUBLIC.COCO_MORTAGE_SEMANTIC_VIEW

In [None]:
%%sql -r drop_database_result
-- Destructive: drops the COCO_DEMO database and everything stored in it. Run this last.
DROP DATABASE IF EXISTS COCO_DEMO