# Mortgage Lending - Exploratory Data Analysis

Dataset: `COCO-MEETUP-OSLO.PUBLIC.MORTGAGE_LENDING` (369,245 rows)

In [None]:
import pandas as pd
import snowflake.snowpark as snowpark
from snowflake.snowpark.context import get_active_session

# Obtain the Snowpark session via get_active_session(); the later cells that
# read the source table and open the model registry all go through this object.
session = get_active_session()

In [None]:
# Pull the MORTGAGE_LENDING table into a pandas DataFrame so that the rest of
# the notebook can analyse it with pandas.
mortgage_table = session.table('"COCO-MEETUP-OSLO".PUBLIC.MORTGAGE_LENDING')
df = mortgage_table.to_pandas()

# Structural overview: the frame's shape followed by the per-column dtypes.
print(f"Shape: {df.shape}")
print(f"\nColumn Types:\n{df.dtypes}")

In [None]:
# Count nulls per column once and reuse the result for both the absolute and
# the percentage view.
null_counts = df.isnull().sum()
null_share_pct = (null_counts / len(df) * 100).round(2)

print("Missing Values:")
print(null_counts)
print("\nMissing Value Percentages:")
print(null_share_pct)

In [None]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

## Statistical Summary

In [None]:
# Summary statistics of the frame, using describe()'s default settings.
df.describe()

In [None]:
# Columns treated as categorical for the frequency listing below.
categorical_columns = [
    'LOAN_TYPE_NAME',
    'LOAN_PURPOSE_NAME',
    'COUNTY_NAME',
    'MORTGAGERESPONSE',
]

print("Categorical Column Value Counts:")
for column_name in categorical_columns:
    # One section per column: a header line, then the frequency of each value.
    frequencies = df[column_name].value_counts()
    print(f"\n--- {column_name} ---")
    print(frequencies)

## Distribution Analysis

In [None]:
# Relative frequency, in percent, of every value in the three key columns.
loan_type_dist = df['LOAN_TYPE_NAME'].value_counts(normalize=True) * 100
loan_purpose_dist = df['LOAN_PURPOSE_NAME'].value_counts(normalize=True) * 100
response_dist = df['MORTGAGERESPONSE'].value_counts(normalize=True) * 100

# Print each distribution under its heading, rounded to two decimals.
distribution_report = (
    ("Loan Type Distribution:", loan_type_dist),
    ("\nLoan Purpose Distribution:", loan_purpose_dist),
    ("\nMortgage Response Distribution:", response_dist),
)
for heading, distribution in distribution_report:
    print(heading)
    print(distribution.round(2))

In [None]:
# Descriptive statistics of applicant income, one row per loan type.
print("Income Statistics by Loan Type:")
income_by_loan_type = df.groupby('LOAN_TYPE_NAME')['APPLICANT_INCOME_000S']
income_by_loan_type.describe()

In [None]:
# Descriptive statistics of the loan amount, one row per loan purpose.
print("Loan Amount Statistics by Loan Purpose:")
amount_by_purpose = df.groupby('LOAN_PURPOSE_NAME')['LOAN_AMOUNT_000S']
amount_by_purpose.describe()

## Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Select seaborn's 'whitegrid' style for the plots drawn in the cells below.
sns.set_style('whitegrid')

In [None]:
# Side-by-side 50-bin histograms of applicant income and loan amount, with
# null values removed before plotting.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

histogram_specs = [
    # (column, title, x-axis label, extra keyword arguments for hist)
    ('APPLICANT_INCOME_000S', 'Applicant Income Distribution', 'Income (000s)', {}),
    ('LOAN_AMOUNT_000S', 'Loan Amount Distribution', 'Loan Amount (000s)', {'color': 'orange'}),
]
for ax, (column, title, xlabel, extra_kwargs) in zip(axes, histogram_specs):
    ax.hist(df[column].dropna(), bins=50, edgecolor='black', **extra_kwargs)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side bar charts of record counts per loan type and per loan purpose.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

bar_specs = [
    # (column, title, x-axis label, bar colour)
    ('LOAN_TYPE_NAME', 'Loan Type Counts', 'Loan Type', 'steelblue'),
    ('LOAN_PURPOSE_NAME', 'Loan Purpose Counts', 'Loan Purpose', 'coral'),
]
for ax, (column, title, xlabel, bar_color) in zip(axes, bar_specs):
    df[column].value_counts().plot(kind='bar', ax=ax, color=bar_color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    # Tilt the category labels by 45 degrees.
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# The 15 most frequent counties; value_counts() already orders by frequency.
county_counts = df['COUNTY_NAME'].value_counts()
top_counties = county_counts.iloc[:15]

# Horizontal bar chart, labelled through the Axes that the plot call returns.
plt.figure(figsize=(12, 6))
ax = top_counties.plot(kind='barh', color='teal')
ax.set_title('Top 15 Counties by Loan Count')
ax.set_xlabel('Number of Loans')
ax.set_ylabel('County')
plt.tight_layout()
plt.show()

In [None]:
# Box plot of the loan amount, one box per loan purpose.
plt.figure(figsize=(8, 6))
ax = sns.boxplot(data=df, x='LOAN_PURPOSE_NAME', y='LOAN_AMOUNT_000S')
ax.set(
    title='Loan Amount by Loan Purpose',
    xlabel='Loan Purpose',
    ylabel='Loan Amount (000s)',
)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Plot a random subset of rows instead of the whole frame. The sample size is
# capped at the number of available rows, because DataFrame.sample raises a
# ValueError when asked for more rows than exist without replacement, and the
# draw is seeded so the figure is identical on every run (same seed as the
# train/test split and the model).
scatter_sample = df.sample(n=min(5000, len(df)), random_state=42)

plt.figure(figsize=(8, 6))
sns.scatterplot(data=scatter_sample, x='APPLICANT_INCOME_000S', y='LOAN_AMOUNT_000S',
                hue='LOAN_PURPOSE_NAME', alpha=0.5)
plt.title('Income vs Loan Amount (5000 sample)')
plt.xlabel('Applicant Income (000s)')
plt.ylabel('Loan Amount (000s)')
# Anchor the legend outside the axes, to the right of the plot area.
plt.legend(title='Loan Purpose', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between income, loan amount and the response column.
numeric_cols = [
    'APPLICANT_INCOME_000S',
    'LOAN_AMOUNT_000S',
    'MORTGAGERESPONSE',
]
numeric_view = df[numeric_cols]
corr_matrix = numeric_view.corr()

# Annotated heatmap with the colour map centred on zero.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
ax.set_title('Correlation Matrix')
plt.tight_layout()
plt.show()

---
# Machine Learning: Mortgage Approval Prediction

Target: `MORTGAGERESPONSE` (binary classification)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import numpy as np

## Data Preprocessing

In [None]:
# Build the modelling frame: remove the LOAN_ID and TS columns, then discard
# rows with no applicant income. drop() returns a new DataFrame, so `df`
# itself is left untouched.
columns_to_discard = ['LOAN_ID', 'TS']
ml_df = (
    df.drop(columns=columns_to_discard)
      .dropna(subset=['APPLICANT_INCOME_000S'])
)
print(f"Shape after dropping nulls: {ml_df.shape}")

In [None]:
# One encoder per categorical column, kept under its own name so the fitted
# class lists remain accessible afterwards.
le_loan_type, le_loan_purpose, le_county = LabelEncoder(), LabelEncoder(), LabelEncoder()

encoding_plan = (
    # (encoder, source column, new integer-coded column)
    (le_loan_type, 'LOAN_TYPE_NAME', 'LOAN_TYPE_ENCODED'),
    (le_loan_purpose, 'LOAN_PURPOSE_NAME', 'LOAN_PURPOSE_ENCODED'),
    (le_county, 'COUNTY_NAME', 'COUNTY_ENCODED'),
)
for encoder, source_column, encoded_column in encoding_plan:
    ml_df[encoded_column] = encoder.fit_transform(ml_df[source_column])

print("Encoding complete")
print(f"Loan Types: {list(le_loan_type.classes_)}")
print(f"Loan Purposes: {list(le_loan_purpose.classes_)}")

## Feature Selection and Train/Test Split

In [None]:
# Model inputs: the two numeric columns plus the three encoded categoricals.
feature_cols = [
    'APPLICANT_INCOME_000S',
    'LOAN_AMOUNT_000S',
    'LOAN_TYPE_ENCODED',
    'LOAN_PURPOSE_ENCODED',
    'COUNTY_ENCODED',
]
X = ml_df[feature_cols]
y = ml_df['MORTGAGERESPONSE']

# 80/20 split with a fixed seed, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print("\nTarget distribution in train set:")
train_target_share = y_train.value_counts(normalize=True)
print(train_target_share.round(3))

## Model Training

In [None]:
# Hyper-parameters of the random forest, gathered in one place.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 5,
    'random_state': 42,
    'n_jobs': -1,
}

# fit() returns the estimator itself, so construction and training chain.
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)
print("Model training complete")

## Model Evaluation

In [None]:
# Hard class predictions and the score taken from the second predict_proba
# column, both on the held-out test set.
y_pred = rf_model.predict(X_test)
class_probabilities = rf_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

print("Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {test_accuracy:.4f}")

test_roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC: {test_roc_auc:.4f}")

In [None]:
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap
# with integer cell labels.
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set(title='Confusion Matrix', xlabel='Predicted', ylabel='Actual')
plt.show()

In [None]:
# Pair every feature name with the importance reported by the fitted forest,
# then order the table from most to least important.
importance_table = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

# Bar chart of the sorted importances, followed by the raw table.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=feature_importance, x='importance', y='feature')
ax.set(title='Feature Importance', xlabel='Importance', ylabel='Feature')
plt.tight_layout()
plt.show()
print(feature_importance)

## Register Model to Snowflake ML Registry

In [None]:
from snowflake.ml.registry import Registry

# Open the model registry in the PUBLIC schema of the "oslo-demo" database,
# using the notebook's Snowpark session.
# NOTE(review): the training data is read from the "COCO-MEETUP-OSLO" database,
# while the registry points at "oslo-demo" — confirm this is intentional.
reg = Registry(session=session, database_name='"oslo-demo"', schema_name="PUBLIC")

In [None]:
# The first 100 training rows are handed to the registry as sample input.
sample_input = X_train.iloc[:100]

# Arguments for registering the trained forest as version V1.
registration_kwargs = {
    'model': rf_model,
    'model_name': "MORTGAGE_APPROVAL_MODEL",
    'version_name': "V1",
    'sample_input_data': sample_input,
    'comment': "Random Forest model for mortgage approval prediction",
}
model_version = reg.log_model(**registration_kwargs)
print(f"Model registered: {model_version.model_name} version {model_version.version_name}")

In [None]:
# Attach the test-set metrics to the registered model version. Each score is
# computed immediately before it is stored, in the order listed.
metric_plan = (
    # (metric name, scoring function, predictions it is computed from)
    ("accuracy", accuracy_score, y_pred),
    ("roc_auc", roc_auc_score, y_pred_proba),
)
for metric_name, scorer, predictions in metric_plan:
    model_version.set_metric(metric_name, scorer(y_test, predictions))
print("Metrics logged to registry")

In [None]:
# Display what the registry reports from show_models(), under a heading.
print("\nRegistered Models:")
reg.show_models()