Create a Python notebook, pd.ipynb, that includes:
- A statistical model (Logistic Regression) to calculate the probability of default for retail mortgage customers.
- Use model parameters relevant to the Dutch market. Also include two ESG-related model parameters.
- A function to generate synthetic data for running the model.
- Add explicit linters to support beginner Python developers.
- Generate multiple relevant visualizations, each with an explicit description.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
import warnings

# Linter configuration: the pylint pragma below switches OFF three checks
# (invalid-name, too-few-public-methods, missing-function-docstring) —
# presumably so that notebook-style code does not trigger them.
# pylint: disable=invalid-name, too-few-public-methods, missing-function-docstring
# Silence every Python warning so the notebook output stays uncluttered.
# NOTE(review): this blanket filter also hides warnings raised by the libraries
# used below — consider narrowing it to specific warning categories.
warnings.filterwarnings("ignore")

# Function to generate synthetic data
def generate_synthetic_data(n_samples=1000, random_state=42):
    """Generate a synthetic retail-mortgage dataset with a binary default flag.

    Five mortgage features and two ESG features are drawn independently for
    each customer. They are combined into a linear risk score with Gaussian
    noise, and customers whose score lies above the sample median are
    labelled as defaults, so roughly half of the rows have ``Default == 1``.

    Args:
        n_samples (int): Number of customers (rows) to generate.
        random_state (int or None): Seed for the random generator. The same
            seed always yields the same data; ``None`` gives a fresh,
            non-reproducible draw.

    Returns:
        pandas.DataFrame: One row per customer with the columns
        'Loan-to-Value', 'Income', 'Credit Score', 'Interest Rate',
        'Debt-to-Income', 'Energy Efficiency', 'Sustainability Loan'
        and the 0/1 target 'Default'.
    """
    # Use a private generator instead of np.random.seed(): seeding the global
    # generator would silently change random numbers drawn elsewhere.
    rng = np.random.RandomState(random_state)

    # Mortgage-relevant factors
    loan_to_value = rng.uniform(50, 100, n_samples)  # Loan-to-value ratio (50%-100%)
    # NOTE: a normal distribution can, rarely, produce a negative income.
    income = rng.normal(50000, 15000, n_samples)  # Annual income in EUR
    # randint excludes its upper bound, so use 851 to make 850 reachable.
    credit_score = rng.randint(300, 851, n_samples)  # Credit score 300-850 inclusive
    interest_rate = rng.uniform(1.5, 6.0, n_samples)  # Interest rate in %
    debt_to_income = rng.uniform(20, 50, n_samples)  # Debt-to-income ratio in %

    # ESG factors
    # Upper bound 11 (exclusive) so the scale really covers 1-10.
    energy_efficiency = rng.randint(1, 11, n_samples)  # Scale 1-10 (higher = better)
    sustainability_loan = rng.choice([0, 1], size=n_samples, p=[0.7, 0.3])  # Sustainability loan flag

    # Linear risk score: positive weights raise the risk, negative weights
    # lower it. This is a score, not a probability (it is not bounded to 0-1).
    risk_score = (
        0.3 * (loan_to_value / 100)
        - 0.2 * (income / 100000)
        - 0.25 * (credit_score / 850)
        + 0.2 * (interest_rate / 5)
        + 0.25 * (debt_to_income / 50)
        - 0.15 * (energy_efficiency / 10)
        - 0.1 * sustainability_loan
        + rng.normal(0, 0.1, n_samples)
    )

    # Binary outcome (1 = default, 0 = no default): above-median scores default.
    default = (risk_score > np.median(risk_score)).astype(int)

    return pd.DataFrame({
        'Loan-to-Value': loan_to_value,
        'Income': income,
        'Credit Score': credit_score,
        'Interest Rate': interest_rate,
        'Debt-to-Income': debt_to_income,
        'Energy Efficiency': energy_efficiency,
        'Sustainability Loan': sustainability_loan,
        'Default': default,
    })

# Build the synthetic mortgage portfolio using the generator's default arguments
data = generate_synthetic_data()

# Separate the binary 'Default' target from the predictor columns
y = data['Default']
X = data.drop(columns=['Default'])

# Hold out 20% of the customers for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test rows
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the logistic regression on the standardized training data
model = LogisticRegression().fit(X_train_scaled, y_train)

# Score the held-out customers: the probability of class 1 (default) and the
# hard 0/1 class label for each test row
y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]
y_pred = model.predict(X_test_scaled)

# Evaluation metrics: the confusion matrix and accuracy use the class labels,
# the ROC AUC uses the predicted probabilities
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report the results
print(f'Accuracy: {accuracy:.2f}')
print(f'ROC AUC: {roc_auc:.2f}')
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', report)

# Visualization: confusion-matrix heatmap.
# Rows show the actual outcome and columns the predicted outcome; every cell
# is annotated with the number of test customers that fall into it.
class_labels = ['No Default', 'Default']
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Visualization: ROC curve.
# Plots the true positive rate against the false positive rate on the test
# set; the AUC computed earlier is shown in the legend.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
auc_label = f'ROC Curve (AUC = {roc_auc:.2f})'

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=auc_label)
# Dashed gray diagonal from (0, 0) to (1, 1) as a visual reference line
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Visualization: logistic-regression coefficients per feature.
# The model was fitted on standardized inputs, so each bar is the coefficient
# for one feature on that standardized scale, sorted from lowest to highest.
coefficients = pd.Series(model.coef_[0], index=X.columns)
feature_importance = coefficients.sort_values()

plt.figure(figsize=(8, 6))
feature_importance.plot.barh(color='teal')
plt.title('Feature Importance in Logistic Regression')
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.show()
