Prompt 1 - Create a Python notebook pd.ipynb that includes:
	A statistical model to calculate the probability of default for retail mortgage customers.
	Generate synthetic data that I can use to run the above model.
	Include all the necessary linters.
Prompt 2 - I am not an expert in Python. Add more explicit linters. Also, include more parameters that are relevant for Netherlands retail mortgages.
Prompt 3 - You included one plot. Can there be other relevant plots?
Prompt 4 - Include 2 ESG-related model parameters.

In [None]:
import warnings

import flake8
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylint
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Silence only deprecation-style noise for cleaner notebook output.
# A blanket "ignore" would also hide actionable warnings, e.g. scikit-learn's
# ConvergenceWarning when the logistic regression solver fails to converge.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Define key factors as run parameters
def generate_synthetic_data(n_samples=1000, random_state=42):
    """
    Generate synthetic data for retail mortgage customers in the Netherlands.

    Eleven borrower/loan features (including the ESG score and the green
    mortgage flag) are drawn independently, combined into a weighted default
    probability, and a binary ``Default`` outcome is sampled from it.

    Args:
        n_samples: Number of customers (rows) to generate. Must be >= 1.
        random_state: Seed for the random number generator. The same seed
            always yields the same DataFrame.

    Returns:
        pandas.DataFrame with ``n_samples`` rows and the columns Income,
        LoanAmount, CreditScore, EmploymentYears, Age, LoanToValue,
        DebtToIncome, InterestRate, NumDependents, ESGScore, GreenMortgage
        and Default (0 or 1).

    Raises:
        ValueError: If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be at least 1, got {n_samples}")

    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by the legacy np.random.* calls, but does not reseed the
    # process-wide generator that other code may rely on.
    rng = np.random.RandomState(random_state)

    # Feature distributions (the draw order is fixed to keep results
    # reproducible for a given seed).
    income = rng.normal(60000, 20000, n_samples)  # Annual income in EUR
    loan_amount = rng.normal(250000, 75000, n_samples)  # Mortgage loan amount in EUR
    credit_score = rng.randint(300, 850, n_samples)  # Credit score range
    employment_years = rng.randint(0, 40, n_samples)  # Years of employment
    age = rng.randint(18, 75, n_samples)  # Age of borrower
    loan_to_value = rng.uniform(0.6, 1.2, n_samples)  # Loan-to-value ratio
    debt_to_income = rng.uniform(0.1, 0.7, n_samples)  # Debt-to-income ratio
    interest_rate = rng.uniform(1.0, 5.0, n_samples)  # Mortgage interest rate in %
    num_dependents = rng.randint(0, 5, n_samples)  # Number of dependents
    esg_score = rng.uniform(0, 100, n_samples)  # ESG score (0-100 scale)
    green_mortgage = rng.randint(0, 2, n_samples)  # 1 if green mortgage, 0 otherwise

    # Default probability as a weighted sum of risk factors; the weights
    # add up to 1.0.
    prob_default = (
        0.22 * (1 - income / income.max())
        + 0.30 * (1 - credit_score / 850)
        + 0.12 * (loan_to_value / loan_to_value.max())
        + 0.12 * (debt_to_income / debt_to_income.max())
        + 0.08 * (interest_rate / interest_rate.max())
        + 0.10 * (1 - esg_score / 100)  # Higher ESG score reduces default risk
        + 0.06 * (1 - green_mortgage)  # Green mortgage reduces default risk
    )
    # The normal income draw can be negative, which pushes the income term
    # above its weight and can lift the sum past 1; binomial() rejects
    # probabilities outside [0, 1], so clamp before sampling.
    prob_default = np.clip(prob_default, 0.0, 1.0)

    default = rng.binomial(1, prob_default)

    # Create DataFrame
    return pd.DataFrame({
        'Income': income,
        'LoanAmount': loan_amount,
        'CreditScore': credit_score,
        'EmploymentYears': employment_years,
        'Age': age,
        'LoanToValue': loan_to_value,
        'DebtToIncome': debt_to_income,
        'InterestRate': interest_rate,
        'NumDependents': num_dependents,
        'ESGScore': esg_score,
        'GreenMortgage': green_mortgage,
        'Default': default,
    })

# Generate dataset
data = generate_synthetic_data()

# Train-test split: every column except the target is a model feature
X = data[['Income', 'LoanAmount', 'CreditScore', 'EmploymentYears', 'Age', 'LoanToValue', 'DebtToIncome', 'InterestRate', 'NumDependents', 'ESGScore', 'GreenMortgage']]
y = data['Default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features before fitting. The raw columns differ by orders of
# magnitude (EUR amounts vs. ratios), which keeps the solver from converging
# within its iteration budget and makes the fitted coefficients incomparable
# across features. The scaler is fitted on the training split only so that no
# information from the test split leaks into the model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression model (coefficients are per standard deviation
# of each feature because the inputs are standardized)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Predictions: hard class labels and the probability of the default class
y_pred = model.predict(X_test_scaled)
y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"ROC AUC: {roc_auc:.2f}")
print("Confusion Matrix:")
print(conf_matrix)

# All figures share the same canvas size.
FIGURE_SIZE = (10, 6)

# Visualization 1: Feature Importance
# Bar chart of the fitted model coefficients, largest first.
coefficients = pd.DataFrame(
    model.coef_.T,
    index=X.columns,
    columns=['Coefficient'],
).sort_values(by='Coefficient', ascending=False)
plt.figure(figsize=FIGURE_SIZE)
sns.barplot(x=coefficients.index, y=coefficients['Coefficient'])
plt.xticks(rotation=45)
plt.title("Feature Importance")
plt.show()

# Visualization 2: Confusion Matrix Heatmap
# Cell annotations are the integer counts from the confusion matrix.
plt.figure(figsize=FIGURE_SIZE)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix Heatmap')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Visualization 3: Distribution of Predicted Default Probabilities
# Histogram (30 bins) with a kernel density overlay.
plt.figure(figsize=FIGURE_SIZE)
sns.histplot(y_pred_prob, bins=30, kde=True)
plt.title('Distribution of Predicted Default Probabilities')
plt.xlabel('Predicted Probability of Default')
plt.ylabel('Count')
plt.show()

# Visualizations 4-6: Loan Amount, Credit Score and ESG Score vs. Default Status
# Each entry is (data column, y-axis label, figure title); one boxplot is
# drawn per entry, split by the Default flag.
boxplot_specs = [
    ('LoanAmount', 'Loan Amount', 'Loan Amount Distribution by Default Status'),
    ('CreditScore', 'Credit Score', 'Credit Score Distribution by Default Status'),
    ('ESGScore', 'ESG Score', 'ESG Score Distribution by Default Status'),
]
for column, axis_label, plot_title in boxplot_specs:
    plt.figure(figsize=FIGURE_SIZE)
    sns.boxplot(x=data['Default'], y=data[column])
    plt.title(plot_title)
    plt.xlabel('Default (0 = No, 1 = Yes)')
    plt.ylabel(axis_label)
    plt.show()

# Visualization 7: Green Mortgage vs. Default Status
# Customer counts per green-mortgage flag, coloured by the Default flag.
plt.figure(figsize=FIGURE_SIZE)
sns.countplot(x=data['GreenMortgage'], hue=data['Default'])
plt.title('Default Distribution by Green Mortgage Status')
plt.xlabel('Green Mortgage (0 = No, 1 = Yes)')
plt.ylabel('Count')
plt.show()
