In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Download from Kaggle: "Give Me Some Credit"
# https://www.kaggle.com/competitions/GiveMeSomeCredit/data
# Or use this synthetic data generator:

def generate_synthetic_credit_data(n_samples=10000):
    """Generate synthetic credit application data"""
    np.random.seed(42)
    
    data = {
        'applicant_id': [f'APP_{i:06d}' for i in range(n_samples)],
        'age': np.random.randint(18, 75, n_samples),
        'annual_income': np.random.lognormal(10.5, 0.8, n_samples),
        'debt_to_income_ratio': np.random.uniform(0, 2, n_samples),
        'num_credit_lines': np.random.randint(0, 15, n_samples),
        'num_late_payments': np.random.poisson(0.5, n_samples),
        'credit_utilization': np.random.uniform(0, 1.5, n_samples),
        'months_since_last_delinquency': np.random.randint(0, 120, n_samples),
        'num_credit_inquiries': np.random.randint(0, 10, n_samples),
        'purchase_amount': np.random.uniform(500, 10000, n_samples),
    }
    
    df = pd.DataFrame(data)
    
    # Generate target (default risk) based on features
    default_score = (
        (df['debt_to_income_ratio'] > 0.5).astype(int) * 0.3 +
        (df['num_late_payments'] > 2).astype(int) * 0.3 +
        (df['credit_utilization'] > 0.8).astype(int) * 0.2 +
        (df['age'] < 25).astype(int) * 0.1 +
        (df['num_credit_inquiries'] > 5).astype(int) * 0.1
    )
    
    # Add noise and convert to binary
    df['default_risk'] = (default_score + np.random.uniform(0, 0.3, n_samples) > 0.5).astype(int)
    
    return df

# Generate data
df = generate_synthetic_credit_data(10000)
df.to_csv('data/credit_applications.csv', index=False)

print(f"Dataset created: {len(df)} samples")
print(f"Default rate: {df['default_risk'].mean():.2%}")
print("\nFirst few rows:")
print(df.head())

OSError: Cannot save file into a non-existent directory: 'data'

In [None]:
# Save train/test split for later monitoring
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.to_csv('data/train_reference.csv', index=False)
test_df.to_csv('data/test_data.csv', index=False)

print("âœ… Data preparation complete!")