In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
import random
# Number of samples
num_samples = 40000

# Seed for reproducibility
np.random.seed(42)

def generate_correlated_features(num_samples):
    # Generate base features
    age = np.random.normal(40, 12, num_samples).clip(18, 80).astype(int)
    experience = (age - 18 - np.random.normal(4, 2, num_samples).clip(0)).clip(0).astype(int)
    education_level = np.random.choice(['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate'], num_samples, p=[0.3, 0.2, 0.3, 0.15, 0.05])
    
    # Education affects income and credit score
    edu_impact = {'High School': 0, 'Associate': 0.1, 'Bachelor': 0.2, 'Master': 0.3, 'Doctorate': 0.4}
    edu_factor = np.array([edu_impact[level] for level in education_level])
    
    # Generate correlated income, credit score, and employment status
    base_income = np.random.lognormal(10.5, 0.6, num_samples) * (1 + edu_factor) * (1 + experience / 100)
    income_noise = np.random.normal(0, 0.1, num_samples)

    low_income_max = 190 * 12  # Convert monthly $190 to annual income
    middle_income_min = 190 * 12
    middle_income_max = 500 * 12
    high_income_min = 500 * 12

    # Define proportions for each income range
    low_income_ratio = 0.3  # Average of 30-40%
    middle_income_ratio = 0.5  # Average of 40-50%
    high_income_ratio = 0.20  # Average of 10-20%

    # Calculate the number of samples for each range
    low_income_count = int(num_samples * low_income_ratio)
    middle_income_count = int(num_samples * middle_income_ratio)
    high_income_count = num_samples - low_income_count - middle_income_count  # Remaining samples

    # Generate income values for each range
    low_income = np.random.uniform(1500, low_income_max, low_income_count).astype(int)  # $1500/year is the minimum
    middle_income = np.random.uniform(middle_income_min, middle_income_max, middle_income_count).astype(int)
    high_income = np.random.uniform(high_income_min, 36000, high_income_count).astype(int)  # $36,000/year is the maximum

    # Combine income groups and shuffle for randomness
    annual_income = np.concatenate([low_income, middle_income, high_income])
    np.random.shuffle(annual_income)


    credit_score_base = 300 + 300 * stats.beta.rvs(5, 1.5, size=num_samples)
    credit_score = (credit_score_base + edu_factor * 100 + experience * 1.5 + income_noise * 100).clip(300, 850).astype(int)
    
    employment_status_probs = np.column_stack([
        0.9 - edu_factor * 0.3,  # Employed
        0.05 + edu_factor * 0.2,  # Self-Employed
        0.05 + edu_factor * 0.1   # Unemployed
    ])
    employment_status = np.array(['Employed', 'Self-Employed', 'Unemployed'])[np.argmax(np.random.random(num_samples)[:, np.newaxis] < employment_status_probs.cumsum(axis=1), axis=1)]
    
    return age, experience, education_level, annual_income, credit_score, employment_status

# def generate_random_dates(num_samples, start_date="2018-01-01", end_date="2023-12-31"):
#     """
#     Generate random dates within a specified range.

#     Args:
#         num_samples (int): Number of random dates to generate.
#         start_date (str): Start of the date range (inclusive).
#         end_date (str): End of the date range (inclusive).

#     Returns:
#         list: List of random datetime objects.
#     """
#     start_date = datetime.strptime(start_date, "%Y-%m-%d")
#     end_date = datetime.strptime(end_date, "%Y-%m-%d")
#     date_range = (end_date - start_date).days
    
#     random_dates = [
#         start_date + timedelta(days=random.randint(0, date_range))
#         for _ in range(num_samples)
#     ]
#     return random_dates

def generate_time_based_features(num_samples):
    start_date = datetime(2018, 1, 1)
    dates = [start_date + timedelta(days=i) for i in range(num_samples)]
    return dates

age, experience, education_level, annual_income, credit_score, employment_status = generate_correlated_features(num_samples)
application_dates = generate_time_based_features(num_samples)

data = {
    'ApplicationDate': application_dates,
    'Age': age,
    'AnnualIncome': annual_income,
    'CreditScore': credit_score,
    'EmploymentStatus': employment_status,
    'EducationLevel': education_level,
    'Experience': experience,
    'LoanAmount': np.random.lognormal(10, 0.5, num_samples).astype(int),
    'LoanDuration': np.random.choice([12, 24, 36, 48, 60, 72, 84, 96, 108, 120], num_samples, p=[0.05, 0.1, 0.2, 0.2, 0.2, 0.1, 0.05, 0.05, 0.025, 0.025]),
    'MaritalStatus': np.random.choice(['Single', 'Married', 'Divorced', 'Widowed'], num_samples, p=[0.3, 0.5, 0.15, 0.05]),
    'NumberOfDependents': np.random.choice([0, 1, 2, 3, 4, 5], num_samples, p=[0.3, 0.25, 0.2, 0.15, 0.07, 0.03]),
    'HomeOwnershipStatus': np.random.choice(['Own', 'Rent', 'Mortgage', 'Other'], num_samples, p=[0.2, 0.3, 0.4, 0.1]),
    'MonthlyDebtPayments': np.random.lognormal(6, 0.5, num_samples).astype(int),
    'CreditCardUtilizationRate': np.random.beta(2, 5, num_samples),
    'NumberOfOpenCreditLines': np.random.poisson(3, num_samples).clip(0, 15).astype(int),
    'NumberOfCreditInquiries': np.random.poisson(1, num_samples).clip(0, 10).astype(int),
    'DebtToIncomeRatio': np.random.beta(2, 5, num_samples),
    'BankruptcyHistory': np.random.choice([0, 1], num_samples, p=[0.95, 0.05]),
    'LoanPurpose': np.random.choice(['Home', 'Auto', 'Education', 'Debt Consolidation', 'Other'], num_samples, p=[0.3, 0.2, 0.15, 0.25, 0.1]),
    'PreviousLoanDefaults': np.random.choice([0, 1], num_samples, p=[0.9, 0.1]),
    'PaymentHistory': np.random.poisson(24, num_samples).clip(0, 60).astype(int),
    'LengthOfCreditHistory': np.random.randint(1, 30, num_samples),
    'SavingsAccountBalance': np.random.lognormal(8, 1, num_samples).astype(int),
    'CheckingAccountBalance': np.random.lognormal(7, 1, num_samples).astype(int),
    'TotalAssets': np.random.lognormal(11, 1, num_samples).astype(int),
    'TotalLiabilities': np.random.lognormal(10, 1, num_samples).astype(int),
    'MonthlyIncome': annual_income / 12,
    'UtilityBillsPaymentHistory': np.random.beta(8, 2, num_samples),
    'JobTenure': np.random.poisson(5, num_samples).clip(0, 40).astype(int),
}

# Create DataFrame
df = pd.DataFrame(data)

# Ensure TotalAssets is always greater than or equal to the sum of SavingsAccountBalance and CheckingAccountBalance
df['TotalAssets'] = np.maximum(df['TotalAssets'], df['SavingsAccountBalance'] + df['CheckingAccountBalance'])

# Add more complex derived features
min_net_worth = 1000  # Set a minimum net worth
df['NetWorth'] = np.maximum(df['TotalAssets'] - df['TotalLiabilities'], min_net_worth)

# More realistic interest rate based on credit score, loan amount, and loan duration
df['BaseInterestRate'] = 0.03 + (850 - df['CreditScore']) / 2000 + df['LoanAmount'] / 400000 + df['LoanDuration'] / 1200
df['InterestRate'] = df['BaseInterestRate'] * (1 + np.random.normal(0, 0.1, num_samples)).clip(0.8, 1.2)

df['MonthlyLoanPayment'] = (df['LoanAmount'] * (df['InterestRate']/12)) / (1 - (1 + df['InterestRate']/12)**(-df['LoanDuration']))
df['TotalDebtToIncomeRatio'] = (df['MonthlyDebtPayments'] + df['MonthlyLoanPayment']) / df['MonthlyIncome']

# Create a more complex loan approval rule
def loan_approval_rule(row):
    score = 0
    score += (row['CreditScore'] - 600) / 250  # Credit score factor
    score += (24000 - row['AnnualIncome']) / 24000  # Income factor
    score += (row['TotalDebtToIncomeRatio'] - 0.4) * 2  # DTI factor
    score += (row['LoanAmount'] - 1800) / 9000  # Loan amount factor
    score += (row['InterestRate'] - 0.05) * 10  # Interest rate factor
    score += 0.5 if row['BankruptcyHistory'] == 1 else 0  # Bankruptcy penalty
    score += 0.3 if row['PreviousLoanDefaults'] == 1 else 0  # Previous default penalty
    score += 0.2 if row['EmploymentStatus'] == 'Unemployed' else 0  # Employment status factor
    score -= 0.1 if row['HomeOwnershipStatus'] in ['Own', 'Mortgage'] else 0  # Home ownership factor
    score -= row['PaymentHistory'] / 120  # Payment history factor
    score -= row['LengthOfCreditHistory'] / 60  # Length of credit history factor
    score -= row['NetWorth'] / 5000  # Net worth factor
    
    # Age factor (slight preference for middle-aged applicants)
    score += abs(row['Age'] - 40) / 100
    
    # Experience factor
    score -= row['Experience'] / 200
    
    # Education factor
    edu_score = {'High School': 0.2, 'Associate': 0.1, 'Bachelor': 0, 'Master': -0.1, 'Doctorate': -0.2}
    score += edu_score[row['EducationLevel']]
    
    # Seasonal factor (higher approval rates in spring/summer)
    month = row['ApplicationDate'].month
    score -= 0.1 if 3 <= month <= 8 else 0
    
    # Random factor to add some unpredictability
    score += np.random.normal(0, 0.1)
    
    return 1 if score < 1 else 0  # Adjust this threshold to change overall approval rate

df['LoanApproved'] = df.apply(loan_approval_rule, axis=1)

# Add some noise and outliers
noise_mask = np.random.choice([True, False], num_samples, p=[0.01, 0.99])
df.loc[noise_mask, 'AnnualIncome'] = (df.loc[noise_mask, 'AnnualIncome'] * np.random.uniform(1.5, 2.0, noise_mask.sum())).astype(int)

low_net_worth_mask = df['NetWorth'] == min_net_worth
df.loc[low_net_worth_mask, 'NetWorth'] += np.random.randint(0, 1000, size=low_net_worth_mask.sum())

# Print some statistics
print(f"Loan Approval Rate: {df['LoanApproved'].mean():.2%}")
print(f"Average Credit Score: {df['CreditScore'].mean():.0f}")
print(f"Average Annual Income: ${df['AnnualIncome'].mean():.0f}")
print(f"Average Loan Amount: ${df['LoanAmount'].mean():.0f}")
print(f"Average Total Debt-to-Income Ratio: {df['TotalDebtToIncomeRatio'].mean():.2f}")
print(f"Average Interest Rate: {df['InterestRate'].mean():.2%}")

def assign_credit_score_risk(credit_score):
    if credit_score >= 750: return 1
    elif 700 <= credit_score < 750: return 2
    elif 650 <= credit_score < 700: return 3
    elif 600 <= credit_score < 650: return 4
    else: return 5

def assign_dti_risk(dti):
    if dti < 0.20: return 1
    elif 0.20 <= dti < 0.30: return 2
    elif 0.30 <= dti < 0.40: return 3
    elif 0.40 <= dti < 0.50: return 4
    else: return 5

def assign_payment_history_risk(payment_history):
    if payment_history >= 99: return 1
    elif 97 <= payment_history < 99: return 2
    elif 95 <= payment_history < 97: return 3
    elif 90 <= payment_history < 95: return 4
    else: return 5

def assign_bankruptcy_risk(bankruptcy_history):
    return 5 if bankruptcy_history else 1

def assign_previous_defaults_risk(previous_defaults):
    if previous_defaults == 0: return 1
    elif previous_defaults == 1: return 3
    else: return 5

def assign_utilization_risk(utilization):
    if utilization < 0.20: return 1
    elif 0.20 <= utilization < 0.40: return 2
    elif 0.40 <= utilization < 0.60: return 3
    elif 0.60 <= utilization < 0.80: return 4
    else: return 5

def assign_credit_history_risk(length_of_history):
    if length_of_history >= 10: return 1
    elif 7 <= length_of_history < 10: return 2
    elif 5 <= length_of_history < 7: return 3
    elif 3 <= length_of_history < 5: return 4
    else: return 5

def assign_income_risk(annual_income):
    if annual_income >= 10800: return 1
    elif 4200 <= annual_income < 10800: return 2
    elif 2400 <= annual_income < 4200: return 3
    elif 1200 <= annual_income < 2400: return 4
    else: return 5

def assign_employment_risk(employment_status):
    if employment_status == 'Employed': return 1
    elif employment_status == 'Self-employed': return 2
    elif employment_status == 'Part-time': return 3
    else: return 4  # Unemployed or other

def assign_net_worth_risk(net_worth):
    if net_worth >= 50000: return 1
    elif 25000 <= net_worth < 50000: return 2
    elif 10000 <= net_worth < 20000: return 3
    elif 5000 <= net_worth < 10000: return 4
    else: return 5

# Refined overall risk calculation
def calculate_overall_risk(row):
    base_score = (
        assign_credit_score_risk(row['CreditScore']) * 3 +
        assign_dti_risk(row['DebtToIncomeRatio']) * 2 +
        assign_payment_history_risk(row['PaymentHistory']) * 2 +
        assign_bankruptcy_risk(row['BankruptcyHistory']) +
        assign_previous_defaults_risk(row['PreviousLoanDefaults']) +
        assign_utilization_risk(row['CreditCardUtilizationRate']) +
        assign_credit_history_risk(row['LengthOfCreditHistory']) +
        assign_income_risk(row['AnnualIncome']) * 3 +
        assign_employment_risk(row['EmploymentStatus']) +
        assign_net_worth_risk(row['NetWorth']) * 2
    )
    
    # Adjust score based on loan approval status
    if row['LoanApproved'] == 1:  # Assuming 1 means approved
        base_score *= 0.8  # Reduce risk score for approved loans
    
    return base_score

# Apply the refined risk calculation
df['RiskScore'] = df.apply(calculate_overall_risk, axis=1)

def assign_risk_level(score):
    if score > 60:
        return 'Low Risk'
    elif 40 <= score <= 60:
        return 'Medium Risk'
    else:
        return 'High Risk'

# # Apply the risk level categorization
df[' vRiskLevel'] = df['RiskScore'].apply(assign_risk_level)

# Save to CSV
df.to_csv('risk_level5.csv', index=False)
print("\nFocused synthetic data saved to 'focused_synthetic_loan_data.csv'")

# Display final feature count
print(f"\nTotal number of features (including label): {len(df.columns)}")
print("\nFeatures:")
for column in df.columns:
    print(f"- {column}")

Loan Approval Rate: 37.36%
Average Credit Score: 571
Average Annual Income: $6896
Average Loan Amount: $24965
Average Total Debt-to-Income Ratio: 5.26
Average Interest Rate: 27.68%

Focused synthetic data saved to 'focused_synthetic_loan_data.csv'

Total number of features (including label): 37

Features:
- ApplicationDate
- Age
- AnnualIncome
- CreditScore
- EmploymentStatus
- EducationLevel
- Experience
- LoanAmount
- LoanDuration
- MaritalStatus
- NumberOfDependents
- HomeOwnershipStatus
- MonthlyDebtPayments
- CreditCardUtilizationRate
- NumberOfOpenCreditLines
- NumberOfCreditInquiries
- DebtToIncomeRatio
- BankruptcyHistory
- LoanPurpose
- PreviousLoanDefaults
- PaymentHistory
- LengthOfCreditHistory
- SavingsAccountBalance
- CheckingAccountBalance
- TotalAssets
- TotalLiabilities
- MonthlyIncome
- UtilityBillsPaymentHistory
- JobTenure
- NetWorth
- BaseInterestRate
- InterestRate
- MonthlyLoanPayment
- TotalDebtToIncomeRatio
- LoanApproved
- RiskScore
-  vRiskLevel


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score

# Load the dataset
data = pd.read_csv("risk_level5.csv")

# Drop unnecessary columns
columns_to_drop = [
    "ApplicationDate", "EducationLevel", "Experience", "MaritalStatus", "NumberOfDependents",
    "HomeOwnershipStatus", "MonthlyDebtPayments", "CreditCardUtilizationRate",
    "NumberOfOpenCreditLines", "NumberOfCreditInquiries", "DebtToIncomeRatio",
    "BankruptcyHistory", "PreviousLoanDefaults", "PaymentHistory", "LengthOfCreditHistory",
    "SavingsAccountBalance", "CheckingAccountBalance", "TotalAssets", "TotalLiabilities",
    "MonthlyIncome", "UtilityBillsPaymentHistory", "JobTenure", "NetWorth",
    "BaseInterestRate", "MonthlyLoanPayment", "TotalDebtToIncomeRatio"
]
data = data.drop(columns=columns_to_drop, axis=1)

# Function to prepare data and train models
def train_model(target_column, drop_column, is_regression=False):
    print(f"\n=== Training for {target_column}, dropping {drop_column} ===")

    # Drop the specified column
    data_to_train = data.drop(columns=[drop_column])

    # Separate features and target
    X = data_to_train.drop(columns=[target_column])
    y = data_to_train[target_column]

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Preprocessing: Handle categorical and numeric features
    categorical_features = X.select_dtypes(include=["object"]).columns
    numeric_features = X.select_dtypes(include=["number"]).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_features),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ]
    )

    # Define models based on regression or classification
    if is_regression:
        models = {
            "Linear Regression": LinearRegression(),
            "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
            "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
        }
    else:
        models = {
            "Logistic Regression": LogisticRegression(max_iter=1000),
            "Random Forest Classifier": RandomForestClassifier(n_estimators=100, random_state=42),
            "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
        }

    # Train and evaluate each model
    for model_name, model in models.items():
        print(f"\n--- {model_name} ---")
        pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("classifier", model),
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # Evaluation based on regression or classification
        if is_regression:
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            print(f"Mean Squared Error: {mse:.4f}")
            print(f"R^2 Score: {r2:.4f}")
        else:
            print(classification_report(y_test, y_pred))
            print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Train for LoanApproved (classification)
train_model(target_column="LoanApproved", drop_column="RiskScore", is_regression=False)

# Train for RiskScore (regression)
train_model(target_column="RiskScore", drop_column="LoanApproved", is_regression=True)



=== Training for LoanApproved, dropping RiskScore ===

--- Logistic Regression ---
              precision    recall  f1-score   support

           0       0.88      0.99      0.94      5014
           1       0.99      0.78      0.87      2986

    accuracy                           0.91      8000
   macro avg       0.94      0.89      0.90      8000
weighted avg       0.92      0.91      0.91      8000

Accuracy: 0.9145

--- Random Forest Classifier ---
              precision    recall  f1-score   support

           0       0.88      0.99      0.94      5014
           1       0.98      0.78      0.87      2986

    accuracy                           0.91      8000
   macro avg       0.93      0.89      0.90      8000
weighted avg       0.92      0.91      0.91      8000

Accuracy: 0.9145

--- Gradient Boosting Classifier ---
              precision    recall  f1-score   support

           0       0.88      0.99      0.94      5014
           1       0.99      0.78      0.87    