In [18]:
import pandas as pd
import numpy as np
from datetime import datetime

# Set random seed for reproducibility
np.random.seed(42)

# Generate date range with economic trends
years = np.random.choice(np.arange(2020, 2026), size=1000)

# Simulate dataset with temporal trends
data = {
    'year': years,
    'credit_score': np.clip(np.random.normal(650, 100, 1000) + (years-2020)*10, 300, 850).astype(int),
    'annual_income': np.random.randint(20000, 200000, 1000) * (1 + 0.03*(years-2020)),
    'total_debt': np.random.randint(5000, 150000, 1000) * (1 + 0.05*(years-2020)),
    'loan_to_value_ratio': np.random.uniform(0.1, 1.5, 1000) * (1 + 0.01*(years-2020)),
    'debt_to_income_ratio': np.random.uniform(0.1, 2.0, 1000),
    'interest_rate': np.random.uniform(3.0, 7.0, 1000) + 0.1*(years-2020),
    'unemployment_rate': np.random.uniform(3.5, 10.0, 1000) - 0.5*(years-2020),
}

# Create DataFrame first
historical_df = pd.DataFrame(data)

# Calculate default probabilities per year in a vectorized way
historical_df['default_prob'] = 0.15 + 0.01*(historical_df['year'] - 2020)
historical_df['default'] = [np.random.choice([0, 1], p=[1-p, p]) 
                          for p in historical_df['default_prob']]

# Clean up temporary column
historical_df.drop('default_prob', axis=1, inplace=True)

# Add dates
historical_df['date'] = historical_df['year'].apply(lambda y: datetime(y, np.random.randint(1,13), 1))

# Create 2026 prediction data (future scenario)
future_data = {
    'year': [2026]*100,
    'credit_score': np.clip(np.random.normal(700, 75, 100), 300, 850).astype(int),
    'annual_income': np.random.randint(25000, 220000, 100) * 1.03**6,
    'total_debt': np.random.randint(6000, 160000, 100) * 1.05**6,
    'loan_to_value_ratio': np.random.uniform(0.15, 1.6, 100),
    'debt_to_income_ratio': np.random.uniform(0.15, 2.2, 100),
    'interest_rate': np.random.uniform(5.0, 8.0, 100),
    'unemployment_rate': np.random.uniform(3.0, 8.0, 100),
}

future_df = pd.DataFrame(future_data)
future_df['date'] = datetime(2026, 1, 1)

# Save datasets
historical_df.to_csv('historical_credit_data_2020-2025.csv', index=False)
future_df.to_csv('prediction_data_2026.csv', index=False)

print("Datasets created successfully!")

Datasets created successfully!


In [19]:
# Modified code with error handling
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Load data
df = pd.read_csv('credit_data.csv')

# 1. Handle missing values and infinite values
df.replace([np.inf, -np.inf], np.nan, inplace=True)  # Convert inf to NaN
df.fillna(df.mean(), inplace=True)

In [20]:
# 2. Create new feature safely
df['debt_to_income'] = np.where(
    df['annual_income'] == 0,  # Prevent division by zero
    0,  # Default value when income is zero
    df['total_debt'] / df['annual_income']
)

# 3. Split data BEFORE scaling
X = df.drop('default', axis=1)
y = df['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# 4. Scale only numerical features after split
numerical_features = ['credit_score', 'annual_income', 'total_debt', 
                     'loan_to_value_ratio', 'debt_to_income_ratio', 'debt_to_income']
scaler = MinMaxScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# 5. Final NaN check
print("Missing values in training:", X_train.isna().sum().sum())
print("Missing values in testing:", X_test.isna().sum().sum())

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


Missing values in training: 0
Missing values in testing: 0


RandomForestClassifier(random_state=42)

In [22]:
# Evaluate
y_pred = model.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.845

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.99      0.92       170
           1       0.00      0.00      0.00        30

    accuracy                           0.84       200
   macro avg       0.42      0.50      0.46       200
weighted avg       0.72      0.84      0.78       200



In [23]:
# Predict future scenarios (example)
future_data = pd.DataFrame({
    'credit_score': [0.8], 
    'annual_income': [0.6],
    'total_debt': [0.7],
    'loan_to_value_ratio': [1.2],
    'debt_to_income_ratio': [0.5],
    'debt_to_income': [0.4]
})

In [24]:
future_risk = model.predict(future_data)
print("\nPredicted Risk for Future Scenario:", "High Risk" if future_risk[0] else "Low Risk")


Predicted Risk for Future Scenario: Low Risk
