# Survival Analysis: LendingClub Loan Default Prediction
## Time-to-Default Analysis

**Objective**: Predict time until loan default and identify risk factors for early default.

**Dataset**: LendingClub Loan Data
- Available on Kaggle: https://www.kaggle.com/datasets/wordsforthewise/lending-club

**Survival Analysis Setup**:
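- **Time variable**: Months from loan issuance (`issue_d`) to the last recorded payment (`last_pymnt_d`), or to the dataset end date (2018-12-31) when no payment date is recorded
- **Event**: Loan default (`loan_status` is 'Charged Off' or 'Default', including charged-off loans flagged as not meeting the credit policy)
- **Censoring**: Loans that are fully paid or still active (current, in grace period, or late)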

In [None]:
# Install required packages
!pip install lifelines pandas numpy matplotlib seaborn scikit-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter, WeibullAFTFitter
from lifelines.statistics import logrank_test, multivariate_logrank_test
from lifelines.utils import median_survival_times
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('Set2')
%matplotlib inline

## 1. Data Loading and Exploration

In [None]:
# Read the accepted-loans extract of the LendingClub Kaggle dataset
# (https://www.kaggle.com/datasets/wordsforthewise/lending-club).
# accepted_2007_to_2018Q4.csv is read from the current working directory.
df = pd.read_csv(
    'accepted_2007_to_2018Q4.csv',
    low_memory=False,
)

# Quick sanity check: table dimensions and number of loans per status
print(f"Dataset shape: {df.shape}")
print(f"\nLoan status distribution:")
status_counts = df['loan_status'].value_counts()
print(status_counts)

In [None]:
# Columns carried forward into the survival analysis, grouped by role
key_cols = [
    # loan terms
    'loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
    'grade', 'sub_grade',
    # borrower profile
    'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
    # origination date, outcome and purpose
    'issue_d', 'loan_status', 'purpose', 'dti',
    # credit history
    'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
    'revol_util', 'total_acc',
    # payment history and misc
    'last_pymnt_d', 'last_pymnt_amnt',
    'last_credit_pull_d', 'application_type', 'addr_state',
]

# Work on an independent copy so the raw frame stays untouched
df_clean = df[key_cols].copy()
print(f"\nSelected columns: {len(key_cols)}")
print(f"Missing value summary:")
# Show (up to) the first ten columns that contain at least one missing value
missing_counts = df_clean.isnull().sum()
print(missing_counts[missing_counts > 0].head(10))

## 2. Feature Engineering for Survival Analysis

In [None]:
# Parse dates
def parse_date(date_str):
    """Parse a LendingClub date string in 'Mon-YYYY' format.

    Args:
        date_str: Value to parse, e.g. 'Dec-2015'.

    Returns:
        The result of ``pd.to_datetime`` with format '%b-%Y' (a Timestamp
        for a well-formed string), or ``pd.NaT`` when the value cannot be
        parsed with that format.
    """
    try:
        return pd.to_datetime(date_str, format='%b-%Y')
    except (ValueError, TypeError):
        # Only parsing failures are mapped to NaT; anything else (e.g.
        # KeyboardInterrupt) is allowed to propagate instead of being hidden.
        return pd.NaT

# Parse the 'Mon-YYYY' strings for each column in one vectorised call instead
# of one Python-level call per row; values that do not match the format
# become NaT (errors='coerce').
df_clean['issue_date'] = pd.to_datetime(
    df_clean['issue_d'], format='%b-%Y', errors='coerce'
)
df_clean['last_payment_date'] = pd.to_datetime(
    df_clean['last_pymnt_d'], format='%b-%Y', errors='coerce'
)

# Calculate duration (months from issue to last payment or current date)
current_date = pd.Timestamp('2018-12-31')  # Dataset end date

def calculate_duration(row):
    """Return the observed loan duration in whole calendar months.

    Args:
        row: Mapping/Series with 'issue_date' and 'last_payment_date'.

    Returns:
        NaN when the issue date is missing; otherwise the month difference
        between the issue date and the end date, floored at 0.5.
    """
    issued = row['issue_date']
    if pd.isna(issued):
        return np.nan

    # Without a recorded payment the module-level current_date is used.
    # NOTE(review): this applies to every loan status, including charged-off
    # loans with no payment on record - confirm that is intended.
    last_paid = row['last_payment_date']
    end_date = current_date if pd.isna(last_paid) else last_paid

    year_gap = end_date.year - issued.year
    month_gap = end_date.month - issued.month
    months_elapsed = year_gap * 12 + month_gap

    # Zero or negative differences are lifted to half a month
    return max(months_elapsed, 0.5)

# Row-wise duration in months for every loan
loan_durations = df_clean.apply(calculate_duration, axis=1)
df_clean['duration'] = loan_durations

print("Duration statistics (months):")
duration_stats = df_clean['duration'].describe()
print(duration_stats)

In [None]:
# Define event (1 = default, 0 = censored)
default_statuses = ['Charged Off', 'Default', 'Does not meet the credit policy. Status:Charged Off']
# The policy-exception "Fully Paid" status is listed as censored so those
# loans are treated symmetrically with their charged-off counterparts above;
# leaving it out keeps only the defaults of that sub-population.
censored_statuses = ['Fully Paid', 'Current', 'In Grace Period', 'Late (16-30 days)', 'Late (31-120 days)',
                     'Does not meet the credit policy. Status:Fully Paid']

# Status -> event lookup; statuses absent from both lists (and missing
# statuses) map to NaN.
status_to_event = {status: 0 for status in censored_statuses}
status_to_event.update({status: 1 for status in default_statuses})
df_clean['event'] = df_clean['loan_status'].map(status_to_event)

print("Event distribution:")
print(f"Defaults (event=1): {df_clean['event'].sum():,.0f} ({df_clean['event'].mean():.1%})")
print(f"Censored (event=0): {(df_clean['event']==0).sum():,.0f}")
print(f"Unknown/excluded: {df_clean['event'].isna().sum():,.0f}")

In [None]:
# Clean numeric features
def _percent_to_float(col):
    """Return *col* as floats, stripping a trailing '%' from string values.

    Columns that are already numeric are only cast to float; calling the
    ``.str`` accessor on them would raise AttributeError.
    """
    if pd.api.types.is_numeric_dtype(col):
        return col.astype('float')
    return col.str.rstrip('%').astype('float')


df_clean['int_rate'] = _percent_to_float(df_clean['int_rate'])
df_clean['revol_util'] = _percent_to_float(df_clean['revol_util'])
# Raw-string pattern avoids the invalid-escape warning for "\d";
# expand=False makes str.extract return a Series rather than a DataFrame.
df_clean['term_months'] = df_clean['term'].str.extract(r'(\d+)', expand=False).astype('float')

# Clean employment length
def clean_emp_length(emp):
    """Convert an employment-length label into a number of years.

    Missing values and 'n/a' give 0, labels containing '< 1' give 0.5,
    labels containing '10+' give 10, and any other label yields its first
    whitespace-separated token converted to float.
    """
    if pd.isna(emp) or emp == 'n/a':
        return 0
    if '< 1' in emp:
        return 0.5
    if '10+' in emp:
        return 10

    leading_token = emp.split()[0]
    return float(leading_token)

# Numeric employment length derived from the raw text label
emp_years = df_clean['emp_length'].apply(clean_emp_length)
df_clean['emp_length_years'] = emp_years

# Keep only loans with both a duration and a known event flag
required_cols = ['duration', 'event']
survival_df = df_clean.dropna(subset=required_cols).copy()

n_events = survival_df['event'].sum()
n_censored = (survival_df['event'] == 0).sum()
print(f"\nSurvival dataset shape: {survival_df.shape}")
print(f"Events: {n_events:,.0f}")
print(f"Censored: {n_censored:,.0f}")

## 3. Kaplan-Meier Analysis: Overall Survival

In [None]:
# Kaplan-Meier estimate over every loan in the survival dataset
kmf = KaplanMeierFitter()
kmf.fit(
    survival_df['duration'],
    event_observed=survival_df['event'],
    label='All Loans',
)

fig, ax = plt.subplots(figsize=(12, 6))
kmf.plot_survival_function(ax=ax, ci_show=True)
ax.set_title('Kaplan-Meier Survival Curve: Time to Loan Default', fontsize=14, fontweight='bold')
ax.set_xlabel('Months Since Loan Issue', fontsize=12)
ax.set_ylabel('Probability of Non-Default', fontsize=12)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"Median survival time: {kmf.median_survival_time_:.1f} months")
print(f"\nSurvival probabilities:")
# Survival probability (and its complement) at yearly horizons
for months in (12, 24, 36, 48, 60):
    prob = kmf.predict(months)
    default_share = 1 - prob
    print(f"  {months} months: {prob:.1%} (default rate: {default_share:.1%})")

## 4. Stratified Analysis: By Loan Grade

In [None]:
# One Kaplan-Meier curve per loan grade, drawn on a shared axis
fig, ax = plt.subplots(figsize=(14, 7))

grades = sorted(survival_df['grade'].dropna().unique())
for grade in grades:
    grade_rows = survival_df[survival_df['grade'] == grade]
    kmf_grade = KaplanMeierFitter()
    kmf_grade.fit(
        grade_rows['duration'],
        grade_rows['event'],
        label=f'Grade {grade}',
    )
    kmf_grade.plot_survival_function(ax=ax, ci_show=False)

ax.set_title('Survival Curves by Loan Grade', fontsize=14, fontweight='bold')
ax.set_xlabel('Months Since Loan Issue', fontsize=12)
ax.set_ylabel('Probability of Non-Default', fontsize=12)
# Legend is anchored outside the plotting area, to the right
ax.legend(title='Loan Grade', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Multivariate log-rank test across all grade groups
result = multivariate_logrank_test(
    survival_df['duration'],
    survival_df['grade'],
    survival_df['event'],
)
is_significant = 'Yes' if result.p_value < 0.05 else 'No'
print(f"\nLog-rank test for Loan Grade:")
print(f"Test statistic: {result.test_statistic:.4f}")
print(f"p-value: {result.p_value:.4e}")
print(f"Significant: {is_significant}")

## 5. Stratified Analysis: By Term Length

In [None]:
# One Kaplan-Meier curve (with confidence band) per distinct loan term
fig, ax = plt.subplots(figsize=(12, 6))

term_values = survival_df['term'].dropna().unique()
for term in term_values:
    term_rows = survival_df[survival_df['term'] == term]
    kmf_term = KaplanMeierFitter()
    kmf_term.fit(
        term_rows['duration'],
        term_rows['event'],
        label=term,
    )
    kmf_term.plot_survival_function(ax=ax, ci_show=True)

ax.set_title('Survival Curves by Loan Term', fontsize=14, fontweight='bold')
ax.set_xlabel('Months Since Loan Issue', fontsize=12)
ax.set_ylabel('Probability of Non-Default', fontsize=12)
ax.legend(title='Term')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 6. Stratified Analysis: By Home Ownership

In [None]:
# Kaplan-Meier curves for the three home-ownership categories of interest
fig, ax = plt.subplots(figsize=(12, 6))

ownership_types = ['MORTGAGE', 'RENT', 'OWN']
for ownership in ownership_types:
    mask = survival_df['home_ownership'] == ownership
    # Skip categories with 100 loans or fewer
    if not mask.sum() > 100:
        continue
    owner_rows = survival_df[mask]
    kmf_own = KaplanMeierFitter()
    kmf_own.fit(
        owner_rows['duration'],
        owner_rows['event'],
        label=ownership,
    )
    kmf_own.plot_survival_function(ax=ax, ci_show=False)

ax.set_title('Survival Curves by Home Ownership', fontsize=14, fontweight='bold')
ax.set_xlabel('Months Since Loan Issue', fontsize=12)
ax.set_ylabel('Probability of Non-Default', fontsize=12)
ax.legend(title='Home Ownership')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Cox Proportional Hazards Model

In [None]:
# Covariates for the Cox model, split by how they are pre-processed
numeric_features = [
    'loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
    'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec',
    'revol_bal', 'revol_util', 'total_acc', 'emp_length_years',
    'term_months',
]

categorical_features = ['grade', 'home_ownership', 'verification_status', 'purpose']

# Modelling frame: covariates plus the survival outcome columns
model_columns = numeric_features + categorical_features + ['duration', 'event']
cox_df = survival_df[model_columns].copy()

# Median-impute missing numeric covariates
numeric_medians = cox_df[numeric_features].median()
cox_df[numeric_features] = cox_df[numeric_features].fillna(numeric_medians)

# One-hot encode categoricals (first level of each is dropped)
cox_df_encoded = pd.get_dummies(cox_df, columns=categorical_features, drop_first=True)

# Discard any row that still contains a missing value
cox_df_encoded = cox_df_encoded.dropna()

n_model_features = cox_df_encoded.shape[1] - 2  # exclude duration and event
print(f"Cox dataset shape: {cox_df_encoded.shape}")
print(f"Features: {n_model_features}")

In [None]:
# Fit on at most 100,000 randomly drawn rows to keep run time manageable;
# the fixed seed makes the draw reproducible.
available_rows = len(cox_df_encoded)
sample_size = 100000 if available_rows > 100000 else available_rows
cox_sample = cox_df_encoded.sample(n=sample_size, random_state=42)

print(f"Using sample size: {len(cox_sample):,} for Cox model")

In [None]:
# Fit Cox model
cph = CoxPHFitter(penalizer=0.1)
# The risk-stratification step later adds a categorical 'risk_group' column
# to cox_sample. Drop it here (if present) so that re-running this cell does
# not feed a model-derived, non-numeric column back into the fit.
cox_fit_df = cox_sample.drop(columns=['risk_group'], errors='ignore')
cph.fit(cox_fit_df, duration_col='duration', event_col='event')

print("Cox Proportional Hazards Model Summary:")
print(f"Concordance Index: {cph.concordance_index_:.4f}")
print(f"Log-likelihood: {cph.log_likelihood_:.4f}")
print(f"AIC: {cph.AIC_:.4f}")

In [None]:
# Coefficient table with an explicit hazard-ratio column, ordered by p-value
summary = cph.summary
summary['hazard_ratio'] = np.exp(summary['coef'])
summary_sorted = summary.sort_values(by='p')

print("\nTop 20 Most Significant Factors:")
# Keep covariates with p < 0.05, capped at the 20 smallest p-values
is_significant_row = summary_sorted['p'] < 0.05
significant = summary_sorted[is_significant_row].head(20)
report_columns = ['coef', 'hazard_ratio', 'p']
print(significant[report_columns].to_string())

In [None]:
# Horizontal bar chart of (hazard ratio - 1) for the 15 leading factors
fig, ax = plt.subplots(figsize=(10, 10))

top_factors = significant.head(15)
hazard_ratios = top_factors['hazard_ratio'].values
y_pos = np.arange(len(top_factors))
labels = [name[:40] for name in top_factors.index]  # cap label length at 40 chars

# Red bars for ratios above 1, green otherwise
colors = []
for hr in hazard_ratios:
    colors.append('red' if hr > 1 else 'green')

ax.barh(y_pos, hazard_ratios - 1, color=colors, alpha=0.6)
ax.axvline(0, color='black', linestyle='--', linewidth=1)
ax.set_yticks(y_pos)
ax.set_yticklabels(labels)
ax.set_xlabel('Hazard Ratio - 1', fontsize=12)
ax.set_title(
    'Top 15 Default Risk Factors\n(Red: Increases risk, Green: Decreases risk)',
    fontsize=14,
    fontweight='bold',
)
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

## 8. Weibull Accelerated Failure Time Model

In [None]:
# Weibull AFT model: a parametric alternative to the Cox model, fitted on a
# reduced covariate set plus the survival outcome columns.
aft_features = [
    'loan_amnt', 'int_rate', 'annual_inc', 'dti', 'revol_util',
    'delinq_2yrs', 'term_months', 'duration', 'event',
]

# Append the first five grade dummy columns, in column order
grade_cols = [name for name in cox_sample.columns if 'grade_' in name]
aft_features += grade_cols[:5]

aft_df = cox_sample[aft_features].copy()

# Penalised fit on the sampled data
aft = WeibullAFTFitter(penalizer=0.1)
aft.fit(aft_df, duration_col='duration', event_col='event')

print("Weibull AFT Model Summary:")
print(f"Concordance Index: {aft.concordance_index_:.4f}")
print(f"\nTop coefficients:")
# Ten rows with the smallest p-values
aft_top = aft.summary.sort_values('p').head(10)
print(aft_top[['coef', 'exp(coef)', 'p']])

## 9. Risk Stratification and Scoring

In [None]:
# Partial hazards from the fitted Cox model serve as risk scores
risk_scores = cph.predict_partial_hazard(cox_sample)

# Five risk bands split at the 20th/40th/60th/80th score percentiles
risk_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
risk_percentiles = np.percentile(risk_scores, [20, 40, 60, 80])
band_edges = [0, *risk_percentiles, np.inf]
risk_groups = pd.cut(risk_scores, bins=band_edges, labels=risk_labels)

cox_sample['risk_group'] = risk_groups

# Kaplan-Meier curve for each risk band
fig, ax = plt.subplots(figsize=(12, 6))

for group in risk_labels:
    band_rows = cox_sample[cox_sample['risk_group'] == group]
    kmf_risk = KaplanMeierFitter()
    kmf_risk.fit(
        band_rows['duration'],
        band_rows['event'],
        label=group,
    )
    kmf_risk.plot_survival_function(ax=ax, ci_show=False)

ax.set_title('Survival Curves by Risk Stratification', fontsize=14, fontweight='bold')
ax.set_xlabel('Months Since Loan Issue', fontsize=12)
ax.set_ylabel('Probability of Non-Default', fontsize=12)
ax.legend(title='Risk Group')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nRisk Group Distribution:")
band_sizes = cox_sample['risk_group'].value_counts().sort_index()
print(band_sizes)

print("\nDefault rates by risk group:")
# Mean of the event flag and number of loans within each band
default_by_risk = cox_sample.groupby('risk_group')['event'].agg(['mean', 'count'])
print(default_by_risk)

## 10. Expected Default Time Predictions

In [None]:
# Predict median survival time for sample loans
sample_loans = cox_sample.head(5)
horizons = [12, 24, 36]

print("Expected time to default for sample loans:")
print("="*70)

# Iterate over however many sample loans exist (may be fewer than five)
for idx in range(len(sample_loans)):
    loan = sample_loans.iloc[idx:idx+1]

    # Full predicted survival curve for this loan
    surv_func = cph.predict_survival_function(loan)

    # Median survival time = first time the curve drops to 0.5 or below.
    # (DataFrame.quantile would return the median of the probability values
    # instead of a time.) The result is inf if the curve never reaches 0.5.
    median_time = float(median_survival_times(surv_func))

    # Risk score and band computed on cox_sample, aligned by row position
    risk = risk_scores.iloc[idx]

    print(f"\nLoan {idx+1}:")
    print(f"  Risk Score: {risk:.2f}")
    print(f"  Risk Group: {risk_groups.iloc[idx]}")
    print(f"  Predicted median time to event: {median_time:.1f} months")
    print(f"  Probability of default at:")
    # Ask the model for the survival probability at the exact horizons rather
    # than looking them up in the curve's index, which only contains observed
    # durations and may not include 12, 24 or 36.
    horizon_survival = cph.predict_survival_function(loan, times=horizons).iloc[:, 0]
    for months, surv_prob in zip(horizons, horizon_survival):
        prob_default = 1 - surv_prob
        print(f"    {months} months: {prob_default:.1%}")

## 11. Model Validation

In [None]:
# Proportional-hazards diagnostics on a small, unpenalised Cox model
print("Checking Proportional Hazards Assumption...")
print("="*70)

# Restrict the check to four numeric covariates plus the outcome columns
test_features = ['int_rate', 'dti', 'loan_amnt', 'annual_inc']
ph_columns = test_features + ['duration', 'event']
cox_test = cox_sample[ph_columns].copy()

cph_test = CoxPHFitter()
cph_test.fit(cox_test, duration_col='duration', event_col='event')
cph_test.check_assumptions(
    cox_test,
    p_value_threshold=0.05,
    show_plots=True,
)

## 12. Key Insights and Recommendations

In [None]:
# Text report summarising the fitted models, followed by recommendations
banner = "=" * 80

print(banner)
print("KEY FINDINGS: LOAN DEFAULT SURVIVAL ANALYSIS")
print(banner)

print("\n1. Overall Default Statistics:")
print(f"   - Median time to default: {kmf.median_survival_time_:.1f} months")
# Kaplan-Meier survival probability at 12, 36 and 60 months
for years in (1, 3, 5):
    print(f"   - {years}-year non-default rate: {kmf.predict(12 * years):.1%}")

print("\n2. Top Risk Factors (Increase Default Hazard):")
# First five rows (by ascending p-value) with hazard ratio above 1
top_risk = summary_sorted[summary_sorted['hazard_ratio'] > 1].head(5)
for rank, (factor, row) in enumerate(top_risk.iterrows(), 1):
    print(f"   {rank}. {factor}: HR={row['hazard_ratio']:.3f}")

print("\n3. Top Protective Factors (Decrease Default Hazard):")
# First five rows (by ascending p-value) with hazard ratio below 1
top_protect = summary_sorted[summary_sorted['hazard_ratio'] < 1].head(5)
for rank, (factor, row) in enumerate(top_protect.iterrows(), 1):
    print(f"   {rank}. {factor}: HR={row['hazard_ratio']:.3f}")

print("\n4. Model Performance:")
print(f"   - Cox C-Index: {cph.concordance_index_:.4f}")
print(f"   - Weibull AFT C-Index: {aft.concordance_index_:.4f}")

print("\n5. Risk Stratification Effectiveness:")
for group in ['Very Low', 'Low', 'Medium', 'High', 'Very High']:
    group_events = cox_sample[cox_sample['risk_group'] == group]['event']
    default_rate = group_events.mean()
    print(f"   {group}: {default_rate:.1%} default rate")

print("\n" + banner)
print("RECOMMENDATIONS")
print(banner)
recommendations = (
    "1. Implement tiered interest rates based on predicted default time",
    "2. Require additional collateral for high-risk loans (HR > 1.5)",
    "3. Focus collection efforts on loans predicted to default within 12 months",
    "4. Offer refinancing options to medium-risk borrowers before critical periods",
    "5. Use survival probabilities for dynamic loan loss provisioning",
    "6. Monitor grade D-G loans more closely during first 24 months",
)
for recommendation in recommendations:
    print(recommendation)
print(banner)

## Next Steps

1. **Time-varying covariates**: Model payment history changes over time
2. **Competing risks**: Separate early payoff from default
3. **Machine learning**: Random Survival Forests, DeepSurv
4. **Economic factors**: Include macroeconomic indicators (unemployment, interest rates)
5. **Stress testing**: Simulate survival under different economic scenarios
6. **Dynamic pricing**: Use survival predictions for real-time rate adjustments