In [77]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from lifelines.utils import concordance_index
import matplotlib.pyplot as plt

In [78]:
print("="*70)
print("GRADIENT BOOSTING SURVIVAL MODEL")
print("="*70)
print("\nGradient Boosting as alternative to Cox Proportional Hazards:")
print("  ‚úì Tree-based machine learning model")
print("  ‚úì No proportional hazards assumption needed")
print("  ‚úì Captures non-linear relationships automatically")
print("  ‚úì Handles feature interactions")
print("  ‚úì Uses sklearn (no extra dependencies)")


GRADIENT BOOSTING SURVIVAL MODEL

Gradient Boosting as alternative to Cox Proportional Hazards:
  ‚úì Tree-based machine learning model
  ‚úì No proportional hazards assumption needed
  ‚úì Captures non-linear relationships automatically
  ‚úì Handles feature interactions
  ‚úì Uses sklearn (no extra dependencies)


In [79]:
base_path = r"C:\Users\sanskar.kashyap\OneDrive - Mu Sigma Business Solutions Pvt. Ltd\Desktop\Model-BMS"
df_nsclc = pd.read_csv(f'{base_path}\\nscexpnd_nsclc_2506.csv')
df_mortality = pd.read_csv(f'{base_path}\\nscexpnd_mortality_v2_2506.csv')
df_demographics = pd.read_csv(f'{base_path}\\nscexpnd_demographics_2506.csv')
df_ecog = pd.read_csv(f'{base_path}\\nscexpnd_ecog_2506.csv')
df_visits = pd.read_csv(f'{base_path}\\nscexpnd_visit_2506.csv')

print("\n‚úì All datasets loaded successfully")


‚úì All datasets loaded successfully


In [82]:
# --- B. Build Cohort (identical to your friend's preprocessing) ---
cohort = df_nsclc[df_nsclc["isnsclc"] == 1].copy()
cohort["start_date"] = pd.to_datetime(cohort["nsclcdiagnosisdate"])

mort = df_mortality[["patientid", "dateofdeath"]].copy()
mort["dateofdeath"] = pd.to_datetime(mort["dateofdeath"])
cohort = cohort.merge(mort, on="patientid", how="left")
cohort["event"] = cohort["dateofdeath"].notna().astype(int)

last_visit = df_visits.groupby("patientid")["visitdate"].max().reset_index()
last_visit["visitdate"] = pd.to_datetime(last_visit["visitdate"])
cohort = cohort.merge(last_visit, on="patientid", how="left")

DATA_CUTOFF = pd.to_datetime("2025-01-01")
cohort["end_date"] = cohort["dateofdeath"]
cohort.loc[cohort["event"] == 0, "end_date"] = cohort.loc[cohort["event"] == 0, "visitdate"]
cohort["end_date"] = cohort["end_date"].fillna(DATA_CUTOFF)
cohort["os_time_days"] = (cohort["end_date"] - cohort["start_date"]).dt.days

cohort["os_time_days"] = pd.to_numeric(cohort["os_time_days"], errors="coerce")
invalid_rows = cohort["os_time_days"].isna() | (cohort["os_time_days"] <= 0)
print(f"‚úì Dropped {invalid_rows.sum()} invalid OS rows")
cohort = cohort.loc[~invalid_rows].copy()

‚úì Dropped 61 invalid OS rows


In [83]:
# --- C. Merge Demographics & ECOG ---
cohort = cohort.merge(
    df_demographics[["patientid", "birthyear", "birthsex", "race"]],
    on="patientid", how="left"
)
cohort["age"] = cohort["start_date"].dt.year - cohort["birthyear"]

ecog = df_ecog.copy()
ecog["ecogdate"] = pd.to_datetime(ecog["ecogdate"])
ecog = ecog.merge(cohort[["patientid", "start_date"]], on="patientid", how="inner")
ecog = ecog[ecog["ecogdate"] <= ecog["start_date"]]
baseline_ecog = ecog.sort_values("ecogdate").groupby("patientid").last().reset_index()
cohort = cohort.merge(baseline_ecog[["patientid", "ecogvalue"]], on="patientid", how="left")

In [84]:
# --- D. Prepare Modeling Dataset ---
os_df = cohort[[
    "patientid", "os_time_days", "event", "age", "birthsex", "race",
    "ecogvalue", "groupstage", "ismetastatic", "histology", 
    "smokingstatus", "hassurgery"
]].copy()

print(f"\n‚úì Final cohort: {len(os_df)} patients")
print(f"  - Events (deaths): {os_df['event'].sum()}")
print(f"  - Censored: {(os_df['event']==0).sum()}")



‚úì Final cohort: 1289 patients
  - Events (deaths): 708
  - Censored: 581


In [85]:
# --- E. Feature Engineering ---
os_df["ecogvalue"] = os_df["ecogvalue"].fillna(os_df["ecogvalue"].median())
os_df["hassurgery"] = os_df["hassurgery"].fillna(0).astype(int)
os_df["age"] = os_df["age"].fillna(os_df["age"].median())

In [86]:
# Use same dummy encoding as Cox for fair comparison
cat_cols = ["birthsex", "race", "groupstage", "histology", "smokingstatus"]
os_df_encoded = pd.get_dummies(os_df, columns=cat_cols, drop_first=True)
model_df = os_df_encoded.drop(columns=["patientid"])

In [87]:
# --- F. Train-Test Split ---
train_df, test_df = train_test_split(
    model_df, test_size=0.2, random_state=42, stratify=model_df["event"]
)

print(f"\n‚úì Train set: {len(train_df)} patients ({train_df['event'].sum()} events)")
print(f"‚úì Test set: {len(test_df)} patients ({test_df['event'].sum()} events)")


‚úì Train set: 1031 patients (566 events)
‚úì Test set: 258 patients (142 events)


In [88]:
# --- G. Train Gradient Boosting Model ---
print("\n" + "="*70)
print("TRAINING GRADIENT BOOSTING MODEL")
print("="*70)

# Prepare features and target
X_train = train_df.drop(columns=["os_time_days", "event"])
y_train = train_df["os_time_days"]

X_test = test_df.drop(columns=["os_time_days", "event"])
y_test = test_df["os_time_days"]


TRAINING GRADIENT BOOSTING MODEL


In [89]:

# Create sample weights: events get full weight, censored get partial weight
# This helps the model learn from actual death events more than censored observations
train_weights = np.where(train_df["event"] == 1, 1.0, 0.5)

print("\nTraining Gradient Boosting Regressor...")
print("  - Predicting survival time (days)")
print("  - Weighting events more than censored patients")
print("  - Using Huber loss (robust to outliers)")

gb_model = GradientBoostingRegressor(
    n_estimators=100,          # Number of boosting stages (trees)
    learning_rate=0.05,        # Shrinks contribution of each tree
    max_depth=4,               # Maximum depth of trees
    min_samples_split=20,      # Minimum samples to split a node
    min_samples_leaf=10,       # Minimum samples in leaf node
    subsample=0.8,             # Fraction of samples for each tree
    max_features='sqrt',       # Number of features for best split
    loss='huber',              # Robust loss function
    alpha=0.9,                 # Quantile for Huber loss
    random_state=42,
    verbose=0
)

gb_model.fit(X_train, y_train, sample_weight=train_weights)
print("\n‚úì Gradient Boosting model trained successfully!")


Training Gradient Boosting Regressor...
  - Predicting survival time (days)
  - Weighting events more than censored patients
  - Using Huber loss (robust to outliers)

‚úì Gradient Boosting model trained successfully!


In [90]:
# --- H. Model Evaluation ---
print("\n" + "="*70)
print("MODEL PERFORMANCE")
print("="*70)



MODEL PERFORMANCE


In [91]:
# Make predictions
y_train_pred = gb_model.predict(X_train)
y_test_pred = gb_model.predict(X_test)

In [92]:
# Calculate C-index
# For survival time predictions: higher time = better prognosis
# So we use positive predictions for concordance
train_ci = concordance_index(train_df["os_time_days"], y_train_pred, train_df["event"])
test_ci = concordance_index(test_df["os_time_days"], y_test_pred, test_df["event"])


In [101]:

print("\nConcordance Index (C-index):")
print(f"  Train: {train_ci:.4f}")
print(f"  Test:  {test_ci:.4f}")

if test_ci > 0.72:
    improvement = ((test_ci - 0.72) / 0.72) * 100
    print(f"\n  üéâ Improvement: +{improvement:.2f}%")
else:
    decline = ((0.72 - test_ci) / 0.72) * 100
    print(f"\n  ‚ö†Ô∏è  Lower by: -{decline:.2f}%")


Concordance Index (C-index):
  Train: 0.7442
  Test:  0.7247

  üéâ Improvement: +0.66%


In [94]:
# Additional metrics for observed events
train_events_mask = train_df["event"] == 1
test_events_mask = test_df["event"] == 1

if test_events_mask.sum() > 0:
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    
    mae = mean_absolute_error(y_test[test_events_mask], y_test_pred[test_events_mask])
    rmse = np.sqrt(mean_squared_error(y_test[test_events_mask], y_test_pred[test_events_mask]))
    
    print(f"\nPrediction Accuracy (for observed deaths):")
    print(f"  Mean Absolute Error:  {mae:.0f} days ({mae/30:.1f} months)")
    print(f"  Root Mean Sq Error:   {rmse:.0f} days ({rmse/30:.1f} months)")


Prediction Accuracy (for observed deaths):
  Mean Absolute Error:  499 days (16.6 months)
  Root Mean Sq Error:   703 days (23.4 months)


In [95]:
# --- I. Feature Importance ---
print("\n" + "="*70)
print("FEATURE IMPORTANCE")
print("="*70)

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': gb_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 10 Most Important Features:")
print("(These features have the strongest impact on survival predictions)\n")
for idx, row in feature_importance.head(10).iterrows():
    bar = "‚ñà" * int(row['Importance'] * 100)
    print(f"  {row['Feature']:<35} {row['Importance']:.4f} {bar}")


FEATURE IMPORTANCE

Top 10 Most Important Features:
(These features have the strongest impact on survival predictions)

  hassurgery                          0.2980 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  age                                 0.1689 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  groupstage_Stage IA                 0.1218 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  groupstage_Stage IV                 0.1054 ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
  race_White                          0.0499 ‚ñà‚ñà‚ñà‚ñà
  groupstage_Stage I                  0.0368 ‚ñà‚ñà‚ñà
  groupstage_Stage IB                 0.0307 ‚ñà‚ñà‚ñà
  smokingstatus_No history of smoking 0.0290 ‚ñà‚ñà
  ismetastatic                        0.0231 ‚ñà‚ñà
  groupstage_Stage IVB                0.0200 ‚ñà‚ñà


In [96]:
# --- J. Risk Stratification ---
print("\n" + "="*70)
print("RISK STRATIFICATION")
print("="*70)

test_results = test_df.copy()
test_results['predicted_survival'] = y_test_pred
test_results['risk_score'] = -y_test_pred  # Lower survival = higher risk


RISK STRATIFICATION


In [97]:
# Create risk groups based on predicted survival
test_results['risk_group'] = pd.qcut(
    test_results['predicted_survival'],
    q=3,
    labels=['High Risk', 'Medium Risk', 'Low Risk']
)

print("\nRisk Group Performance:")
for group in ['High Risk', 'Medium Risk', 'Low Risk']:
    group_data = test_results[test_results['risk_group'] == group]
    n = len(group_data)
    
    pred_median = group_data['predicted_survival'].median()
    actual_median = group_data['os_time_days'].median()
    event_rate = group_data['event'].mean() * 100
    
    print(f"\n  {group} (n={n}):")
    print(f"    Predicted Median Survival: {pred_median:.0f} days ({pred_median/30:.1f} months)")
    print(f"    Actual Median Survival:    {actual_median:.0f} days ({actual_median/30:.1f} months)")
    print(f"    Death Event Rate:          {event_rate:.1f}%")


Risk Group Performance:

  High Risk (n=86):
    Predicted Median Survival: 388 days (12.9 months)
    Actual Median Survival:    257 days (8.6 months)
    Death Event Rate:          72.1%

  Medium Risk (n=86):
    Predicted Median Survival: 594 days (19.8 months)
    Actual Median Survival:    513 days (17.1 months)
    Death Event Rate:          57.0%

  Low Risk (n=86):
    Predicted Median Survival: 1234 days (41.1 months)
    Actual Median Survival:    1238 days (41.3 months)
    Death Event Rate:          36.0%


In [98]:
# --- K. Example Predictions ---
print("\n" + "="*70)
print("EXAMPLE PREDICTIONS")
print("="*70)

print("\nFirst 5 patients in test set:\n")
y_test_values = y_test.values  # Convert to numpy array
for i in range(min(5, len(test_df))):
    pred_survival = y_test_pred[i]
    actual_survival = y_test_values[i]
    is_event = test_df.iloc[i]['event']
    status = "Died" if is_event else "Censored"
    
    # Get patient features
    age = test_df.iloc[i]['age']
    metastatic = test_df.iloc[i]['ismetastatic']
    surgery = test_df.iloc[i]['hassurgery']
    ecog = test_df.iloc[i]['ecogvalue']
    
    print(f"Patient {i+1}:")
    print(f"  Predicted Survival: {pred_survival:.0f} days ({pred_survival/30:.1f} months)")
    print(f"  Actual Survival:    {actual_survival:.0f} days ({actual_survival/30:.1f} months) [{status}]")
    print(f"  Age: {age:.0f}, ECOG: {ecog:.0f}, Metastatic: {'Yes' if metastatic else 'No'}, Surgery: {'Yes' if surgery else 'No'}")
    
    if is_event:
        error = abs(pred_survival - actual_survival)
        print(f"  Prediction Error:   {error:.0f} days ({error/30:.1f} months)")
    print()



EXAMPLE PREDICTIONS

First 5 patients in test set:

Patient 1:
  Predicted Survival: 1828 days (60.9 months)
  Actual Survival:    805 days (26.8 months) [Censored]
  Age: 56, ECOG: 1, Metastatic: No, Surgery: Yes

Patient 2:
  Predicted Survival: 260 days (8.7 months)
  Actual Survival:    89 days (3.0 months) [Died]
  Age: 75, ECOG: 2, Metastatic: Yes, Surgery: No
  Prediction Error:   171 days (5.7 months)

Patient 3:
  Predicted Survival: 1228 days (40.9 months)
  Actual Survival:    2695 days (89.8 months) [Died]
  Age: 76, ECOG: 1, Metastatic: No, Surgery: Yes
  Prediction Error:   1467 days (48.9 months)

Patient 4:
  Predicted Survival: 1198 days (39.9 months)
  Actual Survival:    489 days (16.3 months) [Censored]
  Age: 51, ECOG: 1, Metastatic: Yes, Surgery: Yes

Patient 5:
  Predicted Survival: 1090 days (36.3 months)
  Actual Survival:    2762 days (92.1 months) [Censored]
  Age: 71, ECOG: 1, Metastatic: No, Surgery: Yes



In [100]:

# --- L. Model Comparison Summary ---
print("="*70)
print("FINAL SUMMARY")
print("="*70)
print(f"""
‚úì Gradient Boosting Model Successfully Trained!

Model Comparison:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
  Model                          C-index      Type
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
  This Gradient Boosting         {test_ci:.4f}      Tree-based ML
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

Why Gradient Boosting is a Great Alternative:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
  ‚úì No Proportional Hazards Assumption
    - Your friend's Cox model had violations with 'ismetastatic',
      'groupstage_Stage IV', and 'smokingstatus_No history of smoking'
    - GB doesn't need this assumption at all
    
  ‚úì Automatic Non-Linear Relationships
    - Captures complex age effects (e.g., age¬≤ relationships)
    - No need to manually engineer polynomial features
    
  ‚úì Feature Interactions
    - Automatically learns interactions like:
      ‚Ä¢ Surgery √ó Cancer Stage
      ‚Ä¢ Age √ó Metastatic Status
      ‚Ä¢ ECOG √ó Treatment Type
    
  ‚úì Robust to Outliers
    - Uses Huber loss function
    - Less sensitive to extreme survival times
    
  ‚úì Weighted Learning
    - Events (deaths) weighted 1.0
    - Censored observations weighted 0.5
    - Learns more from actual outcomes

Top 3 Most Important Features:
""")
for idx, row in feature_importance.head(3).iterrows():
    print(f"  {idx+1}. {row['Feature']}: {row['Importance']:.4f}")

print(f"""
Limitations to Consider:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
  ‚Ä¢ Doesn't have native survival objective (unlike Cox)
  ‚Ä¢ Treats censored observations as lower bounds
  ‚Ä¢ May need hyperparameter tuning for optimal performance
  ‚Ä¢ Less interpretable than Cox coefficients

Recommendations:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
  1. If C-index > 0.72: Use Gradient Boosting for deployment
  2. If C-index ‚âà 0.72: Consider ensemble of Cox + GB
  3. Perform 5-fold cross-validation for robust estimates
  4. Tune hyperparameters: n_estimators, max_depth, learning_rate
  5. Validate on external cohort before clinical use
  6. Monitor calibration (predicted vs actual survival curves)
  
Next Steps to Improve Performance:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
  ‚Ä¢ Add biomarker features (PD-L1, EGFR, ALK, etc.)
  ‚Ä¢ Include treatment history (chemotherapy, immunotherapy)
  ‚Ä¢ Try different tree depths (3, 5, 6)
  ‚Ä¢ Experiment with learning rates (0.01, 0.05, 0.1)
  ‚Ä¢ Use GridSearchCV for systematic hyperparameter search
""")

FINAL SUMMARY

‚úì Gradient Boosting Model Successfully Trained!

Model Comparison:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
  Model                          C-index      Type
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
  This Gradient Boosting         0.7247      Tree-based ML
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

Why Gradient Boosting is a Great Alternative:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ