In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

import joblib


In [2]:
# Load Data
data = pd.read_csv('dataset/dataset.csv')
data.fillna(method='ffill', inplace=True)

# Encode categorical features
label_encoders = {}
categorical_cols = ['gender', 'country', 'cancer_stage', 'family_history', 'smoking_status',
                    'hypertension', 'asthma', 'cirrhosis', 'other_cancer', 'treatment_type', 'survived']

for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Convert dates to numerical timestamps
data['diagnosis_date'] = pd.to_datetime(data['diagnosis_date']).astype(int) // 10**9
data['end_treatment_date'] = pd.to_datetime(data['end_treatment_date']).astype(int) // 10**9


  data.fillna(method='ffill', inplace=True)


In [3]:
# Scale numerical features
scaler = StandardScaler()
numerical_cols = ['age', 'bmi', 'cholesterol_level', 'diagnosis_date', 'end_treatment_date']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])


In [4]:
X = data.drop(['id', 'survived'], axis=1)
y = data['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
from imblearn.over_sampling import SMOTE

# ✅ Handle imbalance using SMOTE only (remove undersampling)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print(f"Before SMOTE: {y_train.value_counts()}")
print(f"After SMOTE: {pd.Series(y_resampled).value_counts()}")


Before SMOTE: survived
0    555357
1    156643
Name: count, dtype: int64
After SMOTE: survived
0    555357
1    555357
Name: count, dtype: int64


In [13]:
# Train XGBoost with imbalance handling

xgb = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,  # Reduced to 0.05
    scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]) * 0.1,
    random_state=42
)




xgb.fit(X_resampled, y_resampled)
xgb_pred = xgb.predict(X_test)

print("\n🏆 XGBoost Results:")
print(f"Accuracy: {accuracy_score(y_test, xgb_pred)}")
print(classification_report(y_test, xgb_pred))


🏆 XGBoost Results:
Accuracy: 0.7788707865168539
              precision    recall  f1-score   support

           0       0.78      1.00      0.88    138639
           1       0.00      0.00      0.00     39361

    accuracy                           0.78    178000
   macro avg       0.39      0.50      0.44    178000
weighted avg       0.61      0.78      0.68    178000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
joblib.dump(xgb, 'models/best_model.pkl')
joblib.dump(scaler, 'models/scaler.pkl')
joblib.dump(label_encoders, 'models/label_encoders.pkl')

['models/label_encoders.pkl']