In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

import joblib


In [20]:
# Load Data
data = pd.read_csv('dataset/dataset.csv')
data.fillna(method='ffill', inplace=True)

# Encode categorical features
label_encoders = {}
categorical_cols = ['gender', 'country', 'cancer_stage', 'family_history', 'smoking_status',
                    'hypertension', 'asthma', 'cirrhosis', 'other_cancer', 'treatment_type', 'survived']

for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Convert dates to numerical timestamps
data['diagnosis_date'] = pd.to_datetime(data['diagnosis_date']).astype(int) // 10**9
data['end_treatment_date'] = pd.to_datetime(data['end_treatment_date']).astype(int) // 10**9


  data.fillna(method='ffill', inplace=True)


In [21]:
# Scale numerical features
scaler = StandardScaler()
numerical_cols = ['age', 'bmi', 'cholesterol_level', 'diagnosis_date', 'end_treatment_date']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])


In [22]:
X = data.drop(['id', 'survived'], axis=1)
y = data['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
from imblearn.combine import SMOTEENN

# First use SMOTE + undersampling to balance data
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

print(f"Before SMOTE + Undersampling: {y_train.value_counts()}")
print(f"After SMOTE + Undersampling: {pd.Series(y_resampled).value_counts()}")


Before SMOTE + Undersampling: survived
0    555357
1    156643
Name: count, dtype: int64
After SMOTE + Undersampling: survived
1    468500
0    209061
Name: count, dtype: int64


In [23]:
# Train XGBoost with imbalance handling

xgb = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]) * 0.3,  # Reduced weight
    random_state=42
)



xgb.fit(X_resampled, y_resampled)
xgb_pred = xgb.predict(X_test)

print("\n🏆 XGBoost Results:")
print(f"Accuracy: {accuracy_score(y_test, xgb_pred)}")
print(classification_report(y_test, xgb_pred))


🏆 XGBoost Results:
Accuracy: 0.4474887640449438
              precision    recall  f1-score   support

           0       0.78      0.40      0.53    138639
           1       0.22      0.60      0.32     39361

    accuracy                           0.45    178000
   macro avg       0.50      0.50      0.43    178000
weighted avg       0.66      0.45      0.49    178000



In [24]:
joblib.dump(xgb, 'models/best_model.pkl')
joblib.dump(scaler, 'models/scaler.pkl')
joblib.dump(label_encoders, 'models/label_encoders.pkl')

['models/label_encoders.pkl']