In [40]:
# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning & Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# For advanced model (optional)
import xgboost as xgb

# Display settings
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid")

In [41]:
# Load dataset
df = pd.read_csv('dataset_med.csv')

# Show first few rows
df.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Male,Sweden,2016-04-05,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,2,50.0,Female,Netherlands,2023-04-20,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,3,65.0,Female,Hungary,2023-04-05,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,4,51.0,Female,Belgium,2016-02-05,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,5,37.0,Male,Luxembourg,2023-11-29,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


In [42]:
# Check shape and basic info
print("Shape of the dataset:", df.shape)
print("\nColumn info:")
df.info()

# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

Shape of the dataset: (890000, 17)

Column info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890000 entries, 0 to 889999
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  890000 non-null  int64  
 1   age                 890000 non-null  float64
 2   gender              890000 non-null  object 
 3   country             890000 non-null  object 
 4   diagnosis_date      890000 non-null  object 
 5   cancer_stage        890000 non-null  object 
 6   family_history      890000 non-null  object 
 7   smoking_status      890000 non-null  object 
 8   bmi                 890000 non-null  float64
 9   cholesterol_level   890000 non-null  int64  
 10  hypertension        890000 non-null  int64  
 11  asthma              890000 non-null  int64  
 12  cirrhosis           890000 non-null  int64  
 13  other_cancer        890000 non-null  int64  
 14  treatment_type      890000 non-null

#### Handling missing data in "survival" column

In [43]:
df['survived'].isna().sum()

np.int64(0)

In [44]:
# Drop rows where the target value is NaN
df = df.dropna(subset=['survived'])

In [45]:
df['survived'].isna().sum()

np.int64(0)

### Data Cleaning & Preprocessing

In [46]:
# Reload the original dataset
df = pd.read_csv('dataset_med.csv')

# Confirm again that survived is numeric
print(df['survived'].unique())  # Should be [0 1]


[0 1]


In [47]:
df = pd.read_csv("dataset_med.csv")

# Drop rows with missing target
df = df.dropna(subset=['survived'])

# Print column types
print(df.dtypes)


id                      int64
age                   float64
gender                 object
country                object
diagnosis_date         object
cancer_stage           object
family_history         object
smoking_status         object
bmi                   float64
cholesterol_level       int64
hypertension            int64
asthma                  int64
cirrhosis               int64
other_cancer            int64
treatment_type         object
end_treatment_date     object
survived                int64
dtype: object


In [48]:
# Reload dataset 
df = pd.read_csv("dataset_med.csv")
df = df.dropna(subset=['survived'])  # just in case

# Clean binary categorical columns
binary_cols = ['family_history', 'hypertension', 'asthma', 'cirrhosis', 'other_cancer']
for col in binary_cols:
    df[col] = df[col].map({'yes': 1, 'no': 0})

# Encode gender
df['gender'] = df['gender'].map({'male': 0, 'female': 1})

# Convert dates to datetime
df['diagnosis_date'] = pd.to_datetime(df['diagnosis_date'], errors='coerce')
df['end_treatment_date'] = pd.to_datetime(df['end_treatment_date'], errors='coerce')

# Create treatment_duration and drop date columns
df['treatment_duration'] = (df['end_treatment_date'] - df['diagnosis_date']).dt.days
df = df.drop(['diagnosis_date', 'end_treatment_date'], axis=1)

# One-hot encode multi-category columns
df = pd.get_dummies(df, columns=['country', 'smoking_status', 'treatment_type', 'cancer_stage'], drop_first=True)

# Fill numeric missing values
df.fillna(df.median(numeric_only=True), inplace=True)

# Final check
print("✅ Data types after preprocessing:\n", df.dtypes)


✅ Data types after preprocessing:
 id                                 int64
age                              float64
gender                           float64
family_history                   float64
bmi                              float64
cholesterol_level                  int64
hypertension                     float64
asthma                           float64
cirrhosis                        float64
other_cancer                     float64
survived                           int64
treatment_duration                 int64
country_Belgium                     bool
country_Bulgaria                    bool
country_Croatia                     bool
country_Cyprus                      bool
country_Czech Republic              bool
country_Denmark                     bool
country_Estonia                     bool
country_Finland                     bool
country_France                      bool
country_Germany                     bool
country_Greece                      bool
country_Hungary       

In [49]:
X = df.drop(['id', 'survived'], axis=1)
y = df['survived']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


### RandomForestClassifier Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [None]:
# Predict on test data
y_pred = model.predict(X_test)

# Evaluate
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


## XG Boost Model:

In [52]:
import xgboost as xgb
from xgboost import XGBClassifier

# Create the model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [53]:
# Predict
y_pred = xgb_model.predict(X_test)

# Evaluate
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.7797415730337078

Classification Report:
               precision    recall  f1-score   support

           0       0.78      1.00      0.88    138799
           1       0.38      0.00      0.00     39201

    accuracy                           0.78    178000
   macro avg       0.58      0.50      0.44    178000
weighted avg       0.69      0.78      0.68    178000


Confusion Matrix:
 [[138786     13]
 [ 39193      8]]


### Save the Model

In [54]:
# Save feature names used in model training
import joblib
joblib.dump(X_train.columns.tolist(), "xgb_feature_columns.pkl")


['xgb_feature_columns.pkl']

In [55]:
import joblib

# Save to file
joblib.dump(xgb_model, "xgboost_lung_survival_model.pkl")

['xgboost_lung_survival_model.pkl']