In [18]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load
# +
# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Lung Cancer Detection**

In [19]:
import pandas as pd

## Loading the Dataset

In [20]:
df_train = pd.read_csv("/kaggle/input/idealize-2025-datathon-competition/train.csv")
df_test = pd.read_csv("/kaggle/input/idealize-2025-datathon-competition/test.csv")

## Data exploration

In [21]:
df_train.isna().sum()

record_id                     0
first_name                    0
last_name                     0
sex                           0
patient_age                   0
residence_state               0
height_cm                     0
weight_kg                     0
smoking_status                0
cigarettes_per_day       499285
cholesterol_mg_dl             0
family_cancer_history         0
has_other_cancer              0
asthma_diagnosis              0
liver_condition               0
blood_pressure_status         0
diagnosis_date                0
cancer_stage                  0
treatment_start_date          0
treatment_type                0
treatment_end_date            0
survival_status               0
dtype: int64

In [22]:
df_test.isna().sum()

record_id                     0
first_name                    0
last_name                     0
sex                           0
patient_age                   0
residence_state               0
height_cm                     0
weight_kg                     0
smoking_status                0
cigarettes_per_day       125330
cholesterol_mg_dl             0
family_cancer_history         0
has_other_cancer              0
asthma_diagnosis              0
liver_condition               0
blood_pressure_status         0
diagnosis_date                0
cancer_stage                  0
treatment_start_date          0
treatment_type                0
treatment_end_date            0
dtype: int64

In [23]:
df_train.describe()

Unnamed: 0,record_id,patient_age,height_cm,weight_kg,cigarettes_per_day,cholesterol_mg_dl,cancer_stage,survival_status
count,999999.0,999999.0,999999.0,999999.0,500714.0,999999.0,999999.0,999999.0
mean,500000.0,55.006108,168.486227,86.847382,10.000669,233.693956,2.498489,0.219604
std,288674.990255,10.000513,9.205421,25.791027,5.47909,43.394144,1.11859,0.413979
min,1.0,8.0,134.0,32.0,1.0,150.0,1.0,0.0
25%,250000.5,48.0,162.0,65.0,5.0,197.0,1.0,0.0
50%,500000.0,55.0,168.0,86.0,10.0,242.0,2.0,0.0
75%,749999.5,62.0,175.0,106.0,15.0,271.0,3.0,0.0
max,999999.0,101.0,206.0,180.0,19.0,300.0,4.0,1.0


In [24]:
df_test.describe()

Unnamed: 0,record_id,patient_age,height_cm,weight_kg,cigarettes_per_day,cholesterol_mg_dl,cancer_stage
count,250000.0,250000.0,250000.0,250000.0,124670.0,250000.0,250000.0
mean,1125000.0,54.968628,168.518,86.870576,10.000217,233.697692,2.497812
std,72168.93,10.025659,9.230815,25.841215,5.476021,43.392722,1.117544
min,1000000.0,8.0,136.0,32.0,1.0,150.0,1.0
25%,1062500.0,48.0,162.0,65.0,5.0,197.0,1.0
50%,1125000.0,55.0,168.0,86.0,10.0,242.0,2.0
75%,1187499.0,62.0,175.0,106.0,15.0,271.0,3.0
max,1249999.0,99.0,209.0,180.0,19.0,300.0,4.0


In [26]:
# Survived to Not Survived Ratio
survived = df_train['survival_status'].value_counts()[1]
not_survived = df_train['survival_status'].value_counts()[0]

survived_ratio = survived/df_train.shape[0]
not_survived_ratio =  not_survived/df_train.shape[0]

print(f"Survived ratio : {survived_ratio} \nNot survived ratio : {not_survived_ratio}")

Survived ratio : 0.2196042196042196 
Not survived ratio : 0.7803957803957804


In [40]:
df_train['survival_status'].value_counts()

survival_status
0    780395
1    219604
Name: count, dtype: int64

## Data Pre-processing
### Data preprocessing Steps
- Filling `cigarattes_per_day` 
- Columns to encode(Binary) : `family_cancer_history`, `has_other_cancer`, `asthma_diagnosis`, `liver_condition`, `blood_pressure_status`
- Columns to encode (OneHot) : `residence_state`, `smoking_status`, `treatment_type`

In [27]:
# Importing the necessory Libraries 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class ProcessSmoking(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None):
        return self
        # Current Smokers - no NaNs
        # Never Smoked - no NaNs
        # Former Smokers - no NaNs
        # Passive smokers - contain NaNs
    def transform(self,X):
        X.loc[X['smoking_status'].isin(['Never Smoked', 'Passive Smoker', 'Passive', 'Non Smoker']), 'cigarettes_per_day'] = 0
        return X
    

class ProcessCols(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None):
        return self

    def transform(self,X):
        # Encode 'family_cancer_history': Yes → 1, No → 0
        X['family_cancer_history'] = X['family_cancer_history'].replace({'Yes': 1, 'No': 0})
        
        # Encode 'has_other_cancer': Yes → 1, No → 0
        X['has_other_cancer'] = X['has_other_cancer'].replace({'Yes': 1, 'No': 0})
        
        # Encode 'asthma_diagnosis': Yes → 1, No → 0
        X['asthma_diagnosis'] = X['asthma_diagnosis'].replace({'Yes': 1, 'No': 0})
        
        # Clean and encode 'blood_pressure_status'
        X['blood_pressure_status'] = X['blood_pressure_status'].replace({
            'High Blood Pressure': 1,
            'Elevated': 1,
            'Normal': 0,
            'Normal BP': 0  # Treating 'Normal' and 'Normal BP' as same
        })
        
        # Clean and encode 'liver_condition'
        X['liver_condition'] = X['liver_condition'].replace({
            'Normal': 0,
            'Normal Liver': 0,
            'Liver OK': 0,
            'No Issue': 0,
            'Has Cirrhosis': 1,
            'Cirrhos': 1  # Assuming typo or shorthand for cirrhosis
        })
        X['sex'] = X['sex'].replace({'Male': 1, 'Female': 0})
        X['smoking_status'] = X['smoking_status'].replace({
            'Current Smoker': 'Current',
            'Former Smoker': 'Former',
            'Passive Smoker': 'Passive',
            'Never Smoked': 'Never',
            'Passive': 'Passive',
            'Non Smoker': 'Never',
            'Former Smk': 'Former'
        })
        X['treatment_type'] = X['treatment_type'].replace({
            'Chemo': 'Chemotherapy',
            'Surg': 'Surgery',
            'Combo': 'Combined'
        })
        X = X.drop('residence_state', axis=1)
        
        return X

class EncodeCatCols(BaseEstimator, TransformerMixin):
    def __init__(self, columns=['smoking_status', 'treatment_type']):
        self.columns = columns
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    def fit(self, X, y=None):
        self.encoder.fit(X[self.columns])
        return self

    def transform(self, X):
        encoded = self.encoder.transform(X[self.columns])
        
        # Get column names from OneHotEncoder
        column_names = self.encoder.get_feature_names_out(self.columns)
        
        # Create DataFrame from encoded matrix
        encoded_df = pd.DataFrame(encoded, columns=column_names, index=X.index)
        # Drop original categorical columns and join encoded ones
        X = X.drop(columns=self.columns)
        X = pd.concat([X, encoded_df], axis=1)
        
        return X


class FormatDates(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None):
        return self

    def transform(self,X):
        X['treatment_start_date'] = pd.to_datetime(X['treatment_start_date'], errors='coerce')
        X['treatment_end_date'] = pd.to_datetime(X['treatment_end_date'], errors='coerce')
        X['diagnosis_date'] = pd.to_datetime(X['diagnosis_date'], errors='coerce')
        X['treatment_duration'] = (X['treatment_end_date'] - X['treatment_start_date']).dt.days
        X['diagnosis_to_treatment_delay'] = (X['treatment_start_date'] - X['diagnosis_date']).dt.days
        
        X = X.drop(['treatment_end_date', 'treatment_start_date', 'diagnosis_date'], axis=1)
        return X

class DropUnwantedCols(BaseEstimator,TransformerMixin):
    def fit(self,X,y=None):
        return self

    def transform(self,X):
        X = X.drop(['first_name', 'last_name', 'record_id'], axis=1)
        return X




# Building Data Pre-Processing pipeline
data_pipeline = Pipeline([
    ('process_smoking', ProcessSmoking()),
    ('process_columns', ProcessCols()),
    ('format_dates', FormatDates()),
    ('encode_categoricals', EncodeCatCols(columns=['smoking_status', 'treatment_type'])),
    ('drop_unwanted', DropUnwantedCols())
])


## Separating Independent Variables (X) and Dependent Variable (y)


In [28]:
X = df_train.drop('survival_status', axis=1)
y = df_train['survival_status']

## Pre-processing data 

In [29]:
X_transformed = data_pipeline.fit_transform(X)

  X['family_cancer_history'] = X['family_cancer_history'].replace({'Yes': 1, 'No': 0})
  X['has_other_cancer'] = X['has_other_cancer'].replace({'Yes': 1, 'No': 0})
  X['asthma_diagnosis'] = X['asthma_diagnosis'].replace({'Yes': 1, 'No': 0})
  X['blood_pressure_status'] = X['blood_pressure_status'].replace({
  X['liver_condition'] = X['liver_condition'].replace({
  X['sex'] = X['sex'].replace({'Male': 1, 'Female': 0})


In [30]:
X_test_transformed = data_pipeline.transform(df_test)
X_test_transformed.describe()

  X['family_cancer_history'] = X['family_cancer_history'].replace({'Yes': 1, 'No': 0})
  X['has_other_cancer'] = X['has_other_cancer'].replace({'Yes': 1, 'No': 0})
  X['asthma_diagnosis'] = X['asthma_diagnosis'].replace({'Yes': 1, 'No': 0})
  X['blood_pressure_status'] = X['blood_pressure_status'].replace({
  X['liver_condition'] = X['liver_condition'].replace({
  X['sex'] = X['sex'].replace({'Male': 1, 'Female': 0})


Unnamed: 0,sex,patient_age,height_cm,weight_kg,cigarettes_per_day,cholesterol_mg_dl,family_cancer_history,has_other_cancer,asthma_diagnosis,liver_condition,...,treatment_duration,diagnosis_to_treatment_delay,smoking_status_Current,smoking_status_Former,smoking_status_Never,smoking_status_Passive,treatment_type_Chemotherapy,treatment_type_Combined,treatment_type_Radiation,treatment_type_Surgery
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,...,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,0.50068,54.968628,168.518,86.870576,4.986908,233.697692,0.499924,0.087368,0.468284,0.226144,...,448.10044,9.49746,0.248616,0.250064,0.249612,0.251708,0.249256,0.250276,0.249036,0.251432
std,0.500001,10.025659,9.230815,25.841215,6.320978,43.392722,0.500001,0.282374,0.498994,0.418334,...,138.534348,7.147601,0.432212,0.433051,0.432789,0.433995,0.432583,0.433173,0.432456,0.433837
min,0.0,8.0,136.0,32.0,0.0,150.0,0.0,0.0,0.0,0.0,...,153.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,48.0,162.0,65.0,0.0,197.0,0.0,0.0,0.0,0.0,...,356.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,55.0,168.0,86.0,0.0,242.0,0.0,0.0,0.0,0.0,...,448.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,62.0,175.0,106.0,10.0,271.0,1.0,0.0,1.0,0.0,...,541.0,14.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
max,1.0,99.0,209.0,180.0,19.0,300.0,1.0,1.0,1.0,1.0,...,729.0,30.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
X_transformed.describe()

Unnamed: 0,sex,patient_age,height_cm,weight_kg,cigarettes_per_day,cholesterol_mg_dl,family_cancer_history,has_other_cancer,asthma_diagnosis,liver_condition,...,treatment_duration,diagnosis_to_treatment_delay,smoking_status_Current,smoking_status_Former,smoking_status_Never,smoking_status_Passive,treatment_type_Chemotherapy,treatment_type_Combined,treatment_type_Radiation,treatment_type_Surgery
count,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,...,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0,999999.0
mean,0.49972,55.006108,168.486227,86.847382,5.00748,233.693956,0.500747,0.08785,0.468742,0.226607,...,448.652513,9.498979,0.250689,0.250025,0.249315,0.24997,0.250513,0.249337,0.249122,0.251027
std,0.5,10.000513,9.205421,25.791027,6.327319,43.394144,0.5,0.283077,0.499022,0.418637,...,138.257655,7.143923,0.43341,0.433027,0.432617,0.432996,0.433309,0.43263,0.432505,0.433604
min,0.0,8.0,134.0,32.0,0.0,150.0,0.0,0.0,0.0,0.0,...,153.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,48.0,162.0,65.0,0.0,197.0,0.0,0.0,0.0,0.0,...,357.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,55.0,168.0,86.0,1.0,242.0,1.0,0.0,0.0,0.0,...,449.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,62.0,175.0,106.0,10.0,271.0,1.0,0.0,1.0,0.0,...,541.0,14.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
max,1.0,101.0,206.0,180.0,19.0,300.0,1.0,1.0,1.0,1.0,...,729.0,30.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [46]:
df_test_transformed = data_pipeline.transform(df_test)
df_test_transformed.head()

Unnamed: 0,sex,patient_age,height_cm,weight_kg,cigarettes_per_day,cholesterol_mg_dl,family_cancer_history,has_other_cancer,asthma_diagnosis,liver_condition,...,treatment_duration,diagnosis_to_treatment_delay,smoking_status_Current,smoking_status_Former,smoking_status_Never,smoking_status_Passive,treatment_type_Chemotherapy,treatment_type_Combined,treatment_type_Radiation,treatment_type_Surgery
0,0,55.0,162.0,55.0,17.0,207,1,0,1,0,...,216,18,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,50.0,170.0,102.0,0.0,249,0,0,1,0,...,540,29,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0,56.0,151.0,96.0,0.0,278,0,1,1,0,...,705,11,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1,29.0,176.0,71.0,19.0,155,0,0,0,0,...,392,29,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,47.0,150.0,37.0,12.0,230,0,0,0,1,...,382,11,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


## Model Building

In [74]:
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils import class_weight
import numpy as np

# 1. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# 2. Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 3. Compute class weights
weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(weights))
print("Class Weights:", class_weights)

# 4. Define the neural network with dropout
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # Binary output
])

# 5. Compile with additional metrics
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[
                  'accuracy',
                  tf.keras.metrics.Precision(name='precision'),
                  tf.keras.metrics.Recall(name='recall'),
                  tf.keras.metrics.AUC(name='auc')
              ])

# 6. Early stopping
early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# 7. Train the model
history = model.fit(X_train, y_train,
                    validation_split=0.1,
                    epochs=3,
                    batch_size=64,
                    class_weight=class_weights,
                    callbacks=[early_stop],
                    verbose=1)

# 8. Evaluate
loss, acc, prec, rec, auc = model.evaluate(X_test, y_test)
print(f"\n✅ Test Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | AUC: {auc:.4f}")

# 9. Predictions
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# 10. Full classification report
print("\n📊 Classification Report:\n")
print(classification_report(y_test, y_pred, digits=4))


Class Weights: {0: 0.6406552309564995, 1: 2.2773956809136924}
Epoch 1/3
[1m11250/11250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 3ms/step - accuracy: 0.4888 - auc: 0.5007 - loss: 0.6951 - precision: 0.2204 - recall: 0.5211 - val_accuracy: 0.7732 - val_auc: 0.4987 - val_loss: 0.6912 - val_precision: 0.2182 - val_recall: 0.0130
Epoch 2/3
[1m11250/11250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - accuracy: 0.4832 - auc: 0.4997 - loss: 0.6937 - precision: 0.2201 - recall: 0.5304 - val_accuracy: 0.7780 - val_auc: 0.5002 - val_loss: 0.6923 - val_precision: 0.2286 - val_recall: 0.0050
Epoch 3/3
[1m11250/11250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - accuracy: 0.6098 - auc: 0.4991 - loss: 0.6927 - precision: 0.2193 - recall: 0.3041 - val_accuracy: 0.7806 - val_auc: 0.4998 - val_loss: 0.6926 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - accuracy: 

In [75]:
df_test_transformed_scaled = scaler.transform(df_test_transformed)

In [77]:
df_test_transformed_scaled


array([[-9.99723788e-01, -2.13812770e-05, -7.04415170e-01, ...,
        -5.76157557e-01, -5.76244160e-01,  1.72732799e+00],
       [ 1.00027629e+00, -5.00167586e-01,  1.64436324e-01, ...,
         1.73563635e+00, -5.76244160e-01, -5.78928846e-01],
       [-9.99723788e-01,  1.00007860e-01, -1.89908598e+00, ...,
        -5.76157557e-01, -5.76244160e-01,  1.72732799e+00],
       ...,
       [ 1.00027629e+00, -1.60048924e+00,  1.03328782e+00, ...,
        -5.76157557e-01, -5.76244160e-01, -5.78928846e-01],
       [-9.99723788e-01, -1.00031379e+00, -4.87202297e-01, ...,
        -5.76157557e-01, -5.76244160e-01,  1.72732799e+00],
       [ 1.00027629e+00,  1.10030027e+00,  9.24681382e-01, ...,
         1.73563635e+00, -5.76244160e-01, -5.78928846e-01]])

In [95]:
y_pred_prob = (model.predict(df_test_transformed_scaled)).flatten()


[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step


In [96]:
y_pred_prob

array([0.49850103, 0.49850103, 0.49877563, ..., 0.49845222, 0.4983624 ,
       0.49856925], dtype=float32)

In [132]:
y_pred = (y_pred_prob > 0.499).astype("int32") # Try 0.3 or even 0.2

import numpy as np
unique, counts = np.unique(y_pred, return_counts=True)
print(dict(zip(unique, counts)))

{0: 228851, 1: 21149}


In [133]:
pd.DataFrame({
    'record_id' : df_test['record_id'], 
    'survival_status' : y_pred.flatten()
})['survival_status'].value_counts()

survival_status
0    228851
1     21149
Name: count, dtype: int64

array([0], dtype=int32)