In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load
# +
# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Lung Cancer Detection**

In [None]:
import pandas as pd

## Loading the Dataset

In [None]:
# Load the competition training CSV from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/idealize-2025-datathon-competition/train.csv")

In [None]:
# Remove the name and record-id columns before any preprocessing.
df = df.drop(columns=['first_name', 'last_name', 'record_id'])

In [None]:
# Preview the first rows after the name / record-id columns were dropped.
df.head()

## Data exploration

In [None]:
# Count missing values per column.
df.isna().sum()

## Data Pre-processing
### Data preprocessing Steps
- Filling `cigarettes_per_day`
- Columns to encode (binary): `family_cancer_history`, `has_other_cancer`, `asthma_diagnosis`, `liver_condition`, `blood_pressure_status`, `sex`
- Columns to encode (one-hot): `smoking_status`, `treatment_type` (`residence_state` is dropped rather than encoded)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ProcessSmoking(BaseEstimator, TransformerMixin):
    """Set ``cigarettes_per_day`` to 0 for rows whose status means "does not smoke".

    The statuses are matched against the raw (not yet normalised) labels
    ``'Never Smoked'``, ``'Passive Smoker'``, ``'Passive'`` and ``'Non Smoker'``,
    so this step must run before ``smoking_status`` is cleaned up.
    """

    # Raw smoking_status labels for which cigarettes_per_day is forced to 0.
    _ZERO_CIGARETTE_STATUSES = ['Never Smoked', 'Passive Smoker', 'Passive', 'Non Smoker']

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``cigarettes_per_day`` zeroed for non-smokers.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``smoking_status`` and ``cigarettes_per_day``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's ``X`` is left untouched.
        """
        # Work on a copy so the frame passed in is not modified as a side effect.
        X = X.copy()
        # The mask is built from X itself (not from a notebook-level frame), so
        # the transformer also works on frames other than the training data.
        non_smoker = X['smoking_status'].isin(self._ZERO_CIGARETTE_STATUSES)
        X.loc[non_smoker, 'cigarettes_per_day'] = 0
        return X
    

class ProcessCols(BaseEstimator, TransformerMixin):
    """Encode binary columns as 0/1, normalise label variants, drop ``residence_state``.

    Values that do not appear in a column's mapping are left unchanged.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in the mapping table below plus
            ``residence_state``.

        Returns
        -------
        pandas.DataFrame
            A new frame without ``residence_state``; the caller's ``X`` is not
            modified.
        """
        yes_no = {'Yes': 1, 'No': 0}
        # column -> {raw value: replacement}
        replacements = {
            'family_cancer_history': yes_no,
            'has_other_cancer': yes_no,
            'asthma_diagnosis': yes_no,
            'blood_pressure_status': {
                'High Blood Pressure': 1,
                'Elevated': 1,
                'Normal': 0,
                'Normal BP': 0,  # 'Normal' and 'Normal BP' are treated as the same
            },
            'liver_condition': {
                'Normal': 0,
                'Normal Liver': 0,
                'Liver OK': 0,
                'No Issue': 0,
                'Has Cirrhosis': 1,
                'Cirrhos': 1,  # assumed typo / shorthand for cirrhosis
            },
            'sex': {'Male': 1, 'Female': 0},
            # Collapse spelling variants onto the labels Current/Former/Passive/Never.
            'smoking_status': {
                'Current Smoker': 'Current',
                'Former Smoker': 'Former',
                'Passive Smoker': 'Passive',
                'Never Smoked': 'Never',
                'Passive': 'Passive',
                'Non Smoker': 'Never',
                'Former Smk': 'Former',
            },
            'treatment_type': {
                'Chemo': 'Chemotherapy',
                'Surg': 'Surgery',
                'Combo': 'Combined',
            },
        }

        # Copy first so the column assignments below never touch the caller's frame.
        X = X.copy()
        for column, mapping in replacements.items():
            X[column] = X[column].replace(mapping)

        return X.drop('residence_state', axis=1)

class EncodeCatCols(BaseEstimator, TransformerMixin):
    """One-hot encode the given columns and splice the result back into the frame.

    Parameters
    ----------
    columns : list of str, optional
        Names of the categorical columns to encode.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.encoder = OneHotEncoder(handle_unknown='ignore')

    def fit(self, X, y=None):
        """Fit the wrapped ``OneHotEncoder`` on ``X[self.columns]``. Returns ``self``."""
        self.encoder.fit(X[self.columns])
        return self

    def transform(self, X):
        """Replace ``self.columns`` in ``X`` with their one-hot indicator columns.

        Returns a new DataFrame: the remaining columns of ``X`` followed by the
        indicator columns, on the same index as ``X``.
        """
        encoded = self.encoder.transform(X[self.columns])
        # OneHotEncoder returns a SciPy sparse matrix by default; pd.DataFrame
        # needs a dense 2-D array to pair with the column names and index.
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()
        encoded_df = pd.DataFrame(
            encoded,
            columns=self.encoder.get_feature_names_out(self.columns),
            index=X.index,  # keep rows aligned for the concat below
        )
        remaining = X.drop(columns=self.columns)
        return pd.concat([remaining, encoded_df], axis=1)


class FormatDates(BaseEstimator, TransformerMixin):
    """Turn the three raw date columns into two day-count features.

    Adds ``treatment_duration`` (end minus start) and
    ``diagnosis_to_treatment_delay`` (start minus diagnosis), both in days, and
    removes the raw date columns.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned. Returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the date columns replaced by day counts.

        Values that cannot be parsed as dates are coerced to ``NaT``
        (``errors='coerce'``). The caller's ``X`` is not modified.
        """
        date_columns = ['treatment_end_date', 'treatment_start_date', 'diagnosis_date']

        # Copy first so the parsed and derived columns are not written into the caller's frame.
        X = X.copy()
        for column in date_columns:
            X[column] = pd.to_datetime(X[column], errors='coerce')

        X['treatment_duration'] = (X['treatment_end_date'] - X['treatment_start_date']).dt.days
        X['diagnosis_to_treatment_delay'] = (X['treatment_start_date'] - X['diagnosis_date']).dt.days

        return X.drop(date_columns, axis=1)

#### Filling `cigarettes_per_day`

In [None]:
# Missing-value picture for cigarettes_per_day, by smoking status:
#   Current Smokers - no NaNs
#   Never Smoked    - no NaNs
#   Former Smokers  - no NaNs
#   Passive smokers - contain NaNs
# Every status that means "does not smoke" gets 0 cigarettes per day.
df.loc[
    df['smoking_status'].isin(['Never Smoked', 'Passive Smoker', 'Passive', 'Non Smoker']),
    'cigarettes_per_day',
] = 0

df.isna().sum()

In [None]:
# Number of cigarettes_per_day values that are still missing.
df['cigarettes_per_day'].isna().sum()

#### Encoding Binary Columns 

`family_cancer_history`, `has_other_cancer`, `asthma_diagnosis`, `liver_condition`, `blood_pressure_status`, `sex`

In [None]:
# Show which smoking_status labels occur and how often.
df['smoking_status'].value_counts()

In [None]:
# Binary encoding, one mapping per column (values not listed are left as-is).
for _column, _mapping in {
    # Yes → 1, No → 0
    'family_cancer_history': {'Yes': 1, 'No': 0},
    'has_other_cancer': {'Yes': 1, 'No': 0},
    'asthma_diagnosis': {'Yes': 1, 'No': 0},
    # 'Normal' and 'Normal BP' are treated as the same
    'blood_pressure_status': {
        'High Blood Pressure': 1,
        'Elevated': 1,
        'Normal': 0,
        'Normal BP': 0,
    },
    # 'Cirrhos' is assumed to be a typo or shorthand for cirrhosis
    'liver_condition': {
        'Normal': 0,
        'Normal Liver': 0,
        'Liver OK': 0,
        'No Issue': 0,
        'Has Cirrhosis': 1,
        'Cirrhos': 1,
    },
    'sex': {'Male': 1, 'Female': 0},
}.items():
    df[_column] = df[_column].replace(_mapping)



In [None]:
# Preview the frame after the binary columns were encoded.
df.head()

#### Encoding multi-class columns 
- `smoking_status`, `treatment_type` are one-hot encoded; `residence_state` is dropped further below instead of being encoded

In [None]:
# Collapse the spelling variants of smoking_status onto four labels.
df['smoking_status'] = df['smoking_status'].replace(to_replace={
    # → Current
    'Current Smoker': 'Current',
    # → Former
    'Former Smoker': 'Former',
    'Former Smk': 'Former',
    # → Passive
    'Passive Smoker': 'Passive',
    'Passive': 'Passive',
    # → Never
    'Never Smoked': 'Never',
    'Non Smoker': 'Never',
})

In [None]:
# Expand the abbreviated treatment labels to their full names.
df['treatment_type'] = df['treatment_type'].replace(
    to_replace={'Chemo': 'Chemotherapy', 'Surg': 'Surgery', 'Combo': 'Combined'}
)

In [None]:
# Check which treatment_type labels remain after the abbreviations were expanded.
df['treatment_type'].value_counts()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
import pandas as pd


class EncodeCatCols(BaseEstimator, TransformerMixin):
    """One-hot encode the given columns and splice the result back into the frame.

    Parameters
    ----------
    columns : list of str, optional
        Names of the categorical columns to encode.
    """

    def __init__(self, columns=None):
        self.columns = columns
        # The `sparse=` keyword is deliberately not passed: it was removed from
        # OneHotEncoder in scikit-learn 1.4. Dense output is produced in
        # `transform` instead, which works on every version.
        self.encoder = OneHotEncoder(handle_unknown='ignore')

    def fit(self, X, y=None):
        """Fit the wrapped ``OneHotEncoder`` on ``X[self.columns]``. Returns ``self``."""
        self.encoder.fit(X[self.columns])
        return self

    def transform(self, X):
        """Replace ``self.columns`` in ``X`` with their one-hot indicator columns.

        Returns a new DataFrame: the remaining columns of ``X`` followed by the
        indicator columns, on the same index as ``X``.
        """
        encoded = self.encoder.transform(X[self.columns])
        # Densify the encoder's default sparse output so it can be paired with
        # the column names and index below.
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()
        encoded_df = pd.DataFrame(
            encoded,
            columns=self.encoder.get_feature_names_out(self.columns),
            index=X.index,  # keep rows aligned for the concat below
        )
        remaining = X.drop(columns=self.columns)
        return pd.concat([remaining, encoded_df], axis=1)

# One-hot encode the cleaned multi-class columns and swap them into `df`.
cat_cols = ['smoking_status', 'treatment_type']
encoder = EncodeCatCols(columns=cat_cols)
df = encoder.fit(df).transform(df)

df.head()


In [None]:
# Inspect column dtypes after one-hot encoding.
df.dtypes

In [None]:
# residence_state is not used as a feature; remove it.
df = df.drop(columns='residence_state')

In [None]:
# Inspect column dtypes after dropping residence_state.
df.dtypes

In [None]:
# Parse the three date columns; values that cannot be parsed are coerced to NaT.
for _date_col in ('treatment_start_date', 'treatment_end_date', 'diagnosis_date'):
    df[_date_col] = pd.to_datetime(df[_date_col], errors='coerce')
df.head()

In [None]:
# Inspect column dtypes after the date columns were parsed.
df.dtypes

In [None]:
# Derive two day-count features from the parsed dates, then discard the raw date columns.
df = df.assign(
    treatment_duration=(df['treatment_end_date'] - df['treatment_start_date']).dt.days,
    diagnosis_to_treatment_delay=(df['treatment_start_date'] - df['diagnosis_date']).dt.days,
).drop(columns=['treatment_end_date', 'treatment_start_date', 'diagnosis_date'])
df.dtypes

In [None]:
# Preview the frame after the date columns were replaced by day-count features.
df.head()

In [None]:
# Separate the target column from the feature columns.
y = df['survival_status']
X = df.drop(columns='survival_status')



In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score

# # Split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Standardize features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Define models
# models = {
#     "Logistic Regression": LogisticRegression(max_iter=1000),
#     "Random Forest": RandomForestClassifier(),
#     "K-Nearest Neighbors": KNeighborsClassifier()
# }

# # Train and evaluate
# for name, model in models.items():
#     model.fit(X_train_scaled, y_train)
#     y_pred = model.predict(X_test_scaled)
#     acc = accuracy_score(y_test, y_pred)
#     print(f"{name} Accuracy: {acc:.4f}")


In [None]:
# import tensorflow as tf
# from tensorflow.keras import layers, models
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import accuracy_score

# # 1. Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # 2. Scale features
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# # 3. Define the neural network
# model = models.Sequential([
#     layers.Input(shape=(X_train.shape[1],)),
#     layers.Dense(128, activation='relu'),
#     layers.Dense(64, activation='relu'),
#     layers.Dense(32, activation='relu'),
#     layers.Dense(1, activation='sigmoid')  # for binary classification
# ])

# # 4. Compile the model
# model.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=['accuracy'])

# # 5. Train the model
# history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1, verbose=1)

# # 6. Evaluate on test set
# loss, accuracy = model.evaluate(X_test, y_test)
# print(f"\n✅ Test Accuracy: {accuracy:.4f}")

# # 7. Predict and show results if needed
# y_pred = (model.predict(X_test) > 0.5).astype("int32")


In [None]:
# Load the competition test CSV and preview its first rows.
df_test = pd.read_csv("/kaggle/input/idealize-2025-datathon-competition/test.csv")
df_test.head()

In [None]:
# List the columns present in the test frame.
df_test.columns

In [None]:
# import matplotlib.pyplot as plt

# plt.plot(history.history['accuracy'], label='Training Accuracy')
# plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
# plt.title('Model Accuracy')
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')
# plt.legend()
# plt.grid(True)
# plt.show()


In [None]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Categorical columns handed to the one-hot encoding step of the pipeline.
cat_cols = ['smoking_status', 'treatment_type']

# Reload the raw training CSV: the `df` built by the earlier cells has already been transformed step by step.
# NOTE(review): unlike the first load, first_name / last_name / record_id are not dropped here — confirm this is intended.
df = pd.read_csv("/kaggle/input/idealize-2025-datathon-competition/train.csv")
# ✅ Use your existing classes
# (assume ProcessSmoking, ProcessCols, EncodeCatCols, FormatDates are already defined)

# 📦 Define full pipeline
def build_pipeline(cat_columns):
    """Assemble the preprocessing pipeline.

    Steps, in order: zero-fill cigarettes for non-smokers (must see the raw
    ``smoking_status`` labels), clean/encode columns, derive date features,
    one-hot encode ``cat_columns``.

    Parameters
    ----------
    cat_columns : list of str
        Columns passed to ``EncodeCatCols`` for one-hot encoding.

    Returns
    -------
    sklearn.pipeline.Pipeline
        An unfitted pipeline of the four transformers.
    """
    return Pipeline([
        ("process_smoking", ProcessSmoking()),
        ("process_columns", ProcessCols()),
        ("format_dates", FormatDates()),
        # Use the argument, not a notebook-level variable, so callers control the columns.
        ("encode_categoricals", EncodeCatCols(columns=cat_columns)),
    ])

# Build the preprocessing pipeline and apply it to the reloaded training frame.
# NOTE(review): the transformed frame is only displayed here, not assigned — keep a reference if later cells need it.
pipeline = build_pipeline(cat_columns=cat_cols)
pipeline.fit_transform(df)
