<a href="https://colab.research.google.com/github/sabaansari9183/Machine-learning-projects-/blob/main/Lung_cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# 1) Load
df = pd.read_csv('/content/dataset_med.csv')

# 2) Dates -> duration in days.
#    Unparseable values are coerced to NaT, so the duration is NaN for those
#    rows and gets imputed together with the other numeric columns in step 7.
df['diagnosis_date'] = pd.to_datetime(df.get('diagnosis_date'), errors='coerce')
df['end_treatment_date'] = pd.to_datetime(df.get('end_treatment_date'), errors='coerce')
df['treatment_duration'] = (df['end_treatment_date'] - df['diagnosis_date']).dt.days

# 3) Drop id and the raw dates (if present)
df = df.drop(columns=['id', 'diagnosis_date', 'end_treatment_date'], errors='ignore')

# 4) Target: normalize and map survived -> 0/1
if 'survived' not in df.columns:
    raise KeyError("The dataset must contain a 'survived' column.")

# Normalize to lowercase text so numeric (0/1, 0.0/1.0), boolean and yes/no
# encodings of the target can all be handled by one lookup table.
labels = df['survived'].astype(str).str.strip().str.lower()

# 5) Drop rows missing the target.
#    This must be decided from the raw column: astype(str) turns NaN into the
#    literal string 'nan', which would otherwise survive as a bogus label.
missing_target = df['survived'].isna() | labels.isin(['', 'nan', 'none', 'null'])
df = df.loc[~missing_target].copy()
labels = labels.loc[~missing_target]
if df.empty:
    raise ValueError("No rows with a usable 'survived' value.")

target_map = {
    'yes': 1, 'y': 1, 'true': 1, '1': 1, '1.0': 1,
    'no': 0, 'n': 0, 'false': 0, '0': 0, '0.0': 0,
}
encoded = labels.map(target_map)

# Fallback for unexpected labels: encode the *normalized original* labels.
# Encoding the partially mapped column instead would turn every unmapped
# value into the same 'nan' string and silently collapse the classes.
if encoded.isnull().any():
    le = LabelEncoder()
    encoded = pd.Series(le.fit_transform(labels), index=labels.index)

df['survived'] = encoded.astype(int)

# 6) Prepare X, y
y = df['survived']
X = df.drop(columns=['survived'])

if y.nunique() < 2:
    # A single-class target still trains and scores 100% accuracy, which is
    # meaningless; make that visible instead of reporting a perfect model.
    print("WARNING: target 'survived' contains a single class; metrics below are not meaningful.")

# 7) Numeric imputation (column medians)
# NOTE: medians are computed on the full dataset, i.e. before the split.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
if num_cols:
    X[num_cols] = X[num_cols].fillna(X[num_cols].median())

# 8) Categorical encoding (one-hot, first level dropped)
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
if cat_cols:
    X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Ensure no NaNs remain (e.g. numeric columns that were entirely NaN)
X = X.fillna(0)

# 9) Train/test split (stratify if possible)
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
except ValueError:
    # Stratification raises when some class has too few members to appear in
    # both splits; fall back to a plain random split in that case.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

# 10) Train model
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# 11) Predict class labels for the held-out set
y_pred = model.predict(X_test)

# 12) Evaluate
print("Shapes -> X_test:", X_test.shape, "y_test:", y_test.shape, "y_pred:", np.asarray(y_pred).shape)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Shapes -> X_test: (178000, 45) y_test: (178000,) y_pred: (178000,)
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    178000

    accuracy                           1.00    178000
   macro avg       1.00      1.00      1.00    178000
weighted avg       1.00      1.00      1.00    178000

