In [2]:
# train_lung_cancer_model.py

import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# -----------------------------
# 1. Create dummy dataset
# -----------------------------
# Hand-written toy records, laid out one patient per row so each case can be
# read across. The column order below fixes the resulting DataFrame layout.
data = pd.DataFrame(
    [
        (65, "Yes", "Yes", "Yes", "Yes", "Yes", "YES"),
        (45, "No", "No", "No", "No", "No", "NO"),
        (50, "Yes", "Yes", "Yes", "Yes", "Yes", "YES"),
        (30, "No", "No", "No", "No", "No", "NO"),
        (70, "Yes", "Yes", "Yes", "Yes", "Yes", "YES"),
        (55, "Yes", "No", "Yes", "No", "Yes", "YES"),
        (60, "Yes", "Yes", "No", "Yes", "Yes", "YES"),
        (40, "No", "No", "No", "No", "No", "NO"),
        (68, "Yes", "Yes", "Yes", "Yes", "Yes", "YES"),
        (35, "No", "No", "No", "No", "No", "NO"),
    ],
    columns=[
        "age",
        "smoking",
        "yellow_fingers",
        "anxiety",
        "chronic_disease",
        "fatigue",
        "lung_cancer",
    ],
)


# -----------------------------
# 2. Split features & target
# -----------------------------
# Everything except the label column is a predictor.
X = data.drop(columns=["lung_cancer"])
y = data["lung_cancer"]

# Hold out 20% of the rows for evaluation. With a dataset this small the
# hold-out is only a couple of rows, so an unstratified draw can easily leave
# one class out of the test set and make the reported metrics meaningless.
# stratify=y keeps the YES/NO proportion the same in both partitions;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# -----------------------------
# 3. Column separation
# -----------------------------
# "age" is the only numeric predictor; the rest hold "Yes"/"No" strings and
# are routed through the categorical branch of the preprocessor.
numeric_features = ["age"]
categorical_features = [
    "smoking", "yellow_fingers", "anxiety", "chronic_disease", "fatigue",
]


# -----------------------------
# 4. Pipelines
# -----------------------------
# Numeric branch: fill gaps with the column median, then standardize.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" lets transform() accept categories that
# were not present when the encoder was fitted instead of raising.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch; the "num"/"cat" labels are
# the names under which the fitted branches are exposed on the transformer.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features),
])


# -----------------------------
# 5. Full ML pipeline
# -----------------------------
# Chain preprocessing and the classifier into one estimator so that the same
# transformations are applied at fit time and at predict time.
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression()),
])


# -----------------------------
# 6. Train
# -----------------------------
# Fit the preprocessing steps and the classifier on the training rows only.
model_pipeline.fit(X=X_train, y=y_train)


# -----------------------------
# 7. Evaluate
# -----------------------------
# Score the fitted pipeline on the held-out rows.
y_pred = model_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))


# -----------------------------
# 8. Save pipeline
# -----------------------------
# Persist the whole fitted pipeline (preprocessing + classifier) as a single
# artifact in the current working directory.
joblib.dump(value=model_pipeline, filename="lung_cancer_pipeline.pkl")

print("✅ Model pipeline saved as lung_cancer_pipeline.pkl")

Accuracy: 1.0
              precision    recall  f1-score   support

          NO       1.00      1.00      1.00         1
         YES       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

✅ Model pipeline saved as lung_cancer_pipeline.pkl
