In [29]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [30]:
# Load the cancer dataset from its CSV file, then preview the first rows.
csv_path = r"C:\Users\Rayhan\OneDrive\Desktop\Cancer Prediction ML project\data\The_Cancer_data_V2.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory,Diagnosis
0,58,1,16.085313,0,1,8.146251,4.148219,1,1
1,71,0,30.828784,0,1,9.36163,3.519683,0,0
2,48,1,38.785084,0,2,5.135179,4.728368,0,1
3,34,0,30.040296,0,0,9.502792,2.044636,0,0
4,62,1,35.479721,0,0,5.35689,3.309849,0,1


In [31]:
# Features and Target
# "Diagnosis" is the label to predict; every other column is a candidate feature.
# "Unnamed: 0" appears to be the row index that was written into the CSV (the preview
# shows it counting 0, 1, 2, ...), so it carries no information about the patient.
# It is removed here so the model does not train on it and the saved pipeline does not
# require it at inference time. errors="ignore" keeps this cell working if the CSV is
# later re-exported without that column.
X = data.drop(columns="Diagnosis").drop(columns="Unnamed: 0", errors="ignore")
y = data["Diagnosis"]


In [32]:
# Train-Test Split
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save train and test sets.
# The targets are written next to the features: the files are saved with index=False,
# so without the matching y files the saved feature rows could not be paired with
# their labels again.
split_dir = "C:/Users/Rayhan/OneDrive/Desktop/anaconda -Cancer Prediction ML project/data"
splits = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
}
for split_name, split_values in splits.items():
    split_values.to_csv(f"{split_dir}/{split_name}.csv", index=False)


In [33]:
# Numeric Preprocessing
# Each numeric column is mean-imputed and then standardised.
num_features = X.select_dtypes(include=[np.number]).columns
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])
# Only the columns listed in num_features are routed through num_pipeline.
preprocessor = ColumnTransformer(transformers=[("num", num_pipeline, num_features)])

In [34]:
# Define models to compare
# The tree-based models and the SVM are seeded with random_state=42.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Store performance
model_scores = {}  # model name -> accuracy on the held-out test set
pipelines = {}     # model name -> fitted preprocessing + classifier pipeline

# Train and evaluate each model
# NOTE(review): the accuracy recorded here is measured on the test split, and the
# best model is later picked by this same number, so the winning score is an
# optimistic estimate. Consider selecting with cross-validation on the training split.
for name, clf in models.items():
    pipeline = Pipeline([
        # Pipeline does not copy its steps, so passing the shared `preprocessor`
        # object would leave all fitted pipelines pointing at one mutable transformer.
        # clone() gives each pipeline its own unfitted copy with the same settings.
        ("preprocessor", clone(preprocessor)),
        ("model", clf)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    model_scores[name] = acc
    pipelines[name] = pipeline
    print(f"{name} Accuracy: {acc:.4f}")

LogisticRegression Accuracy: 0.8633
RandomForest Accuracy: 0.9300
SVM Accuracy: 0.8900
DecisionTreeClassifier Accuracy: 0.8767
GradientBoosting Accuracy: 0.9333


In [35]:
# Choose best model
# Pick the (name, score) pair with the highest recorded accuracy; on a tie the
# model that was evaluated first wins.
best_model_name = max(model_scores.items(), key=lambda entry: entry[1])[0]
best_pipeline = pipelines[best_model_name]

In [36]:
# Save best model
# Persist the winning fitted pipeline (preprocessing + classifier) with joblib.
model_path = r"C:\Users\Rayhan\OneDrive\Desktop\anaconda -Cancer Prediction ML project\models\cancer_model.pkl"
joblib.dump(best_pipeline, model_path)
print(f"\n✅ Best model: {best_model_name} saved as cancer_model.pkl")


✅ Best model: GradientBoosting saved as cancer_model.pkl
