In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Single seed shared by the train/test split and the randomized estimators so
# that a Restart & Run All reproduces the same numbers.
SEED = 42

data = load_breast_cancer()
X, y = data.data, data.target

# Missing entries in a numeric array are NaN; comparing against None never
# matches them, so count NaNs explicitly.
print(f"Missing values in the dataset: {int(np.isnan(X).sum())}")

# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise test-set statistics leak into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on training data only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics
# Whole dataset scaled with the train-fitted scaler; retained so any later
# cell that refers to X_scaled keeps working.
X_scaled = scaler.transform(X)

# Candidate classifiers, all with default hyper-parameters. The tree-based
# models are seeded because their fitting procedure is randomized.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "Support Vector Machine": SVC(),
    "k-Nearest Neighbors": KNeighborsClassifier()
}

# Fit each model on the training split and record its test-set accuracy.
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = accuracy

    print(f"\n{model_name} Classification Report:\n")
    print(classification_report(y_test, y_pred))

print("\nModel Comparison based on Accuracy:")
for model_name, accuracy in results.items():
    print(f"{model_name}: {accuracy:.4f}")

# On ties, max/min return the first model in insertion order.
best_model = max(results, key=results.get)
worst_model = min(results, key=results.get)
print(f"\nBest Performing Model: {best_model} with Accuracy {results[best_model]:.4f}")
print(f"Worst Performing Model: {worst_model} with Accuracy {results[worst_model]:.4f}")


Missing values in the dataset: 0

Logistic Regression Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


Decision Tree Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114


Random Forest Classification Report:

              precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy              

In [None]:
#Handling Missing Values: Although this dataset does not have missing values, handling them (if present) is critical for algorithms like k-NN, which rely on distance metrics.

#Feature Scaling: Many features in the breast cancer dataset have different ranges. For instance, mean radius and mean smoothness have different scales. Algorithms like SVM, k-NN, and Logistic Regression are sensitive to these differences, so standardization ensures all features contribute equally.

#Logistic Regression: It is a linear model that predicts probabilities using a sigmoid function and maps them to binary classes. It works well with linearly separable data.
# The breast cancer dataset is close to linearly separable, because benign and malignant cases are clearly distinguished by their features.

#Decision Tree Classifier: A non-linear model that splits data into subsets using decision rules based on feature values. It partitions data recursively into pure groups.
#Decision trees handle non-linear relationships and are easy to interpret.

#Random Forest Classifier: An ensemble method that builds multiple decision trees and aggregates their predictions. It reduces the overfitting common in single decision trees.
#It handles high-dimensional datasets and is robust to overfitting.

#Support Vector Machine (SVM): Finds the hyperplane that maximizes the margin between classes. SVM is powerful for both linearly and non-linearly separable data (using kernels).
#The dataset has well-defined class boundaries, which SVM can capture effectively.

#k-Nearest Neighbors (k-NN): Classifies a data point based on the majority class of its nearest neighbors. It uses distance metrics to find neighbors.
#Breast cancer features are numerical and distance-based methods work well for such datasets.

#Best Performing Model: SVM achieved the highest accuracy due to its ability to create a precise hyperplane for separation.

#Worst Performing Model: Decision Tree performed the worst (about 95% accuracy in the recorded output), likely due to overfitting.

