In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import time

In [3]:
# 1. Loading and Preprocessing
def load_and_preprocess_data():
    """
    Prepare the scikit-learn breast cancer dataset for modelling.

    The data is loaded, 20% of the rows are held out as a test split
    (random_state=42), and the features are standardised with a
    StandardScaler that is fitted on the training split only.

    Returns a 5-tuple: scaled training features, scaled test features,
    training labels, test labels, and the dataset's feature names.
    """
    dataset = load_breast_cancer()

    # Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
    split = train_test_split(
        dataset.data, dataset.target, test_size=0.2, random_state=42
    )
    train_features, test_features, train_labels, test_labels = split

    # The scaler learns its statistics from the training rows only and then
    # reuses them for the test rows, so the test split does not influence scaling.
    standardizer = StandardScaler()
    train_scaled = standardizer.fit_transform(train_features)
    test_scaled = standardizer.transform(test_features)

    return (
        train_scaled,
        test_scaled,
        train_labels,
        test_labels,
        dataset.feature_names,
    )

In [5]:
# 2. Classification Algorithm Implementation
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """
    Train and evaluate multiple classification models.

    Every model is fitted on the training split and scored on the test split.
    Only the ``fit`` call is timed, so ``training_time`` does not include
    prediction or metric computation.

    Parameters:
        X_train: training features passed to each model's ``fit``.
        X_test: test features passed to each model's ``predict``.
        y_train: training labels passed to each model's ``fit``.
        y_test: true test labels used for accuracy and the report.

    Returns:
        A dictionary keyed by model name. Each value is a dictionary with
        'accuracy' (result of accuracy_score), 'training_time' (seconds spent
        in ``fit``) and 'report' (result of classification_report).
    """
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'SVM': SVC(kernel='rbf', random_state=42),
        'k-NN': KNeighborsClassifier(n_neighbors=5)
    }

    results = {}

    for name, model in models.items():
        # perf_counter() is the clock intended for measuring elapsed intervals;
        # time.time() follows the wall clock and can jump if it is adjusted.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        # Stop the clock before predicting: the value is reported as training
        # time, so it must not include inference.
        training_time = time.perf_counter() - start_time

        # Make predictions on the held-out data
        y_pred = model.predict(X_test)

        results[name] = {
            'accuracy': accuracy_score(y_test, y_pred),
            'training_time': training_time,
            'report': classification_report(y_test, y_pred)
        }

    return results

In [7]:
# Main execution
def main():
    """
    Run the whole workflow and print a text summary.

    Steps: prepare the data, fit and score every model, list the accuracy
    scores with the best and worst model, then print each model's timing
    and classification report.
    """
    divider = "-" * 50

    # Step 1: data preparation
    print("1. Loading and Preprocessing Data")
    print(divider)
    train_X, test_X, train_y, test_y, _feature_names = load_and_preprocess_data()
    print("Dataset shape:")
    print(f"Training set: {train_X.shape}")
    print(f"Testing set: {test_X.shape}")
    print("\nPreprocessing steps performed:")
    print("- Split data into training (80%) and testing (20%) sets")
    print("- Applied StandardScaler for feature scaling")
    print("\n")

    # Step 2: fit and score every model
    print("2. Training and Evaluating Models")
    print(divider)
    results = train_and_evaluate_models(train_X, test_X, train_y, test_y)

    # Step 3: side-by-side accuracy comparison
    print("3. Model Comparison")
    print(divider)
    print("\nAccuracy Scores:")
    accuracy_by_model = dict(
        (model_name, outcome['accuracy']) for model_name, outcome in results.items()
    )
    for model_name in accuracy_by_model:
        print(f"{model_name}: {accuracy_by_model[model_name]:.4f}")

    # On ties, max/min keep the first model in insertion order.
    best_name = max(accuracy_by_model, key=accuracy_by_model.get)
    worst_name = min(accuracy_by_model, key=accuracy_by_model.get)
    best_score = accuracy_by_model[best_name]
    worst_score = accuracy_by_model[worst_name]

    print(f"\nBest performing model: {best_name} with accuracy {best_score:.4f}")
    print(f"Worst performing model: {worst_name} with accuracy {worst_score:.4f}")

    # Per-model details
    print("\nDetailed Classification Reports:")
    print(divider)
    for model_name, outcome in results.items():
        print(f"\n{model_name}:")
        print(f"Training time: {outcome['training_time']:.2f} seconds")
        print("Classification Report:")
        print(outcome['report'])

In [9]:
# Entry-point guard: main() runs only when __name__ is "__main__", so
# importing these definitions from another module does not start the workflow.
if __name__ == "__main__":
    main()

1. Loading and Preprocessing Data
--------------------------------------------------
Dataset shape:
Training set: (455, 30)
Testing set: (114, 30)

Preprocessing steps performed:
- Split data into training (80%) and testing (20%) sets
- Applied StandardScaler for feature scaling


2. Training and Evaluating Models
--------------------------------------------------
3. Model Comparison
--------------------------------------------------

Accuracy Scores:
Logistic Regression: 0.9737
Decision Tree: 0.9474
Random Forest: 0.9649
SVM: 0.9825
k-NN: 0.9474

Best performing model: SVM with accuracy 0.9825
Worst performing model: Decision Tree with accuracy 0.9474

Detailed Classification Reports:
--------------------------------------------------

Logistic Regression:
Training time: 10.25 seconds
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy    