In [None]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fetch the breast cancer dataset bundled with scikit-learn and unpack the
# feature matrix and the label vector in one step.
data = load_breast_cancer()
X, y = data.data, data.target

# Tabular view for inspection: one column per named feature, with the label
# appended as the final 'target' column.
df = pd.DataFrame(data=X, columns=data.feature_names)
df['target'] = y


In [None]:
#Preprocessing Steps
'''Missing Values:
The breast cancer dataset from sklearn does not contain missing values, so we don't need to handle any missing data.
Feature Scaling:
Scaling is essential for algorithms like SVM and k-NN, which are sensitive to the scale of the input features. We will use StandardScaler to standardize the features.'''

In [None]:
# Standardize the feature columns with a StandardScaler learned from X.
# NOTE(review): the scaler is fitted on the full feature matrix -- no
# train/test split has been made at this point in the notebook -- so X_scaled
# reflects statistics computed over every sample.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
'''Justification for Preprocessing
Handling Missing Values: Although this dataset has no missing values, it is good practice to check for them in any dataset, as they can significantly affect model performance.
Feature Scaling: Necessary for models that rely on distance metrics (like k-NN and SVM) to ensure that all features contribute equally to the distance computation.
Step 2: Classification Algorithm Implementation
We will implement the following algorithms:

Logistic Regression

A statistical method for binary classification that models the probability of a class label based on one or more predictor variables.
Suitable due to its simplicity and interpretability.
Decision Tree Classifier

A non-parametric method that splits the dataset into subsets based on feature values. It creates a tree-like model of decisions.
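Suitable for its interpretability and, in principle, its ability to handle both numerical and categorical data (scikit-learn's implementation requires categorical features to be encoded numerically first).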
Random Forest Classifier

An ensemble method that constructs multiple decision trees and merges them to improve accuracy and control overfitting.
Suitable for its robustness and ability to generalize well on unseen data.
Support Vector Machine (SVM)

A powerful classifier that finds the optimal hyperplane that separates different classes in a high-dimensional space.
Suitable for its effectiveness in high-dimensional spaces.
k-Nearest Neighbors (k-NN)

A simple algorithm that classifies a data point based on how its neighbors are classified.
Suitable for its simplicity and effectiveness in cases with a well-defined decision boundary.'''

In [None]:
# Implementation
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Split the raw (unscaled) features first, so the held-out rows stay unseen
# during every fitting step -- including the scaler.
RANDOM_STATE = 42
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit the scaler on the training rows only, then apply that same transform to
# both splits. Splitting X_scaled instead would leak the test rows' mean and
# variance into training (its scaler was fitted on all of X) and make the
# reported accuracies optimistic.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Define models. The tree-based estimators are randomized, so they are seeded
# to keep the accuracy table reproducible across "Restart & Run All".
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Support Vector Machine': SVC(),
    'k-Nearest Neighbors': KNeighborsClassifier()
}

# Train each model on the training split and score it on the held-out split.
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = accuracy

# Display results as a two-column table (model name, test accuracy).
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])
print(results_df)
