In [None]:
# Preprocess the data to handle any missing values and perform necessary feature scaling.

import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset into a DataFrame: one column per feature plus a 'target' column.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# 1. Handle Missing Values (if any)
print("Missing values per column:\n", df.isnull().sum())

# Separate features from the label before any preprocessing so the imputer
# never touches the target (mean-imputing a class label would be meaningless)
# and the labels keep their original integer dtype.
X = df.drop('target', axis=1)  # Features
y = df['target']               # Target variable

# 2. Split data into training and testing sets
# The split happens BEFORE imputation/scaling: fitting those transformers on
# the full dataset would leak test-set statistics into the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Imputation (mean) and Feature Scaling (Standardization)
# Both transformers learn their parameters from the training split only and
# are then applied unchanged to the test split.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# 4. Model Training (example: Logistic Regression)
model = LogisticRegression()
model.fit(X_train, y_train)

# 5. Model Evaluation on the held-out test split
y_pred = model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Missing values per column:
 mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64
Confusion Matrix:
 [[41  2]
 [ 1 70]]

Classification Report

In [None]:
# Implement the following five classification algorithms:
# 1. Logistic Regression
# 2. Decision Tree Classifier
# 3. Random Forest Classifier
# 4. Support Vector Machine (SVM)
# 5. k-Nearest Neighbors (k-NN)
# For each algorithm, provide a brief description of how it works and why it might be suitable for this dataset.


from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Candidate models, keyed by the display name used in the printed report.
# The tree-based estimators are seeded so their accuracies (and therefore the
# best/worst ranking derived from `results`) are reproducible across runs.
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "k-NN": KNeighborsClassifier()
}

# Fit every classifier on the training split and record its test accuracy.
# X_train / X_test are the standardized splits created by the preprocessing
# cell; the previously referenced X_train_scaled / X_test_scaled are not
# defined anywhere in the notebook and fail on a fresh kernel.
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy  # maps display name -> test-set accuracy
    print(f"{name} accuracy: {accuracy}")

print("\nAlgorithm Descriptions and Suitability:")
print("Logistic Regression: Models the probability of a binary outcome. Suitable for linearly separable data or when a probability estimate is needed.")
print("Decision Tree: Creates a tree-like model of decisions and their possible consequences. Suitable for both linear and non-linear data and provides interpretability.")
print("Random Forest: An ensemble method that combines multiple decision trees to improve accuracy and reduce overfitting.  Suitable for complex datasets, often performs well.")
print("SVM: Finds an optimal hyperplane to separate data points into different classes. Suitable for both linear and non-linear data (using kernels), effective in high-dimensional spaces.")
print("k-NN: Classifies data points based on the majority class among its k nearest neighbors. Suitable for non-linear data, but can be computationally expensive for large datasets.")

Logistic Regression accuracy: 0.9736842105263158
Decision Tree accuracy: 0.9385964912280702
Random Forest accuracy: 0.9649122807017544
SVM accuracy: 0.9824561403508771
k-NN accuracy: 0.9473684210526315

Algorithm Descriptions and Suitability:
Logistic Regression: Models the probability of a binary outcome. Suitable for linearly separable data or when a probability estimate is needed.
Decision Tree: Creates a tree-like model of decisions and their possible consequences. Suitable for both linear and non-linear data and provides interpretability.
Random Forest: An ensemble method that combines multiple decision trees to improve accuracy and reduce overfitting.  Suitable for complex datasets, often performs well.
SVM: Finds an optimal hyperplane to separate data points into different classes. Suitable for both linear and non-linear data (using kernels), effective in high-dimensional spaces.
k-NN: Classifies data points based on the majority class among its k nearest neighbors. Suitable for

In [None]:
# Compare the performance of the five classification algorithms.
# Which algorithm performed the best and which one performed the worst?

# Pick the extremes from the (name, accuracy) pairs collected in `results`.
# Ties resolve to the first entry in insertion order for both max and min.
score_of = lambda entry: entry[1]
best_algorithm = max(results.items(), key=score_of)[0]
worst_algorithm = min(results.items(), key=score_of)[0]

for summary_line in (
    f"\nBest performing algorithm: {best_algorithm} with accuracy {results[best_algorithm]}",
    f"Worst performing algorithm: {worst_algorithm} with accuracy {results[worst_algorithm]}",
):
    print(summary_line)


Best performing algorithm: SVM with accuracy 0.9824561403508771
Worst performing algorithm: Decision Tree with accuracy 0.9385964912280702
