In [4]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fetch scikit-learn's built-in breast cancer dataset and wrap it in pandas
# objects: a feature frame with named columns and a target series.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target)

# Preview the first five rows of the features, then of the target
print(X.head(), y.head(), sep="\n")

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst radius  worst texture  worst perimeter  \
0           

In [2]:
# Count the missing entries in every feature column
missing_per_column = X.isna().sum()
print(missing_per_column)

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64


1. Missing values check: We first check for missing values in the dataset to ensure data quality. For this dataset, there are no missing values.

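2. Feature scaling: We use StandardScaler to standardize the features. Scaling is important here because the features are on different scales, and algorithms like SVM and k-NN are sensitive to the scale of the data. The scaler is fit on the training data only and then applied to the test data, so no information from the test set leaks into preprocessing.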

3. Train-test split: We split the dataset into training (80%) and testing (20%) sets to evaluate the model performance later.

In [19]:

# Splitting the dataset into training and testing sets.
# The split must come before scaling: the scaler below is fit on X_train, so
# X_train has to exist first (on a fresh kernel the previous ordering raised a
# NameError), and fitting on the training split alone keeps the test set unseen.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Since there are no missing values, we proceed to scaling
# Feature Scaling: learn mean/variance from the training split only, then
# apply that same transformation to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display shape of datasets
print(f"Training data shape: {X_train.shape}, Testing data shape: {X_test.shape}")

Training data shape: (455, 30), Testing data shape: (114, 30)


# 2. Classification Algorithm Implementation

1. Logistic Regression: Logistic Regression is a linear model used for binary classification. It models the log-odds of the positive class as a linear function of the features and maps them to probabilities with the logistic function, which makes it a simple, interpretable baseline that is suitable for this dataset.

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic Regression: train on the standardized training split, predict the
# test split, and record the share of correct predictions.
log_reg = LogisticRegression(max_iter=10_000).fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)
accuracy_log_reg = accuracy_score(y_true=y_test, y_pred=y_pred_log_reg)

2. Decision Tree Classifier: Decision Trees split the data into subsets based on feature values. They are interpretable and can capture non-linear relationships, making them suitable for this dataset.

In [22]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Classifier
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and therefore the reported accuracy, is identical on every Restart & Run All.
# Scaled features are used for consistency with the other models.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train_scaled, y_train)
y_pred_tree = tree_clf.predict(X_test_scaled)
accuracy_tree = accuracy_score(y_test, y_pred_tree)
# Show the predicted labels for the test split followed by the test accuracy
print(y_pred_tree, accuracy_tree)

[1 0 0 1 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
 1 0 1 1 0 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0
 1 1 1 1 1 1 0 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 0 0 1 0
 1 1 0] 0.9473684210526315


3. Random Forest Classifier: Random Forest is an ensemble method that uses multiple decision trees to improve predictive accuracy and control overfitting. It is suitable due to its robustness and ability to handle complex interactions.

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier
# random_state is fixed (same seed as the train/test split) because the forest
# is built with randomness; without a seed the accuracy changes between runs
# and the comparison table cannot be reproduced.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train_scaled, y_train)
y_pred_rf = rf_clf.predict(X_test_scaled)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

4. Support Vector Machine (SVM): SVM finds the hyperplane that best separates the classes in the feature space. It is effective for high-dimensional spaces, making it a good choice for this dataset.

In [17]:
from sklearn.svm import SVC

# Support Vector Machine: default SVC trained on the standardized training
# split, then scored on the standardized test split.
svm_clf = SVC().fit(X_train_scaled, y_train)
y_pred_svm = svm_clf.predict(X_test_scaled)
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)

5. k-Nearest Neighbors (k-NN): k-NN is a non-parametric method that classifies based on the majority class of the nearest neighbors. It can capture local structures in the data, making it suitable for this dataset.

In [16]:
from sklearn.neighbors import KNeighborsClassifier

# k-Nearest Neighbors: default KNeighborsClassifier trained on the standardized
# training split, then scored on the standardized test split.
knn_clf = KNeighborsClassifier().fit(X_train_scaled, y_train)
y_pred_knn = knn_clf.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)

# 3. Model Comparison

In [14]:
# Model performance comparison: pair each model's display name with the test
# accuracy computed in its own cell (the two lists share the same order).
model_names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'SVM',
    'k-NN',
]
accuracies = [
    accuracy_log_reg,
    accuracy_tree,
    accuracy_rf,
    accuracy_svm,
    accuracy_knn,
]

# One row per model, ready for side-by-side comparison
results = pd.DataFrame(dict(Model=model_names, Accuracy=accuracies))
print(results)


                 Model  Accuracy
0  Logistic Regression  0.973684
1        Decision Tree  0.947368
2        Random Forest  0.964912
3                  SVM  0.982456
4                 k-NN  0.947368


# Conclusion


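From the results shown above, SVM has the highest test accuracy (about 98.2%), followed by Logistic Regression (about 97.4%) and Random Forest (about 96.5%), while the Decision Tree and k-NN tie for the lowest (about 94.7%). Typically, Random Forest and SVM perform well on this dataset due to their robustness and ability to capture complex relationships; in this run, however, the simpler and more interpretable Logistic Regression was also highly competitive and outperformed Random Forest. Because the test set contains only 114 samples, a gap of one percentage point corresponds to roughly one sample, so small differences between models should not be over-interpreted.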