### Importing the packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    ConfusionMatrixDisplay,
    make_scorer
)
import warnings
import time
warnings.filterwarnings("ignore")

### Loading the Dataset

In [2]:
# Relative path: the CSV must be present in the kernel's working directory
# (the recorded run of this cell failed with FileNotFoundError).
DATASET_CSV = "breast-cancer-dataset.csv"
df = pd.read_csv(DATASET_CSV)
df

FileNotFoundError: [Errno 2] No such file or directory: 'breast-cancer-dataset.csv'

### Data Analysis 

In [None]:
# Unpack the (rows, columns) shape once and report it
n_rows, n_cols = df.shape
print(f"The dataset has {n_rows} rows and {n_cols} columns")

In [None]:
### check the info about the variables

In [None]:
df.info()

#### Observations:
- The dataset has both numeric and categorical features.

In [None]:
### check the basic stats about the data

In [None]:
df.describe()

In [None]:
# Replace every "#" cell with NaN so that isna()/fillna() treat it as missing
# (presumably "#" is the raw file's placeholder for unknown values — confirm against the data source).
df =df.replace("#",np.nan)

In [None]:
### check for the missing values
df.isna().sum()

#### Observations:
- There are missing values in the data.

In [None]:
### check for the class distribution
# Count each diagnosis label once, then reuse the counts for the printout and the bar chart
class_counts = df['Diagnosis Result'].value_counts()
print(class_counts)
class_counts.plot.bar()

### Data Preprocessing

In [None]:
### drop the features we do not need: the serial number and the year
df = df.drop(columns=['S/N', 'Year'])

In [None]:
df

In [None]:
### convert the categorical features into numeric features
# The two feature columns get throwaway encoders.
# NOTE(review): "#" cells were turned into NaN in an earlier cell; confirm how LabelEncoder
# should treat NaN in these columns (it is not imputed before encoding).
for feature_col in ('Breast', 'Breast Quadrant'):
    df[feature_col] = LabelEncoder().fit_transform(df[feature_col])

# `le` is fitted on the target only: the evaluation cells pass `le.classes_`
# as the confusion-matrix display labels.
le = LabelEncoder()
df['Diagnosis Result'] = le.fit_transform(df['Diagnosis Result'])
df

In [None]:
### handling the missing values

In [None]:
# Cast every column to float so the mean-based imputation that follows works on numeric data.
# This raises if any column still contains values that cannot be parsed as numbers.
df = df.astype(float)

In [None]:
# Fill each column's missing values with that column's mean.
# NOTE(review): the means are computed over the whole dataset, before the train/test split,
# so test rows influence the imputed values — consider fitting the imputation on the training split only.
df = df.fillna(df.mean())

In [None]:
### check for the missing values
df.isna().sum()

In [None]:
### separating the X and y
target_col = 'Diagnosis Result'
y = df[target_col]                  # encoded diagnosis label
X = df.drop(columns=[target_col])   # every remaining column is a predictor

In [None]:
### splitting the dataset into train and test
# 70% train / 30% test; the fixed seed keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

## Machine Learning Models

In [None]:
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target, classes):
    """
    Evaluate a fitted classifier: plot its confusion matrix and return summary metrics.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``
    predictors : independent variables passed to ``model.predict``
    target : true labels (dependent variable) for ``predictors``
    classes : iterable of display labels for the confusion-matrix axes

    Returns
    -------
    pandas.DataFrame
        A single row with the columns "Accuracy", "Recall", "Precision", "F1"
        and "Inference Time" (seconds taken by one ``model.predict`` call).
    """
    # Time exactly ONE prediction pass. The previous version called predict twice
    # inside the timed region, which roughly doubled the reported inference time.
    # perf_counter is a monotonic, high-resolution clock intended for measuring durations.
    start_time = time.perf_counter()
    pred = model.predict(predictors)
    inference_time = time.perf_counter() - start_time

    # Confusion matrix is drawn as a side effect
    cm = confusion_matrix(target, pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(classes))
    disp.plot()

    # recall / precision / F1 are called with scikit-learn's default arguments
    acc = accuracy_score(target, pred)  # to compute Accuracy
    recall = recall_score(target, pred)  # to compute Recall
    precision = precision_score(target, pred)  # to compute Precision
    f1 = f1_score(target, pred)  # to compute F1-score

    # creating a dataframe of metrics
    df_perf = pd.DataFrame(
        {
            "Accuracy": acc,
            "Recall": recall,
            "Precision": precision,
            "F1": f1,
            "Inference Time": inference_time,
        },
        index=[0],
    )

    return df_perf

### Decision Tree

In [None]:
# Record start time
start_time = time.time()

# Define the classifier
dtree_estimator = DecisionTreeClassifier(class_weight="balanced", random_state=1)

# Grid of parameters to choose from
parameters = {
    "max_depth": np.arange(5, 50, 5),
    "min_samples_leaf": [3, 5, 7],
    "max_leaf_nodes": [2, 5, 7],
    "min_impurity_decrease": [0.0001, 0.001],
}

# Type of scoring used to compare parameter combinations
scorer = make_scorer(f1_score)

# Run the grid search.
# `scoring=scorer` was missing before, so candidates were ranked by the estimator's
# default accuracy score and the F1 scorer above was never used for tuning.
# cv=5 is spelled out so the search uses the same fold count as the learning curve.
grid_obj = GridSearchCV(
    dtree_estimator, parameters, scoring=scorer, cv=5, n_jobs=-1
)

grid_obj = grid_obj.fit(X_train, y_train)  ## fit the grid_obj on the train data

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time for hyperparameter tuning: {elapsed_time} seconds")

# Set the clf to the best combination of parameters
dtree_estimator = grid_obj.best_estimator_
print("The optimal Decision Tree")

dtree_estimator

In [None]:
### using the optimal Dtree plot the Learning Curve
# Fit the tuned tree on the full training split
dtree_estimator.fit(X_train, y_train)

# Cross-validated F1 at ten training-set sizes (10% .. 100% of the training split)
train_sizes, train_scores, test_scores = learning_curve(
    dtree_estimator,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring=scorer,
    n_jobs=-1,
)

# Mean and standard deviation across the CV folds for every training-set size
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Plot learning curve: shaded +/- 1 std bands first, then the mean lines on top
curves = [
    (train_mean, train_std, "r", "Training score"),
    (test_mean, test_std, "g", "Validation score"),
]
fig, ax = plt.subplots(figsize=(10, 6))
for band_mean, band_std, shade, legend_label in curves:
    ax.fill_between(train_sizes, band_mean - band_std, band_mean + band_std, alpha=0.1, color=shade)
for band_mean, band_std, shade, legend_label in curves:
    ax.plot(train_sizes, band_mean, 'o-', color=shade, label=legend_label)
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.set_title("Learning Curve")
ax.legend(loc="best")
ax.grid(True)
plt.show()

In [None]:
### Evaluation of the optimal model on train and test data

In [None]:
# check performance on train data
dtree_train_perf = model_performance_classification_sklearn(
    model=dtree_estimator,
    predictors=X_train,
    target=y_train,
    classes=le.classes_,
)
dtree_train_perf

In [None]:
# check performance on test data
dtree_test_perf = model_performance_classification_sklearn(
    model=dtree_estimator,
    predictors=X_test,
    target=y_test,
    classes=le.classes_,
)
dtree_test_perf

### Multi Layered Perceptron

In [None]:
# Record start time
start_time = time.time()
# Define the classifier
mlp_estimator = MLPClassifier(random_state=1)

# Grid of parameters to choose from
parameters = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (32, 16, 8)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd', 'lbfgs'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [200, 400, 600]
}

# Type of scoring used to compare parameter combinations
scorer = make_scorer(f1_score)

# Run the grid search.
# `scoring=scorer` was missing before, so candidates were ranked by the estimator's
# default accuracy score and the F1 scorer above was never used for tuning.
# cv=5 is spelled out so the search uses the same fold count as the learning curve.
grid_obj = GridSearchCV(
    mlp_estimator, parameters, scoring=scorer, cv=5, n_jobs=-1
)

grid_obj = grid_obj.fit(X_train, y_train)  ## fit the grid_obj on the train data

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time for hyperparameter tuning: {elapsed_time} seconds")

# Set the clf to the best combination of parameters
mlp_estimator = grid_obj.best_estimator_

print("The optimal MLP Classifier")

mlp_estimator


In [None]:
### using the optimal mlp plot the Learning Curve
# Fit the tuned network on the full training split
mlp_estimator.fit(X_train, y_train)

# Cross-validated F1 at ten training-set sizes (10% .. 100% of the training split)
train_sizes, train_scores, test_scores = learning_curve(
    mlp_estimator,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring=scorer,
    n_jobs=-1,
)

# Mean and standard deviation across the CV folds for every training-set size
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Plot learning curve: shaded +/- 1 std bands first, then the mean lines on top
curves = [
    (train_mean, train_std, "r", "Training score"),
    (test_mean, test_std, "g", "Validation score"),
]
fig, ax = plt.subplots(figsize=(10, 6))
for band_mean, band_std, shade, legend_label in curves:
    ax.fill_between(train_sizes, band_mean - band_std, band_mean + band_std, alpha=0.1, color=shade)
for band_mean, band_std, shade, legend_label in curves:
    ax.plot(train_sizes, band_mean, 'o-', color=shade, label=legend_label)
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.set_title("Learning Curve")
ax.legend(loc="best")
ax.grid(True)
plt.show()

In [None]:
### Evaluation of the optimal model on train and test data

In [None]:
# check performance on train data
mlp_train_perf = model_performance_classification_sklearn(
    model=mlp_estimator,
    predictors=X_train,
    target=y_train,
    classes=le.classes_,
)
mlp_train_perf

In [None]:
# check performance on test data
mlp_test_perf = model_performance_classification_sklearn(
    model=mlp_estimator,
    predictors=X_test,
    target=y_test,
    classes=le.classes_,
)
mlp_test_perf

### ADABOOST Classifier

In [None]:
# Record start time
start_time = time.time()
# Choose the type of classifier.
abc_estimator = AdaBoostClassifier(random_state=1)

# Grid of parameters to choose from
parameters = {
    # Let's try different max_depth for base_estimator
    "estimator": [
        DecisionTreeClassifier(max_depth=1, class_weight="balanced", random_state=1),
        DecisionTreeClassifier(max_depth=2, class_weight="balanced", random_state=1),
    ],
    "n_estimators": np.arange(80, 151, 10),
    "learning_rate": np.arange(0.1, 0.5, 0.1),
}

# Type of scoring used to compare parameter  combinations
acc_scorer = make_scorer(f1_score)

# Run the grid search.
# `scoring=acc_scorer` was missing before, so candidates were ranked by the estimator's
# default accuracy score and the F1 scorer above was never used for tuning.
# cv=5 is now passed explicitly, matching what the original comment described.
grid_obj = GridSearchCV(
    abc_estimator, parameters, scoring=acc_scorer, cv=5, n_jobs=-1
)
grid_obj = grid_obj.fit(X_train, y_train)  ##  fit the grid_obj on train data

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time for hyperparameter tuning: {elapsed_time} seconds")

# Set the clf to the best combination of parameters
abc_estimator = grid_obj.best_estimator_

print("The optimal Boosting Classifier")

abc_estimator

In [None]:
### using the optimal AdaBoost classifier plot the Learning Curve
# Fit the tuned booster on the full training split
abc_estimator.fit(X_train, y_train)

# Cross-validated scores at ten training-set sizes (10% .. 100% of the training split).
# `scorer` is the F1 scorer created in an earlier tuning cell.
train_sizes, train_scores, test_scores = learning_curve(
    abc_estimator,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring=scorer,
    n_jobs=-1,
)

# Mean and standard deviation across the CV folds for every training-set size
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Plot learning curve: shaded +/- 1 std bands first, then the mean lines on top
curves = [
    (train_mean, train_std, "r", "Training score"),
    (test_mean, test_std, "g", "Validation score"),
]
fig, ax = plt.subplots(figsize=(10, 6))
for band_mean, band_std, shade, legend_label in curves:
    ax.fill_between(train_sizes, band_mean - band_std, band_mean + band_std, alpha=0.1, color=shade)
for band_mean, band_std, shade, legend_label in curves:
    ax.plot(train_sizes, band_mean, 'o-', color=shade, label=legend_label)
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.set_title("Learning Curve")
ax.legend(loc="best")
ax.grid(True)
plt.show()

In [None]:
### Evaluation of the optimal model on train and test data

In [None]:
# check performance on train data
abc_train_perf = model_performance_classification_sklearn(
    model=abc_estimator,
    predictors=X_train,
    target=y_train,
    classes=le.classes_,
)
abc_train_perf

In [None]:
# check performance on test data
abc_test_perf = model_performance_classification_sklearn(
    model=abc_estimator,
    predictors=X_test,
    target=y_test,
    classes=le.classes_,
)
abc_test_perf

### SVM Classifier

In [None]:
# Record start time
start_time = time.time()
# Choose the type of classifier.
svm_estimator = SVC(random_state=1)

# Grid of parameters to choose from.
# One sub-grid per kernel family so that only the hyperparameters a kernel actually
# uses are swept: `degree` matters only for 'poly', `coef0` only for 'poly' and 'sigmoid'.
# The previous single grid crossed both with every kernel, fitting many duplicate
# 'linear'/'rbf' candidates (180 combinations instead of 70).
c_values = [0.1, 1, 10, 50, 100]
coef0_values = [0.0, 0.1, 0.5]
parameters = [
    {'kernel': ['linear', 'rbf'], 'C': c_values},
    {'kernel': ['poly'], 'C': c_values, 'degree': [2, 3, 4], 'coef0': coef0_values},
    {'kernel': ['sigmoid'], 'C': c_values, 'coef0': coef0_values},
]


# Type of scoring used to compare parameter  combinations
acc_scorer = make_scorer(f1_score)

# Run the grid search.
# `scoring=acc_scorer` was missing before, so candidates were ranked by the estimator's
# default accuracy score and the F1 scorer above was never used for tuning.
# cv=5 is now passed explicitly, matching what the original comment described.
grid_obj = GridSearchCV(
    svm_estimator, parameters, scoring=acc_scorer, cv=5, n_jobs=-1
)
grid_obj = grid_obj.fit(X_train, y_train)  ##  fit the grid_obj on train data

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time for hyperparameter tuning: {elapsed_time} seconds")

# Set the clf to the best combination of parameters
svm_estimator = grid_obj.best_estimator_

print("The optimal SVM Classifier")

svm_estimator

In [None]:
### using the optimal SVM plot the Learning Curve
# Fit the tuned SVM on the full training split
svm_estimator.fit(X_train, y_train)

# Cross-validated scores at ten training-set sizes (10% .. 100% of the training split).
# `scorer` is the F1 scorer created in an earlier tuning cell.
train_sizes, train_scores, test_scores = learning_curve(
    svm_estimator,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring=scorer,
    n_jobs=-1,
)

# Mean and standard deviation across the CV folds for every training-set size
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Plot learning curve: shaded +/- 1 std bands first, then the mean lines on top
curves = [
    (train_mean, train_std, "r", "Training score"),
    (test_mean, test_std, "g", "Validation score"),
]
fig, ax = plt.subplots(figsize=(10, 6))
for band_mean, band_std, shade, legend_label in curves:
    ax.fill_between(train_sizes, band_mean - band_std, band_mean + band_std, alpha=0.1, color=shade)
for band_mean, band_std, shade, legend_label in curves:
    ax.plot(train_sizes, band_mean, 'o-', color=shade, label=legend_label)
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.set_title("Learning Curve")
ax.legend(loc="best")
ax.grid(True)
plt.show()

In [None]:
### Evaluation of the optimal model on train and test data

In [None]:
# check performance on train data
svm_train_perf = model_performance_classification_sklearn(
    model=svm_estimator,
    predictors=X_train,
    target=y_train,
    classes=le.classes_,
)
svm_train_perf

In [None]:
# check performance on test data
svm_test_perf = model_performance_classification_sklearn(
    model=svm_estimator,
    predictors=X_test,
    target=y_test,
    classes=le.classes_,
)
svm_test_perf

### KNN Classifier

In [None]:
# Record start time
start_time = time.time()
# Choose the type of classifier.
knn_estimator = KNeighborsClassifier()

# Grid of parameters to choose from
parameters = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19],
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
}


# Type of scoring used to compare parameter  combinations
acc_scorer = make_scorer(f1_score)

# Run the grid search.
# `scoring=acc_scorer` was missing before, so candidates were ranked by the estimator's
# default accuracy score and the F1 scorer above was never used for tuning.
# cv=5 is now passed explicitly, matching what the original comment described.
grid_obj = GridSearchCV(
    knn_estimator, parameters, scoring=acc_scorer, cv=5, n_jobs=-1
)
grid_obj = grid_obj.fit(X_train, y_train)  ##  fit the grid_obj on train data

# Record end time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Elapsed time for hyperparameter tuning: {elapsed_time} seconds")
# Set the clf to the best combination of parameters
knn_estimator = grid_obj.best_estimator_

print("The optimal KNN Classifier")

knn_estimator

In [None]:
### using the optimal KNN plot the Learning Curve
# Fit the tuned KNN on the full training split
knn_estimator.fit(X_train, y_train)

# Cross-validated scores at ten training-set sizes (10% .. 100% of the training split).
# `scorer` is the F1 scorer created in an earlier tuning cell.
train_sizes, train_scores, test_scores = learning_curve(
    knn_estimator,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring=scorer,
    n_jobs=-1,
)

# Mean and standard deviation across the CV folds for every training-set size
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# Plot learning curve: shaded +/- 1 std bands first, then the mean lines on top
curves = [
    (train_mean, train_std, "r", "Training score"),
    (test_mean, test_std, "g", "Validation score"),
]
fig, ax = plt.subplots(figsize=(10, 6))
for band_mean, band_std, shade, legend_label in curves:
    ax.fill_between(train_sizes, band_mean - band_std, band_mean + band_std, alpha=0.1, color=shade)
for band_mean, band_std, shade, legend_label in curves:
    ax.plot(train_sizes, band_mean, 'o-', color=shade, label=legend_label)
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.set_title("Learning Curve")
ax.legend(loc="best")
ax.grid(True)
plt.show()

In [None]:
### Evaluation of the optimal model on train and test data

In [None]:
# check performance on train data
knn_train_perf = model_performance_classification_sklearn(
    model=knn_estimator,
    predictors=X_train,
    target=y_train,
    classes=le.classes_,
)
knn_train_perf

In [None]:
# check performance on test data
knn_test_perf = model_performance_classification_sklearn(
    model=knn_estimator,
    predictors=X_test,
    target=y_test,
    classes=le.classes_,
)
knn_test_perf

### Comparison of Models

In [None]:
# training performance comparison

# Column label -> one-row metrics frame; insertion order fixes the column order
train_perf_by_model = {
    "KNN ": knn_train_perf,
    "Boosting": abc_train_perf,
    "Decision Tree ": dtree_train_perf,
    "Neural Network": mlp_train_perf,
    "SVM": svm_train_perf,
}

# Transpose each frame so metrics become rows, then place the models side by side
models_train_comp_df = pd.concat(
    [perf.T for perf in train_perf_by_model.values()], axis=1
)
models_train_comp_df.columns = list(train_perf_by_model)
print("Training performance comparison:")
models_train_comp_df

In [None]:
# testing performance comparison

# Column label -> one-row metrics frame; insertion order fixes the column order
test_perf_by_model = {
    "KNN ": knn_test_perf,
    "Boosting": abc_test_perf,
    "Decision Tree ": dtree_test_perf,
    "Neural Network": mlp_test_perf,
    "SVM": svm_test_perf,
}

# Transpose each frame so metrics become rows, then place the models side by side
models_test_comp_df = pd.concat(
    [perf.T for perf in test_perf_by_model.values()], axis=1
)
models_test_comp_df.columns = list(test_perf_by_model)
print("Testing performance comparison:")
models_test_comp_df