In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron, LinearRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
data = pd.read_csv('breast_cancer_dataset_preprocessed.csv')

# Separate features (X) and target variable (y)
X = data.drop('y', axis=1)

y = np.where(data['y']=="M", 1, -1)
# y = data['y']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=395)

In [3]:

# # Create a list of models
# models = {
#     'Neural Network': MLPClassifier(),
#     'SVM Linear': SVC(kernel='linear'),
#     'SVM Poly Degree 3': SVC(kernel='poly', degree=3),
#     'SVM RBF': SVC(kernel='rbf'),
#     'Perceptron': Perceptron(),
# }

# # Evaluate models
# for name, model in models.items():
#     # Use pipeline to include scaling for models that benefit from it
#     if name in ['Neural Network', 'SVM Linear', 'Perceptron', 'Linear Regression']:
#         pipeline = Pipeline([('scaler', StandardScaler()), ('model', model)])
#     else:
#         pipeline = Pipeline([('model', model)])

#     # Use GridSearchCV to find the best hyperparameters
#     grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='accuracy')
#     grid_search.fit(X_train, y_train)

#     # Get the best model
#     best_model = grid_search.best_estimator_

#     # Predict and evaluate on the test set
#     y_pred = best_model.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)

#     # Display results
#     print(f"{name}:\n{classification_report(y_test, y_pred)}")
#     print(f"Accuracy: {accuracy}\n")

In [4]:
neural_network =  MLPClassifier()
parameters = {}
model_nn = GridSearchCV(neural_network, parameters, cv=5, scoring='accuracy')
model_nn.fit(X_train, y_train)

# Get the best model
best_model = model_nn.best_estimator_

# Predict and evaluate on the test set
y_pred = best_model.predict(X_test)
accuracy_nn = accuracy_score(y_test, y_pred)

print(f"Neural Network:\n{classification_report(y_test, y_pred)}")
print(f"Accuracy: {accuracy_nn}\n")

Neural Network:
              precision    recall  f1-score   support

          -1       0.96      1.00      0.98        51
           1       1.00      0.92      0.96        26

    accuracy                           0.97        77
   macro avg       0.98      0.96      0.97        77
weighted avg       0.98      0.97      0.97        77

Accuracy: 0.974025974025974



In [5]:
c_arr = [0.01, 0.1, 1, 10, 100]

best_error = {"Linear": [0, 0], "Poly": [0, 0], "RBF": [0, 0]}
best_error_model = {"Linear": [], "Poly": [], "RBF": []}

for c in c_arr:
    linear_svm = SVC(kernel='linear', C=c)
    linear_svm.fit(X_train, y_train)
    linear_pred = linear_svm.predict(X_test)
    accuracy_linear = accuracy_score(y_test, linear_pred)
    
    poly_svm = SVC(kernel='poly', degree=3, C=c)
    poly_svm.fit(X_train, y_train)
    poly_pred = poly_svm.predict(X_test)
    accuracy_poly = accuracy_score(y_test, poly_pred)

    rbf_svm = SVC(kernel='rbf', C=c)
    rbf_svm.fit(X_train, y_train)
    svm_pred = rbf_svm.predict(X_test)
    accuracy_rbf = accuracy_score(y_test, svm_pred)

    print(f"SVM Linear:\n{classification_report(y_test, linear_pred)}")
    print(accuracy_linear, best_error["Linear"][0])
    
    print(f"SVM Poly deg 3:\n{classification_report(y_test, poly_pred)}")
    print(accuracy_poly, best_error["Poly"][0])
    
    print(f"SVM RBF:\n{classification_report(y_test, svm_pred)}")
    print(accuracy_rbf, best_error["RBF"][0])
    
    print()
    
    if(accuracy_linear > best_error["Linear"][0]):
        best_error["Linear"] = [accuracy_linear, c]
        best_error_model["Linear"] = linear_svm

    if(accuracy_poly > best_error["Poly"][0]):
        best_error["Poly"] = [accuracy_poly, c]
        best_error_model["Poly"] = poly_svm

    if(accuracy_rbf > best_error["RBF"][0]):
        best_error["RBF"] = [accuracy_rbf, c]
        best_error_model["RBF"] = rbf_svm

        
print(best_error)

SVM Linear:
              precision    recall  f1-score   support

          -1       0.96      1.00      0.98        51
           1       1.00      0.92      0.96        26

    accuracy                           0.97        77
   macro avg       0.98      0.96      0.97        77
weighted avg       0.98      0.97      0.97        77

0.974025974025974 0
SVM Poly deg 3:
              precision    recall  f1-score   support

          -1       0.68      1.00      0.81        51
           1       1.00      0.08      0.14        26

    accuracy                           0.69        77
   macro avg       0.84      0.54      0.48        77
weighted avg       0.79      0.69      0.58        77

0.6883116883116883 0
SVM RBF:
              precision    recall  f1-score   support

          -1       0.66      1.00      0.80        51
           1       0.00      0.00      0.00        26

    accuracy                           0.66        77
   macro avg       0.33      0.50      0.40       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM Poly deg 3:
              precision    recall  f1-score   support

          -1       0.94      0.98      0.96        51
           1       0.96      0.88      0.92        26

    accuracy                           0.95        77
   macro avg       0.95      0.93      0.94        77
weighted avg       0.95      0.95      0.95        77

0.948051948051948 0.922077922077922
SVM RBF:
              precision    recall  f1-score   support

          -1       0.92      0.94      0.93        51
           1       0.88      0.85      0.86        26

    accuracy                           0.91        77
   macro avg       0.90      0.89      0.90        77
weighted avg       0.91      0.91      0.91        77

0.9090909090909091 0.961038961038961

{'Linear': [0.974025974025974, 0.01], 'Poly': [0.948051948051948, 100], 'RBF': [0.961038961038961, 1]}


In [6]:
linear_model =  Perceptron()
parameters = {}
model_lin = GridSearchCV(linear_model, parameters, cv=5, scoring='accuracy')
model_lin.fit(X_train, y_train)

# Get the best model
best_model = model_lin.best_estimator_

# Predict and evaluate on the test set
y_pred = best_model.predict(X_test)
accuracy_percep = accuracy_score(y_test, y_pred)

print(f"Perceptron:\n{classification_report(y_test, y_pred)}")
print(f"Accuracy: {accuracy_percep}\n")

Perceptron:
              precision    recall  f1-score   support

          -1       0.94      0.96      0.95        51
           1       0.92      0.88      0.90        26

    accuracy                           0.94        77
   macro avg       0.93      0.92      0.93        77
weighted avg       0.93      0.94      0.93        77

Accuracy: 0.935064935064935



I have split the data into 80% for training and 20% for testing. I used 5 models to test the performance.
1. Neural Network
2. SVM Linear
3. SVM Poly with degree = 3
4. SVM RBF
5. Perceptron

I trained the SVM models with regularization parameters as c = [0.01, 0.1, 1, 10, 100]

In [7]:
mod = max(best_error, key=best_error.get)
print("For SVM,")
print("The best Kernel is {}, the C value is {} and the best minimum error is {}".format(mod, best_error[mod][1], best_error[mod][0]))
print()
print("For Neural Network, the accuracy is", accuracy_nn)
print()
print("For Perceptron, the accuracy is", accuracy_percep)

For SVM,
The best Kernel is Linear, the C value is 0.01 and the best minimum error is 0.974025974025974

For Neural Network, the accuracy is 0.974025974025974

For Perceptron, the accuracy is 0.935064935064935


The Neural Network emerged and the SVM Linear kernel with c as 0.01 are the best-performing model, providing high accuracy and balanced precision, recall, and F1-scores for both classes. Its ability to capture complex relationships in the data made it a suitable choice for this classification task. SVM with RBF Kernel and c as 1 also demonstrated strong performance and could be considered as an alternative.