### Import the Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, classification_report

### Loading and Model Preparation

In [2]:
column_names = [
    'ID', 'Diagnosis',
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'
]

In [3]:
# Load the wdbc.data file
data_path = '/Users/raja/Desktop/Career/Masters/Universities/mtu/Spring2024/SAT5114/small project 2/breast+cancer+wisconsin+diagnostic/wdbc.data'  # Update this path
data = pd.read_csv(data_path, header=None, names=column_names)

In [4]:
data['Diagnosis'] = data['Diagnosis'].map({'M': 1, 'B': 0})

In [5]:
# Save the DataFrame as a CSV file for future use
csv_path = './wdbc.csv'  # Update this path
data.to_csv(csv_path, index=False)

In [6]:
# Now, you can load this CSV file directly in the future
cancer = pd.read_csv(csv_path)

In [7]:
cancer.head()

Unnamed: 0,ID,Diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
cancer.isnull().any().any()

False

In [9]:
# Defining features and target
X = cancer.drop(['ID', 'Diagnosis'], axis=1)
y = cancer['Diagnosis']

# Spliting dataset into the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=109)

## Let's Define the Model and Evalauate initially without Hyperparamaeter Tuning

In [10]:
#Model fitting

model = SVC(random_state=109)
model.fit(X_train, y_train)

# Prediction on the test set

y_pred = model.predict(X_test)

# Computing the evaluation metrics

accuracy_default = accuracy_score(y_test, y_pred)

# Sensitivity
recall_default = recall_score(y_test, y_pred)  
conf_matrix_default = confusion_matrix(y_test, y_pred)

# specificity from the confusion matrix
tn, fp, fn, tp = conf_matrix_default.ravel()
specificity_default = tn / (tn + fp)

# print report

print("Default Model Evaluation")
print(f"Accuracy: {accuracy_default}")
print(f"Sensitivity (Recall): {recall_default}")
print(f"Specificity: {specificity_default}")
print("Confusion Matrix:\n", conf_matrix_default)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Default Model Evaluation
Accuracy: 0.9239766081871345
Sensitivity (Recall): 0.7936507936507936
Specificity: 1.0
Confusion Matrix:
 [[108   0]
 [ 13  50]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94       108
           1       1.00      0.79      0.88        63

    accuracy                           0.92       171
   macro avg       0.95      0.90      0.91       171
weighted avg       0.93      0.92      0.92       171



### Hyperparameter Tuning with GridSearchCV

In [11]:
# Define the parameter grid

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear']
}

# Define Stratified K-Fold cross-validator

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Setup GridSearchCV

grid_search = GridSearchCV(SVC(random_state=109), param_grid, cv=cv, scoring='accuracy', refit=True)

# Fit GridSearchCV

grid_search.fit(X_train, y_train)

# Display the best parameters and the best score

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Best parameters: {'C': 100, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation score: 0.95


### Final Model Evaluation

In [12]:
# Predict on the test set using the best model

y_pred_final = grid_search.predict(X_test)

# Compute evaluation metrics for the final model

accuracy_final = accuracy_score(y_test, y_pred_final)
recall_final = recall_score(y_test, y_pred_final)  # Sensitivity
conf_matrix_final = confusion_matrix(y_test, y_pred_final)

# Calculate specificity from the confusion matrix

tn_final, fp_final, fn_final, tp_final = conf_matrix_final.ravel()
specificity_final = tn_final / (tn_final + fp_final)

# Final Model Report

print("\nFinal Model Evaluation")
print("")
print(f"Accuracy: {accuracy_final}")
print(f"Sensitivity (Recall): {recall_final}")
print(f"Specificity: {specificity_final}")
print("Confusion Matrix:\n", conf_matrix_final)
print("\nClassification Report:\n", classification_report(y_test, y_pred_final))


Final Model Evaluation

Accuracy: 0.9707602339181286
Sensitivity (Recall): 0.9682539682539683
Specificity: 0.9722222222222222
Confusion Matrix:
 [[105   3]
 [  2  61]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98       108
           1       0.95      0.97      0.96        63

    accuracy                           0.97       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171

