# Breast cancer dataset

## Download and import required libraries

In [1]:
!pip3 install scikit-learn
!pip3 install pandas 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [88]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
# Note, for now MinMaxScaler > normalize > StandardScaler > MaxAbsScaler (but not by much)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score

## Load the training and test data

In [39]:
# Training data (includes the 'class' label column)
dataset = pd.read_csv('./dataset/breast-cancer-diagnostic.shuf.lrn.csv')
# Test attributes (used without labels)
test_data = pd.read_csv('./dataset/breast-cancer-diagnostic.shuf.tes.csv')
# Labels used to score predictions on the test attributes.
# NOTE(review): every recorded classification report in this notebook shows
# support=0 for class 1, i.e. all labels in this file are 0. Confirm that
# `sol.ex` holds real ground truth and not an example submission; otherwise
# the reported test metrics are meaningless.
test_data_class = pd.read_csv('./dataset/breast-cancer-diagnostic.shuf.sol.ex.csv')

# Delete whitespace in column names of every loaded frame, including the
# label file, so later lookups such as ['class'] and 'ID' match exactly
for loaded_frame in (dataset, test_data, test_data_class):
    loaded_frame.columns = loaded_frame.columns.str.strip()

In [None]:
# Preview the first five training rows to confirm the CSV was parsed as expected
print(dataset.head(5))

In [None]:
## Count missing values per column
print(dataset.isna().sum())

## Histograms and boxplots

In [46]:
# Maps each dataset column name to a human-readable label.
# Insertion order matters: the plotting cell splits the key list at 'radiusMean'.
columns_to_plot = dict([
    ('class', 'Class (Malignant/Benign)'),
    ('concavePointsWorst', 'Concave Points Worst'),
    ('concavePointsMean', 'Concave Points Mean'),
    ('radiusWorst', 'Radius Worst'),
    ('perimeterWorst', 'Perimeter Worst'),
    ('areaWorst', 'Area Worst'),
    ('areaStdErr', 'Area StdErr'),
    ('concavityWorst', 'Concavity Worst'),
    ('concavityMean', 'Concavity Mean'),
    ('perimeterMean', 'Perimeter Mean'),
    ('areaMean', 'Area Mean'),
    ('radiusMean', 'Radius Mean'),
    ('smoothnessWorst', 'Smoothness Worst'),
    ('perimeterStdErr', 'Perimeter StdErr'),
    ('textureWorst', 'Texture Worst'),
    ('symmetryWorst', 'Symmetry Worst'),
    ('radiusStdErr', 'Radius StdErr'),
    ('compactnessWorst', 'Compactness Worst'),
    ('textureMean', 'Texture Mean'),
    ('compactnessMean', 'Compactness Mean'),
    ('smoothnessMean', 'Smoothness Mean'),
    ('concavePointsStdErr', 'Concave Points StdErr'),
    ('concavityStdErr', 'Concavity StdErr'),
    ('fractalDimensionWorst', 'Fractal Dimension Worst'),
    ('fractalDimensionMean', 'Fractal Dimension Mean'),
    ('textureStdErr', 'Texture StdErr'),
    ('symmetryStdErr', 'Symmetry StdErr'),
    ('compactnessStdErr', 'Compactness StdErr'),
    ('smoothnessStdErr', 'Smoothness StdErr'),
    ('fractalDimensionStdErr', 'Fractal Dimension StdErr'),
    ('symmetryMean', 'Symmetry Mean'),
])


In [None]:
# Split the column names into two groups at 'radiusMean' so each group is
# drawn in its own figure: everything before it, then it and everything after.
columns_to_plot_list = list(columns_to_plot)
split_position = columns_to_plot_list.index('radiusMean')
columns_to_plot_1 = columns_to_plot_list[:split_position]
columns_to_plot_2 = columns_to_plot_list[split_position:]

def plot_columns(columns_to_plot, filename):
    """Draw a histogram and a per-class boxplot for each feature, then save the figure.

    Every feature occupies one row of the figure: the left panel shows its
    overall distribution (histogram with KDE) and the right panel shows a
    boxplot of the same feature grouped by `class`. Values are read from the
    notebook-level `dataset` DataFrame.

    Args:
        columns_to_plot: Iterable of column names of `dataset`. The label
            column 'class' is skipped; it is only used to group the boxplots.
        filename: Path the finished figure is written to.

    Returns:
        None. If no feature column remains after skipping 'class', nothing
        is drawn or saved.
    """
    feature_columns = [column for column in columns_to_plot if column != 'class']
    if not feature_columns:
        # plt.subplots() rejects a grid with zero rows, so there is nothing to do.
        return

    # One grid row per plotted feature: [histogram, boxplot]. Sizing the grid
    # from the features actually drawn avoids blank rows in the saved image.
    fig, axs = plt.subplots(
        len(feature_columns), 2,
        figsize=(20, len(feature_columns) * 5),
        squeeze=False,  # keep axs 2-D even when only one feature is plotted
    )

    for (hist_ax, box_ax), column in zip(axs, feature_columns):
        values = dataset[column].dropna()
        unique_values = values.unique()
        bins = 15

        if len(unique_values) <= 4:
            # Few distinct values: use one bin and one tick per value.
            bins = len(unique_values)
            hist_ax.set_xticks(unique_values)

        # Histogram
        sns.histplot(dataset[column], kde=True, ax=hist_ax, color='skyblue', edgecolor='black', bins=bins)
        # Leave 10% headroom above the tallest bar.
        max_count = max(np.histogram(values, bins=bins)[0])
        hist_ax.set_ylim(top=max_count * 1.1)

        hist_ax.tick_params(axis='y', labelsize=16)
        hist_ax.tick_params(axis='x', labelsize=16)
        hist_ax.set_title(f'Distribution - {column}', fontsize=16)
        hist_ax.set_xlabel(column, fontsize=14)
        hist_ax.set_ylabel('Frequency', fontsize=14)

        # Boxplot
        sns.boxplot(x='class', y=column, data=dataset, ax=box_ax)
        box_ax.tick_params(axis='y', labelsize=16)
        box_ax.tick_params(axis='x', labelsize=16)
        box_ax.set_title(f'Distribution - {column} by class', fontsize=16)
        box_ax.set_xlabel('Class', fontsize=14)
        box_ax.set_ylabel(column, fontsize=14)

    plt.tight_layout()
    # Write the file before showing, so the saved image does not depend on
    # what the active backend does with the figure once it has been shown.
    fig.savefig(filename, dpi=fig.dpi)
    plt.show()

# Render each column group into its own image file
plot_columns(columns_to_plot=columns_to_plot_1, filename='plot1.png')
plot_columns(columns_to_plot=columns_to_plot_2, filename='plot2.png')

## Processing

In [40]:
## Drop the 'ID' column from both the training and the test frame
dataset = dataset.drop(columns='ID')
test_data = test_data.drop(columns='ID')

In [41]:
## Cast the 'class' label to integers in both labelled frames
for labelled_frame in (dataset, test_data_class):
    labelled_frame['class'] = labelled_frame['class'].astype(int)

In [42]:
## Training data: Y is the 'class' column, X is every other column
Y_class = dataset['class']
X_attributes = dataset.drop(columns='class')

## Test data: X is the test frame as loaded, Y comes from the separate label file
Y_class_test = test_data_class['class']
X_attributes_test = test_data

In [109]:
# Feature scaling. MinMaxScaler is used here; MaxAbsScaler or StandardScaler
# can be swapped in on the next line.
scaler = MinMaxScaler()

# Fit the scaler on the training attributes and transform them
X_attributes_scaled = scaler.fit_transform(X_attributes)

# Transform the test attributes with the scaler fitted on the training data
X_attributes_test_scaled = scaler.transform(X_attributes_test)


# ## For normalization
# # Fit the scaler to the dataset and transform the dataset
# X_attributes_scaled = X_attributes_test.copy()
# X_attributes_scaled = normalize(X_attributes, norm='l2')

# # Scale the test data, based on the scaler fitted to the training data
# X_attributes_test_scaled = X_attributes_test.copy()
# X_attributes_test_scaled = normalize(X_attributes_test, norm='l2')

In [99]:
# Inspect the scaled training attributes
print(X_attributes_scaled)

[[1.35099588e-02 1.65003436e-02 8.84824879e-02 ... 1.32970511e-04
  2.96909409e-04 7.70048295e-05]
 [1.63246545e-02 2.91327898e-02 1.10899010e-01 ... 3.68090940e-04
  9.48888413e-04 2.47299933e-04]
 [1.02774462e-02 7.26202329e-03 6.79552246e-02 ... 9.45505814e-05
  1.31293534e-04 4.16772807e-05]
 ...
 [1.89387678e-02 3.38575142e-02 1.23780928e-01 ... 1.41611957e-04
  5.25282806e-04 1.35555124e-04]
 [1.54676943e-02 2.48575749e-02 9.92026662e-02 ... 7.04411773e-05
  2.78395734e-04 7.99448747e-05]
 [1.53230940e-02 2.45069271e-02 9.84035814e-02 ... 1.35314321e-04
  3.34276490e-04 1.02412895e-04]]


In [100]:
# Fit a Random Forest on the unscaled training data; its feature importances
# are used to rank the attributes
model = RandomForestClassifier(random_state=42)
model.fit(X_attributes, Y_class)

# One row per attribute, highest importance first
most_important_attributes = pd.DataFrame(
    {'importance': model.feature_importances_},
    index=X_attributes.columns,
).sort_values('importance', ascending=False)

print(most_important_attributes)


## Names of the attributes whose importance is greater than 0.05
is_important = most_important_attributes['importance'] > 0.05
most_important_attributes_list = most_important_attributes.loc[is_important].index.tolist()
print(most_important_attributes_list)

                        importance
concavePointsWorst        0.192535
concavePointsMean         0.117753
concavityMean             0.099546
areaWorst                 0.089077
perimeterWorst            0.081338
radiusWorst               0.052917
concavityWorst            0.050328
perimeterMean             0.048775
areaStdErr                0.046046
areaMean                  0.039142
radiusMean                0.033618
compactnessWorst          0.017061
smoothnessWorst           0.015205
perimeterStdErr           0.015106
compactnessMean           0.015102
textureWorst              0.014510
radiusStdErr              0.014154
textureMean               0.010744
symmetryWorst             0.009702
fractalDimensionWorst     0.006551
concavityStdErr           0.004985
compactnessStdErr         0.003936
symmetryMean              0.003893
symmetryStdErr            0.003036
textureStdErr             0.002986
smoothnessStdErr          0.002789
fractalDimensionMean      0.002696
smoothnessMean      

In [101]:
# Keep only the most important features (unscaled data)
X_attributes_less_attributes = X_attributes.copy().loc[:, most_important_attributes_list]
X_attributes_test_less_attributes = X_attributes_test.copy().loc[:, most_important_attributes_list]


# Keep only the most important features (scaled data).
# The scaled values are wrapped in DataFrames with the original column names
# so the same columns can be selected by name.
X_attributes_scaled_less_attributes = pd.DataFrame(
    X_attributes_scaled.copy(), columns=X_attributes.columns
)[most_important_attributes_list]

X_attributes_test_scaled_less_attributes = pd.DataFrame(
    X_attributes_test_scaled.copy(), columns=X_attributes_test.columns
)[most_important_attributes_list]

## Models training and evaluation

In [None]:
## Summary of the variables with the data
# X_attributes - Attributes of the training data, without the class column
# Y_class - Class of the training data

# X_attributes_test - Attributes of the test data, without the class column
# Y_class_test - Class of the test data

# X_attributes_scaled - Attributes of the training data, without the class column, scaled
# X_attributes_test_scaled - Attributes of the test data, without the class column, scaled

# X_attributes_less_attributes - Attributes of the training data, without the class column, with only the most important features
# X_attributes_test_less_attributes - Attributes of the test data, without the class column, with only the most important features

# X_attributes_scaled_less_attributes - Attributes of the training data, without the class column, with only the most important features, scaled
# X_attributes_test_scaled_less_attributes - Attributes of the test data, without the class column, with only the most important features, scaled

## 1. KNN

In [102]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

### Test 1
- `Neighbors`: 5
- `Weights`: uniform
- `Data normalization`: No
- `All features`: Yes

In [103]:
# Experiment settings
k = 5
weights = 'uniform'

# Training and validation splits used by this experiment
training_attributes, training_classes = X_attributes, Y_class
validation_data, validation_classes = X_attributes_test, Y_class_test

# Build and train the model
knn_model = KNeighborsClassifier(n_neighbors=k, weights=weights)
knn_model.fit(training_attributes, training_classes)

# Predict the validation samples
Y_predicted = knn_model.predict(validation_data)

## Metrics
# Accuracy of the predictions
accuracy = accuracy_score(validation_classes, Y_predicted)
print(f'Accuracy: {accuracy}')

# Weighted F1 score
weighted_f1 = f1_score(validation_classes, Y_predicted, average='weighted')
print(f"F1 Score: {weighted_f1}")

# Per-class precision, recall, f1-score and support
print(classification_report(validation_classes, Y_predicted))

# Confusion matrix
print(confusion_matrix(validation_classes, Y_predicted))



Accuracy: 0.6408450704225352
F1 Score: 0.7811158798283262
              precision    recall  f1-score   support

           0       1.00      0.64      0.78       284
           1       0.00      0.00      0.00         0

    accuracy                           0.64       284
   macro avg       0.50      0.32      0.39       284
weighted avg       1.00      0.64      0.78       284

[[182 102]
 [  0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Test 2
- `Neighbors`: 5
- `Weights`: distance
- `Data normalization`: Yes
- `All features`: Yes

In [110]:
# Experiment settings
k = 5
weights = 'distance'

# Training and validation splits used by this experiment
training_attributes, training_classes = X_attributes_scaled, Y_class
validation_data, validation_classes = X_attributes_test_scaled, Y_class_test

# Build and train the model
knn_model = KNeighborsClassifier(n_neighbors=k, weights=weights)
knn_model.fit(training_attributes, training_classes)

# Predict the validation samples
Y_predicted = knn_model.predict(validation_data)

## Metrics
# Accuracy of the predictions
accuracy = accuracy_score(validation_classes, Y_predicted)
print(f'Accuracy: {accuracy}')

# Weighted F1 score
weighted_f1 = f1_score(validation_classes, Y_predicted, average='weighted')
print(f"F1 Score: {weighted_f1}")

# Per-class precision, recall, f1-score and support
print(classification_report(validation_classes, Y_predicted))

# Confusion matrix
print(confusion_matrix(validation_classes, Y_predicted))



Accuracy: 0.647887323943662
F1 Score: 0.7863247863247863
              precision    recall  f1-score   support

           0       1.00      0.65      0.79       284
           1       0.00      0.00      0.00         0

    accuracy                           0.65       284
   macro avg       0.50      0.32      0.39       284
weighted avg       1.00      0.65      0.79       284

[[184 100]
 [  0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Test 3
- `Neighbors`: 5
- `Weights`: distance
- `Data normalization`: No
- `All features`: No

In [93]:
# Experiment settings
k = 5
weights = 'distance'

# Training and validation splits used by this experiment
training_attributes, training_classes = X_attributes_less_attributes, Y_class
validation_data, validation_classes = X_attributes_test_less_attributes, Y_class_test

# Build and train the model
knn_model = KNeighborsClassifier(n_neighbors=k, weights=weights)
knn_model.fit(training_attributes, training_classes)

# Predict the validation samples
Y_predicted = knn_model.predict(validation_data)

## Metrics
# Accuracy of the predictions
accuracy = accuracy_score(validation_classes, Y_predicted)
print(f'Accuracy: {accuracy}')

# Weighted F1 score
weighted_f1 = f1_score(validation_classes, Y_predicted, average='weighted')
print(f"F1 Score: {weighted_f1}")

# Per-class precision, recall, f1-score and support
print(classification_report(validation_classes, Y_predicted))

# Confusion matrix
print(confusion_matrix(validation_classes, Y_predicted))



Accuracy: 0.6690140845070423
F1 Score: 0.8016877637130801
              precision    recall  f1-score   support

           0       1.00      0.67      0.80       284
           1       0.00      0.00      0.00         0

    accuracy                           0.67       284
   macro avg       0.50      0.33      0.40       284
weighted avg       1.00      0.67      0.80       284

[[190  94]
 [  0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Test 4
- `Neighbors`: 5
- `Weights`: uniform
- `Data normalization`: Yes
- `All features`: No

In [96]:
# Experiment settings
k = 5
weights = 'uniform'

# Training and validation splits used by this experiment
training_attributes, training_classes = X_attributes_scaled_less_attributes, Y_class
validation_data, validation_classes = X_attributes_test_scaled_less_attributes, Y_class_test

# Build and train the model
knn_model = KNeighborsClassifier(n_neighbors=k, weights=weights)
knn_model.fit(training_attributes, training_classes)

# Predict the validation samples
Y_predicted = knn_model.predict(validation_data)

## Metrics
# Accuracy of the predictions
accuracy = accuracy_score(validation_classes, Y_predicted)
print(f'Accuracy: {accuracy}')

# Weighted F1 score
weighted_f1 = f1_score(validation_classes, Y_predicted, average='weighted')
print(f"F1 Score: {weighted_f1}")

# Per-class precision, recall, f1-score and support
print(classification_report(validation_classes, Y_predicted))

# Confusion matrix
print(confusion_matrix(validation_classes, Y_predicted))



Accuracy: 0.6056338028169014
F1 Score: 0.7543859649122807
              precision    recall  f1-score   support

           0       1.00      0.61      0.75       284
           1       0.00      0.00      0.00         0

    accuracy                           0.61       284
   macro avg       0.50      0.30      0.38       284
weighted avg       1.00      0.61      0.75       284

[[172 112]
 [  0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### ---- Finding the best K ----

In [22]:
# Finding the best K value
# Candidate K values: range(1, 20) yields 1 through 19
k_values = list(range(1, 20))

# Mean cross-validated accuracy per candidate, in the same order as k_values
cv_scores = []

for k in k_values:
    # Creating the KNN model
    knn_model = KNeighborsClassifier(n_neighbors=k, weights='uniform')

    # Doing 10-fold cross validation on the training data
    scores = cross_val_score(knn_model, X_attributes, Y_class, cv=10, scoring='accuracy')

    # Saving the mean of the scores
    cv_scores.append(np.mean(scores))

# Pick the K with the highest mean accuracy. The index must come from
# cv_scores: taking argmax of k_values itself always selects the largest
# candidate, regardless of the scores.
# NOTE(review): the tests further down hard-code k = 19, which is what the
# previous argmax(k_values) selection printed; re-run this cell and update
# them if the optimum differs.
optimal_k = k_values[int(np.argmax(cv_scores))]
print(f"The optimal number of neighbors is: {optimal_k}")

The optimal number of neighbors is: 19


### Test 5
- `Neighbors`: 19
- `Weights`: uniform
- `Data normalization`: No
- `All features`: Yes

In [54]:
# Experiment settings
k = 19
weights = 'uniform'

# Training and validation splits used by this experiment
training_attributes, training_classes = X_attributes, Y_class
validation_data, validation_classes = X_attributes_test, Y_class_test

# Build and train the model
knn_model = KNeighborsClassifier(n_neighbors=k, weights=weights)
knn_model.fit(training_attributes, training_classes)

# Predict the validation samples
Y_predicted = knn_model.predict(validation_data)

## Metrics
# Accuracy of the predictions
accuracy = accuracy_score(validation_classes, Y_predicted)
print(f'Accuracy: {accuracy}')

# Weighted F1 score
weighted_f1 = f1_score(validation_classes, Y_predicted, average='weighted')
print(f"F1 Score: {weighted_f1}")

# Per-class precision, recall, f1-score and support
print(classification_report(validation_classes, Y_predicted))

# Confusion matrix
print(confusion_matrix(validation_classes, Y_predicted))



Accuracy: 0.6654929577464789
F1 Score: 0.7991543340380549
              precision    recall  f1-score   support

           0       1.00      0.67      0.80       284
           1       0.00      0.00      0.00         0

    accuracy                           0.67       284
   macro avg       0.50      0.33      0.40       284
weighted avg       1.00      0.67      0.80       284

[[189  95]
 [  0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Test 6
- `Neighbors`: 19
- `Weights`: uniform
- `Data normalization`: Yes
- `All features`: Yes

In [113]:
# Experiment settings
k = 19
weights = 'uniform'

# Training and validation splits used by this experiment
training_attributes, training_classes = X_attributes_scaled, Y_class
validation_data, validation_classes = X_attributes_test_scaled, Y_class_test

# Build and train the model
knn_model = KNeighborsClassifier(n_neighbors=k, weights=weights)
knn_model.fit(training_attributes, training_classes)

# Predict the validation samples
Y_predicted = knn_model.predict(validation_data)

## Metrics
# Accuracy of the predictions
accuracy = accuracy_score(validation_classes, Y_predicted)
print(f'Accuracy: {accuracy}')

# Weighted F1 score
weighted_f1 = f1_score(validation_classes, Y_predicted, average='weighted')
print(f"F1 Score: {weighted_f1}")

# Per-class precision, recall, f1-score and support
print(classification_report(validation_classes, Y_predicted))

# Confusion matrix
print(confusion_matrix(validation_classes, Y_predicted))



Accuracy: 0.6619718309859155
F1 Score: 0.7966101694915254
              precision    recall  f1-score   support

           0       1.00      0.66      0.80       284
           1       0.00      0.00      0.00         0

    accuracy                           0.66       284
   macro avg       0.50      0.33      0.40       284
weighted avg       1.00      0.66      0.80       284

[[188  96]
 [  0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Test 7
- `Neighbors`: 19
- `Weights`: uniform
- `Data normalization`: No
- `All features`: No

In [112]:
# Experiment settings
k = 19
weights = 'uniform'

# Training and validation splits used by this experiment
training_attributes, training_classes = X_attributes_less_attributes, Y_class
validation_data, validation_classes = X_attributes_test_less_attributes, Y_class_test

# Build and train the model
knn_model = KNeighborsClassifier(n_neighbors=k, weights=weights)
knn_model.fit(training_attributes, training_classes)

# Predict the validation samples
Y_predicted = knn_model.predict(validation_data)

## Metrics
# Accuracy of the predictions
accuracy = accuracy_score(validation_classes, Y_predicted)
print(f'Accuracy: {accuracy}')

# Weighted F1 score
weighted_f1 = f1_score(validation_classes, Y_predicted, average='weighted')
print(f"F1 Score: {weighted_f1}")

# Per-class precision, recall, f1-score and support
print(classification_report(validation_classes, Y_predicted))

# Confusion matrix
print(confusion_matrix(validation_classes, Y_predicted))



Accuracy: 0.6830985915492958
F1 Score: 0.8117154811715481
              precision    recall  f1-score   support

           0       1.00      0.68      0.81       284
           1       0.00      0.00      0.00         0

    accuracy                           0.68       284
   macro avg       0.50      0.34      0.41       284
weighted avg       1.00      0.68      0.81       284

[[194  90]
 [  0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Test 8
- `Neighbors`: 19
- `Weights`: uniform
- `Data normalization`: Yes
- `All features`: No

In [111]:
# Experiment settings
k = 19
weights = 'uniform'

# Training and validation splits used by this experiment
training_attributes, training_classes = X_attributes_scaled_less_attributes, Y_class
validation_data, validation_classes = X_attributes_test_scaled_less_attributes, Y_class_test

# Build and train the model
knn_model = KNeighborsClassifier(n_neighbors=k, weights=weights)
knn_model.fit(training_attributes, training_classes)

# Predict the validation samples
Y_predicted = knn_model.predict(validation_data)

## Metrics
# Accuracy of the predictions
accuracy = accuracy_score(validation_classes, Y_predicted)
print(f'Accuracy: {accuracy}')

# Weighted F1 score
weighted_f1 = f1_score(validation_classes, Y_predicted, average='weighted')
print(f"F1 Score: {weighted_f1}")

# Per-class precision, recall, f1-score and support
print(classification_report(validation_classes, Y_predicted))

# Confusion matrix
print(confusion_matrix(validation_classes, Y_predicted))



Accuracy: 0.6584507042253521
F1 Score: 0.7940552016985138
              precision    recall  f1-score   support

           0       1.00      0.66      0.79       284
           1       0.00      0.00      0.00         0

    accuracy                           0.66       284
   macro avg       0.50      0.33      0.40       284
weighted avg       1.00      0.66      0.79       284

[[187  97]
 [  0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
