# Breast cancer dataset

## Download and import required libraries

In [1]:
!pip3 install scikit-learn
!pip3 install pandas 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [2]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


## Load the dataset and test the data

In [3]:
dataset = pd.read_csv('./dataset/breast-cancer-diagnostic.shuf.lrn.csv')
test_data = pd.read_csv('./dataset/breast-cancer-diagnostic.shuf.tes.csv')
test_data_class = pd.read_csv('./dataset/breast-cancer-diagnostic.shuf.sol.ex.csv')

In [4]:
# Check the values were loaded correctly
print(dataset.head())

         ID  class  radiusMean   textureMean   perimeterMean   areaMean  \
0    886452   True       13.96         17.05           91.43      602.4   
1  84348301   True       11.42         20.38           77.58      386.1   
2   9012795   True       21.37         15.10          141.30     1386.0   
3    894326   True       18.22         18.87          118.70     1027.0   
4    867387  False       15.71         13.93          102.00      761.7   

    smoothnessMean   compactnessMean   concavityMean   concavePointsMean  ...  \
0          0.10960           0.12790         0.09789             0.05246  ...   
1          0.14250           0.28390         0.24140             0.10520  ...   
2          0.10010           0.15150         0.19320             0.12550  ...   
3          0.09746           0.11170         0.11300             0.07950  ...   
4          0.09462           0.09462         0.07135             0.05933  ...   

    radiusWorst   textureWorst   perimeterWorst   areaWorst   

In [5]:
## Check for missing values
print(dataset.isnull().sum())

ID                         0
class                      0
radiusMean                 0
 textureMean               0
 perimeterMean             0
 areaMean                  0
 smoothnessMean            0
 compactnessMean           0
 concavityMean             0
 concavePointsMean         0
 symmetryMean              0
 fractalDimensionMean      0
 radiusStdErr              0
 textureStdErr             0
 perimeterStdErr           0
 areaStdErr                0
 smoothnessStdErr          0
 compactnessStdErr         0
 concavityStdErr           0
 concavePointsStdErr       0
 symmetryStdErr            0
 fractalDimensionStdErr    0
 radiusWorst               0
 textureWorst              0
 perimeterWorst            0
 areaWorst                 0
 smoothnessWorst           0
 compactnessWorst          0
 concavityWorst            0
 concavePointsWorst        0
 symmetryWorst             0
 fractalDimensionWorst     0
dtype: int64


## Exploration and Preprocessing of the data

### Exploration

This dataset has information 

### Processing

- Missing values / Attributes deletion
    - In this dataset we have no missing values, so no imputation or deletion of records is required.
    - We will drop the `ID` column as it irrelevant for the analysis.

- Encoding
    - The only column that can be encoded is the `class` column. We will change the values True and False to 1 and 0 respectively.

- Scaling
    - We will scale the data using the `StandardScaler` from `sklearn.preprocessing`. In order to make the data more suitable for the models, the data will be scaled to have a mean of 0 and a standard deviation of 1.

In [6]:
## Remove the ID column
dataset = dataset.drop('ID', axis=1)
test_data = test_data.drop('ID', axis=1)

In [7]:
## Transform the class to binary values
dataset['class'] = dataset['class'].astype(int)
test_data_class['class'] = test_data_class['class'].astype(int)

In [8]:
## Scale the data (standardize)
scaler = StandardScaler()

columns_to_scale = dataset.columns.difference(['class'])
dataset[columns_to_scale] = scaler.fit_transform(dataset[columns_to_scale])
test_data[columns_to_scale] = scaler.transform(test_data[columns_to_scale])

In [9]:
print(dataset.head)

<bound method NDFrame.head of      class  radiusMean   textureMean   perimeterMean   areaMean  \
0        1    0.003895     -0.544619        0.028025  -0.103155   
1        1   -0.725533      0.234998       -0.548583  -0.740126   
2        1    2.131870     -1.001151        2.104230   2.204431   
3        1    1.227265     -0.118522        1.163339   1.147230   
4        0    0.506453     -1.275071        0.468079   0.365960   
..     ...         ...           ...             ...        ...   
280      0    0.167585     -0.008486        0.145428   0.065585   
281      0   -0.131078      0.684507       -0.160153  -0.225071   
282      0   -0.961017     -0.099792       -0.894131  -0.857037   
283      0   -0.102361      0.576812       -0.149745  -0.225071   
284      0   -0.492920      0.043020       -0.508616  -0.519557   

      smoothnessMean   compactnessMean   concavityMean   concavePointsMean  \
0           0.883391          0.429308        0.164146            0.137875   
1        

In [10]:
## Define valiables to split atttributes and class; X (attributes) and Y (class)
X_attributes = dataset.drop('class', axis=1)
Y_class = dataset['class']


X_attributes_test = test_data
Y_class_test = test_data_class['class']

In [118]:
# Train a Random Forest model to get which features are more important
model = RandomForestClassifier()
model.fit(X_attributes, Y_class)

most_important_attributes = pd.DataFrame(
                            model.feature_importances_,
                            index = X_attributes.columns,
                            columns=['importance']
                        ).sort_values('importance', ascending=False)

print(most_important_attributes)


## Get a list of the most important features
most_important_attributes_list = most_important_attributes[most_important_attributes['importance'] > 0.05].index.tolist()
print(most_important_attributes_list)

                         importance
 concavePointsWorst        0.221639
 concavePointsMean         0.127309
 radiusWorst               0.123209
 perimeterWorst            0.077350
 areaWorst                 0.068081
 areaStdErr                0.052284
 concavityWorst            0.046769
 concavityMean             0.044298
 perimeterMean             0.034006
 areaMean                  0.033209
radiusMean                 0.023676
 smoothnessWorst           0.017590
 perimeterStdErr           0.014858
 textureWorst              0.014783
 symmetryWorst             0.014420
 radiusStdErr              0.012000
 compactnessWorst          0.010111
 textureMean               0.009114
 compactnessMean           0.008812
 smoothnessMean            0.008792
 concavePointsStdErr       0.005814
 concavityStdErr           0.005754
 fractalDimensionWorst     0.004841
 fractalDimensionMean      0.003917
 textureStdErr             0.003540
 symmetryStdErr            0.003530
 compactnessStdErr         0

In [None]:
# Remove the less important features
X_attributes = X_attributes[most_important_attributes_list]
X_attributes_test = X_attributes_test[most_important_attributes_list]

## Models training and evaluation

### Gradient Boosting Classifier

In [67]:
# Crear una instancia del modelo Random Forest
random_forest_classifier = RandomForestClassifier(n_estimators=1000, random_state=42)

# Entrenar el modelo
random_forest_classifier.fit(X_attributes, Y_class)

In [122]:
from sklearn.metrics import f1_score

y_pred = random_forest_classifier.predict(X_attributes_test)



print("Distribución de clases en entrenamiento:", Y_class.value_counts())
print("Distribución de clases en prueba:", Y_class_test.value_counts())


# Cálculo del F1-score
f1 = f1_score(Y_class_test, y_pred, average='weighted')

## calculate true positives
tp = sum((Y_class_test == 0) & (y_pred == 0))

## calculate false positives
fp = sum((Y_class_test == 1) & (y_pred == 0))

## calculate false negatives
fn = sum((Y_class_test == 0) & (y_pred == 1))

f1_score = 2 * tp / (2 * tp + fp + fn)

print(f"F1 Score: {f1}")

print(f"F1 Score: {f1_score}")

print(f"True Positives: {tp}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")



Distribución de clases en entrenamiento: class
0    188
1     97
Name: count, dtype: int64
Distribución de clases en prueba: class
0    284
Name: count, dtype: int64
F1 Score: 0.7811158798283262
F1 Score: 0.7811158798283262
True Positives: 182
False Positives: 0
False Negatives: 102
