In [12]:
import pandas as pd
import numpy as np

# Column names for the dataset: 'ID' and 'Diagnosis' first, then the same ten
# measurements repeated for each of the 'Mean', 'SE' and 'Worst' groups
feature_names = [
    'Radius', 'Texture', 'Perimeter', 'Area', 'Smoothness',
    'Compactness', 'Concavity', 'Concave_Points', 'Symmetry', 'Fractal_Dimension',
]
column_names = ['ID', 'Diagnosis'] + [
    f'{feature}_{stat}'
    for stat in ('Mean', 'SE', 'Worst')
    for feature in feature_names
]

# Load the dataset; no header row is read from the file, so the column
# names are supplied explicitly
cancer_data = pd.read_csv(
    './wdbc.data',
    names=column_names,
    header=None,
)

In [15]:
# Split the dataset: y is the 'Diagnosis' column, X is every column except
# 'ID' and 'Diagnosis'
y = cancer_data['Diagnosis']
X = cancer_data.drop(['ID', 'Diagnosis'], axis=1)


In [17]:
# Display X and y together as a tuple (this is the cell's output)
X,y

(     Radius_Mean  Texture_Mean  Perimeter_Mean  Area_Mean  Smoothness_Mean  \
 0          17.99         10.38          122.80     1001.0          0.11840   
 1          20.57         17.77          132.90     1326.0          0.08474   
 2          19.69         21.25          130.00     1203.0          0.10960   
 3          11.42         20.38           77.58      386.1          0.14250   
 4          20.29         14.34          135.10     1297.0          0.10030   
 ..           ...           ...             ...        ...              ...   
 564        21.56         22.39          142.00     1479.0          0.11100   
 565        20.13         28.25          131.20     1261.0          0.09780   
 566        16.60         28.08          108.30      858.1          0.08455   
 567        20.60         29.33          140.10     1265.0          0.11780   
 568         7.76         24.54           47.92      181.0          0.05263   
 
      Compactness_Mean  Concavity_Mean  Concave_Po

In [18]:
# Encode the diagnosis labels as integers: 'M' -> 0 and 'B' -> 1.
# The mapping is applied to the raw 'Diagnosis' column rather than to y itself,
# so re-running this cell gives the same result. Mapping an already-encoded y
# would turn every value into NaN, because 0 and 1 are not keys of the mapping.
y = cancer_data['Diagnosis'].map({'M': 0, 'B': 1})


In [20]:
def scott_bins(data):
    """Return the number of equal-width bins suggested by Scott's rule.

    The bin width is ``3.5 * np.std(data) / np.cbrt(len(data))`` and the bin
    count is the data range divided by that width, rounded up.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric values (the notebook passes DataFrame columns).

    Returns
    -------
    int
        The number of bins, always at least 1.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    n = len(data)
    if n == 0:
        raise ValueError("scott_bins requires at least one observation")

    data_range = np.max(data) - np.min(data)
    bin_width = 3.5 * np.std(data) / np.cbrt(n)

    # Constant data (or a single value) has no spread: the bin width is zero and
    # the division below would yield NaN/inf, which int() cannot convert.
    # One bin is enough to hold all the values.
    if data_range == 0 or bin_width == 0:
        return 1

    # Clamp to 1 so the result is always a valid positive bin count.
    return max(1, int(np.ceil(data_range / bin_width)))

# Discretize each float64/int64 column of X: compute its bin count with
# Scott's rule, then replace its values by integer bin indices
numeric_columns = [
    name for name in X.columns
    if X[name].dtype in (np.float64, np.int64)
]
for column in numeric_columns:
    num_bins = scott_bins(X[column])
    X[column] = pd.cut(X[column], bins=num_bins, labels=False)

In [22]:
# Append y to X as an extra column and order the rows by the 'Diagnosis' column
cancer_data_processed = (
    pd.concat([X, y], axis='columns')
    .sort_values('Diagnosis')
)

# Write the result to a CSV file without the index
cancer_data_processed.to_csv('./cancer.csv', index=False)
