In [11]:
import pandas as pd
import numpy as np


In [12]:
# Compute the number of histogram bins given by Scott's rule
def scott_bins(data):
    """Return the number of equal-width bins suggested by Scott's rule.

    The bin width is ``3.5 * std / n ** (1/3)``, where ``std`` is ``np.std``
    with its default ``ddof=0``. The bin count is the data range divided by
    that width, rounded up.

    Parameters:
    - data: sequence, np.ndarray or pd.Series of numeric values

    Returns:
    - int, the number of bins (always >= 1)

    Raises:
    - ValueError: if ``data`` is empty
    """
    n = len(data)
    if n == 0:
        raise ValueError("scott_bins requires at least one observation")

    std_dev = np.std(data)
    data_range = np.max(data) - np.min(data)

    # Constant data has no spread: the width would be 0 and the ratio 0/0 is
    # NaN, which int() cannot convert. A single bin holds every value.
    if std_dev == 0 or data_range == 0:
        return 1

    bin_width = 3.5 * std_dev / np.cbrt(n)
    # Never report fewer than one bin, whatever the rounding produced.
    return max(1, int(np.ceil(data_range / bin_width)))

In [13]:

# Column names: the identifier and the label, followed by the ten features
# repeated once per statistic suffix (Mean, SE, Worst), in that order
column_names = ['ID', 'Diagnosis'] + [
    f'{feature}_{statistic}'
    for statistic in ('Mean', 'SE', 'Worst')
    for feature in (
        'Radius', 'Texture', 'Perimeter', 'Area', 'Smoothness',
        'Compactness', 'Concavity', 'Concave_Points', 'Symmetry',
        'Fractal_Dimension',
    )
]

# Load the dataset; the file has no header row, so column names are supplied explicitly
cancer_data = pd.read_csv(
    './wdbc.data',
    names=column_names,
    header=None,
)

In [14]:
# Features: every column except the identifier and the label
X = cancer_data.drop(['ID', 'Diagnosis'], axis=1)

# Target: the label column with 'M' replaced by 0 and 'B' by 1
# (any other label would become NaN)
y = cancer_data['Diagnosis'].map({'M': 0, 'B': 1})


In [15]:
# Inspect the feature matrix and the encoded target together
(X, y)

(     Radius_Mean  Texture_Mean  Perimeter_Mean  Area_Mean  Smoothness_Mean  \
 0          17.99         10.38          122.80     1001.0          0.11840   
 1          20.57         17.77          132.90     1326.0          0.08474   
 2          19.69         21.25          130.00     1203.0          0.10960   
 3          11.42         20.38           77.58      386.1          0.14250   
 4          20.29         14.34          135.10     1297.0          0.10030   
 ..           ...           ...             ...        ...              ...   
 564        21.56         22.39          142.00     1479.0          0.11100   
 565        20.13         28.25          131.20     1261.0          0.09780   
 566        16.60         28.08          108.30      858.1          0.08455   
 567        20.60         29.33          140.10     1265.0          0.11780   
 568         7.76         24.54           47.92      181.0          0.05263   
 
      Compactness_Mean  Concavity_Mean  Concave_Po

### Scott's rule

In [20]:
# Apply Scott's rule and discretize every numeric column of X.
# The bins are written to a separate frame: overwriting X made this cell
# non-idempotent (re-running it re-binned already binned data) and silently
# changed the input of the quantization cell further down, which expects the
# raw measurements.
X_scott = X.copy()
for column in X.columns:
    feature = X[column]
    # Any integer or float dtype counts as numeric, not only float64/int64
    if pd.api.types.is_integer_dtype(feature) or pd.api.types.is_float_dtype(feature):
        num_bins = scott_bins(feature)
        # labels=False yields the integer bin index instead of an Interval
        X_scott[column] = pd.cut(feature, bins=num_bins, labels=False)

### Uniform quantization based on the number of bits

In [23]:
def quantize_to_bits(data, b):
    """
    Quantize the dataset to integers using b bits.

    Each integer or float column is rescaled linearly so that its minimum maps
    to 0 and its maximum to ``2 ** b - 1``, then rounded with ``Series.round``
    and cast to int. A column whose values are all equal has no range to
    rescale and is mapped to 0 everywhere. Every other column is copied
    unchanged.

    Missing values are not handled: a numeric column containing NaN makes the
    integer cast fail.

    Parameters:
    - data: pd.DataFrame, the dataset to quantize
    - b: int, the number of bits to use for quantization (must be >= 0)

    Returns:
    - pd.DataFrame, the quantized dataset, with the same index and column
      order as ``data``

    Raises:
    - ValueError: if ``b`` is negative
    """
    if b < 0:
        raise ValueError("b must be a non-negative number of bits")

    # Highest integer level representable with b bits
    max_level = 2 ** b - 1

    # Start from the input index so it survives even when data has no columns
    quantized_data = pd.DataFrame(index=data.index)

    for column in data.columns:
        series = data[column]
        is_numeric = (
            pd.api.types.is_integer_dtype(series)
            or pd.api.types.is_float_dtype(series)
        )
        if not is_numeric:
            # Non-numeric columns are copied as is
            quantized_data[column] = series
            continue

        col_min = series.min()
        col_range = series.max() - col_min

        if col_range == 0:
            # Constant column: dividing by the zero range would give NaN and
            # the integer cast would raise, so use the lowest level instead.
            quantized_data[column] = 0
        else:
            scaled = (series - col_min) / col_range * max_level
            quantized_data[column] = scaled.round().astype(int)

    return quantized_data



In [24]:
# Quantize the features to 11 bits (integer levels 0..2047) and preview the first rows
X_quantized = quantize_to_bits(X, b=11)
X_quantized.head()

Unnamed: 0,Radius_Mean,Texture_Mean,Perimeter_Mean,Area_Mean,Smoothness_Mean,Compactness_Mean,Concavity_Mean,Concave_Points_Mean,Symmetry_Mean,Fractal_Dimension_Mean,...,Radius_Worst,Texture_Worst,Perimeter_Worst,Area_Worst,Smoothness_Worst,Compactness_Worst,Concavity_Worst,Concave_Points_Worst,Symmetry_Worst,Fractal_Dimension_Worst
0,1067,46,1118,745,1215,1621,1439,1497,1405,1239,...,1271,290,1368,923,1231,1268,1164,1867,1225,857
1,1317,558,1261,1027,593,372,417,714,777,289,...,1242,621,1105,891,711,316,395,1308,478,456
2,1231,799,1219,920,1053,882,947,1301,1043,432,...,1139,737,1041,767,990,789,736,1709,826,437
3,430,739,478,211,1661,1661,1158,1070,1589,2047,...,508,790,494,192,1874,1666,1123,1811,2047,1584
4,1289,321,1292,1002,881,712,950,1061,774,382,...,1064,254,1038,699,895,353,654,1143,322,292


In [25]:
# Join the quantized features with the target column, then order the rows by
# the target value
cancer_data_processed = (
    pd.concat([X_quantized, y], axis=1)
    .sort_values(by='Diagnosis')
)

# Export to CSV without writing the row index
cancer_data_processed.to_csv('./cancer.csv', index=False)