In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [10]:
# Load the wine dataset. The file is read with header=None, so the column
# names are assigned explicitly right after loading.
WINE_COLUMNS = [
    'class',
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
    'proanthocyanins',
    'color_intensity',
    'hue',
    'od280/od315_of_diluted_wines',
    'proline',
]
df = pd.read_csv('./wine.data', header=None)
df.columns = WINE_COLUMNS

# Preview the first rows of the labelled frame
df.head()



Unnamed: 0,class,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [5]:
def display_caracteristics(df_encoded, df):
    """
    Print the characteristics of the dataset.

    Three lines are written to stdout: the number of features, the number
    of instances and the number of classes.

    Parameters:
    df_encoded (pd.DataFrame): Frame whose shape is reported. The feature
        count is its column count minus one (presumably the label column is
        the one excluded - confirm against the caller).
    df (pd.DataFrame): Frame with a 'class' column; its distinct values are
        counted to obtain the number of classes.

    Returns:
    None
    """
    shape = df_encoded.shape
    feature_count = shape[1] - 1
    instance_count = shape[0]
    class_count = df['class'].nunique()

    # Build the whole report first, then emit it one line per entry.
    report = [
        f"Nombre de features: {feature_count}",
        f"Nombre d'instances: {instance_count}",
        f"Nombre de classes: {class_count}",
    ]
    print(*report, sep="\n")

### Scott's rule

In [15]:
def scott_bin_width(data):
    """
    Compute a histogram bin width with Scott's rule.

    The width is 3.5 * sigma / n^(1/3), where sigma is the standard
    deviation returned by np.std (default settings) and n is the number of
    samples.

    Parameters:
    data (array-like): The samples for which the bin width is computed.

    Returns:
    float: The bin width given by Scott's rule.
    """
    sample_count = len(data)
    spread = np.std(data)
    return 3.5 * spread / sample_count ** (1 / 3)

In [16]:


# Quantize each feature using Scott's rule: every value is replaced by the
# index of the fixed-width bin (counted from the column minimum) it falls in.
quantized_data = df.copy()
for column in df.columns[1:]:  # Exclude the 'class' column
    feature_values = df[column]
    feature_min = feature_values.min()
    bin_width = scott_bin_width(feature_values)
    # Bin count over the column's range; computed for reference only, the
    # assignment below does not use it.
    num_bins = int((feature_values.max() - feature_min) / bin_width)
    bin_index = np.floor((feature_values - feature_min) / bin_width)
    quantized_data[column] = bin_index.astype(int)

### Quantization based on the number of bits

In [11]:
def quantize_to_bits(data, b):
    """
    Quantize the numeric columns of a dataset to integers using b bits.

    Every integer or floating-point column is min-max scaled onto the range
    [0, 2**b - 1] and rounded to integers. All other columns are copied
    unchanged.

    Parameters:
    - data: pd.DataFrame, the dataset to quantize
    - b: int, the number of bits to use for quantization

    Returns:
    - pd.DataFrame, the quantized dataset, with the same columns and index
      as data

    Notes:
    - A constant column (max == min) has no range to scale, so it is mapped
      to 0 instead of dividing by zero.
    - Columns containing NaN are not handled: the final integer cast fails
      for them.
    """
    # Number of representable levels with b bits
    num_levels = 2 ** b

    # Start from the input's index so every assigned column lines up with it
    quantized_data = pd.DataFrame(index=data.index)

    for column in data.columns:
        values = data[column]

        # Accept any integer/float width (int32, float32, ...), not only the
        # 64-bit dtypes; booleans and non-numeric columns are passed through.
        is_numeric = (
            pd.api.types.is_integer_dtype(values)
            or pd.api.types.is_float_dtype(values)
        )
        if not is_numeric:
            quantized_data[column] = values
            continue

        col_min = values.min()
        col_span = values.max() - col_min

        if col_span == 0:
            # Constant column: avoid 0/0 -> NaN, which cannot be cast to int
            quantized_data[column] = pd.Series(0, index=data.index, dtype=int)
            continue

        # Min-max scale onto [0, num_levels - 1], then round to integers
        scaled = (values - col_min) / col_span * (num_levels - 1)
        quantized_data[column] = scaled.round().astype(int)

    return quantized_data


In [26]:
# Split the frame by position: column 0 holds the label, the rest are features
X, y = df.iloc[:, 1:], df.iloc[:, 0]

In [27]:
# Preview the first rows of the feature columns
X.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [28]:
# Preview the first class labels
y.head()

0    1
1    1
2    1
3    1
4    1
Name: class, dtype: int64

In [29]:
# Bit depth used for the quantized features
QUANTIZATION_BITS = 11
X_quantized = quantize_to_bits(X, QUANTIZATION_BITS)
X_quantized.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,1724,392,1171,528,1268,1285,1175,579,1214,762,932,1987,1149
1,1169,421,854,63,668,1179,1045,502,562,541,949,1597,1127
2,1147,655,1434,844,690,1285,1252,657,1550,768,915,1425,1324
3,1799,490,1248,654,957,2026,1360,425,1143,1139,632,1635,1755
4,1190,748,1653,1097,1068,1285,1015,1004,910,531,932,1245,667


In [30]:
# Put the quantized features and the class labels side by side,
# with the label as the last column
dataset = pd.concat((X_quantized, y), axis='columns')

In [31]:
# Shift the class labels down by one so that they start at 0.
# The new labels are derived from the untouched `y` rather than from
# dataset['class'] itself, so re-running this cell cannot decrement the
# labels a second time.
dataset['class'] = y - 1
dataset['class']


0      0
1      0
2      0
3      0
4      0
      ..
173    2
174    2
175    2
176    2
177    2
Name: class, Length: 178, dtype: int64

In [32]:


# Export the dataset (quantized features + class column) to a CSV file.
# index=False keeps the row index out of the written file.
dataset.to_csv('./wine.csv', index=False)
