In [1]:
import pandas as pd
import numpy as np

# Lire le dataset adult.data et le mettre dans un DataFrame

# Définir les noms des colonnes
column_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
                "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", 
                "hours-per-week", "native-country", "income"]

# Lire le fichier CSV
df = pd.read_csv('adult.data', header=None, names=column_names, na_values=' ?')

# Afficher les premières lignes du DataFrame
print(df.head())


   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

In [2]:
# Convertir la colonne 'income' en valeurs binaires
df['income'] = df['income'].apply(lambda x: 0 if x == ' <=50K' else 1)

# Afficher les premières lignes du DataFrame pour vérifier la conversion
print(df.head())


   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States       

In [3]:
# Séparer le dataset en X et y où X est l'ensemble des features et y est la classification
X = df.drop('income', axis=1)
y = df['income']

# Afficher les premières lignes de X et y pour vérifier la séparation
print(X.head())
print(y.head())


# One hot encoding de toutes les variables de X
X_encoded = pd.get_dummies(X)

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  
0          2174             0              40   United-States  
1           

In [4]:
def quantize_to_bits(data, b):
    """
    Quantize the dataset to integers using b bits.

    Parameters:
    - data: pd.DataFrame, the dataset to quantize
    - b: int, the number of bits to use for quantization

    Returns:
    - pd.DataFrame, the quantized dataset
    """
    # Calculate the number of levels
    num_levels = 2 ** b

    # Initialize a DataFrame to store the quantized data
    quantized_data = pd.DataFrame()

    for column in data.columns:
        if data[column].dtype in [np.float64, np.int64]:  # Check if the column is numeric
            # Find the min and max of the column
            col_min = data[column].min()
            col_max = data[column].max()

            # Quantize the column
            quantized_data[column] = ((data[column] - col_min) / (col_max - col_min) * (num_levels - 1)).round().astype(int)
        else:
            # If the column is not numeric, copy it as is
            quantized_data[column] = data[column]

    return quantized_data



In [5]:
X_encoded = quantize_to_bits(X_encoded, 11)

  quantized_data[column] = data[column]
  quantized_data[column] = data[column]
  quantized_data[column] = data[column]
  quantized_data[column] = data[column]
  quantized_data[column] = data[column]


In [6]:
# Afficher les premières lignes du DataFrame quantizé
print(X_encoded.head())

    age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0   617      91           1638            45             0             815   
1   925      99           1638             0             0             251   
2   589     283           1092             0             0             815   
3  1009     309            819             0             0             815   
4   308     453           1638             0             0             815   

   workclass_ Federal-gov  workclass_ Local-gov  workclass_ Never-worked  \
0                       0                     0                        0   
1                       0                     0                        0   
2                       0                     0                        0   
3                       0                     0                        0   
4                       0                     0                        0   

   workclass_ Private  ...  native-country_ Portugal  \
0                 

In [28]:


# Quantization en suivant la règle de Scott
def quantize_column(column):
    bin_width = 3.49 * np.std(column) * (len(column) ** (-1/3))
    bins = np.arange(min(column), max(column) + bin_width, bin_width)
    return np.digitize(column, bins)

# Appliquer la quantization à toutes les colonnes numériques de X
for col in X_encoded.select_dtypes(include=[np.number]).columns:
    X_encoded[col] = quantize_column(X_encoded[col])

# Afficher les premières lignes du DataFrame quantizé
print(X_encoded.head())


   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  
0          2174             0              40   United-States  
1           

In [7]:
# Combiner X_encoded et y pour créer un DataFrame complet
df_combined = pd.concat([X_encoded, y], axis=1)

# Afficher les premières lignes du DataFrame combiné
print(df_combined.head())


    age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0   617      91           1638            45             0             815   
1   925      99           1638             0             0             251   
2   589     283           1092             0             0             815   
3  1009     309            819             0             0             815   
4   308     453           1638             0             0             815   

   workclass_ Federal-gov  workclass_ Local-gov  workclass_ Never-worked  \
0                       0                     0                        0   
1                       0                     0                        0   
2                       0                     0                        0   
3                       0                     0                        0   
4                       0                     0                        0   

   workclass_ Private  ...  native-country_ Puerto-Rico  \
0              

In [8]:
def display_caracteristics(df_encoded, df):
    # Afficher les caractéristiques du dataset
    num_features = df_encoded.shape[1]
    num_instances = df_encoded.shape[0]
    num_classes = df['income'].nunique()

    print(f"Nombre de features: {num_features}")
    print(f"Nombre d'instances: {num_instances}")
    print(f"Nombre de classes: {num_classes}")

# Appeler la fonction avec les DataFrames appropriés
display_caracteristics(df_combined, df)


Nombre de features: 106
Nombre d'instances: 32561
Nombre de classes: 2


In [9]:
# Exporter le DataFrame encodé et quantifié en fichier CSV
df_combined.to_csv('adult.csv', index=False)


# Encodage ordinaire puis quantization


In [31]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

def quantize_adult_dataframe(df):
    """
    Prend un DataFrame du dataset Adult (déjà chargé avec les noms de colonnes)
    Retourne (X, y) où :
      - X est le DataFrame des features quantifiées sur 11 bits
      - y est la série binaire des classes (0 ou 1)
    """
    df = df.copy()
    
    # Supprimer les lignes contenant des valeurs manquantes
    df.dropna(inplace=True)

    # Colonnes catégorielles à encoder
    categorical_cols = [
        "workclass", "education", "marital-status", "occupation",
        "relationship", "race", "sex", "native-country"
    ]

    # Colonnes numériques (hors income)
    numeric_cols = [col for col in df.columns if col not in categorical_cols + ["income"]]

    # Encoder ordinal les catégories
    encoder = OrdinalEncoder()
    df[categorical_cols] = encoder.fit_transform(df[categorical_cols])

    # Quantization des catégories sur 11 bits
    for col in categorical_cols:
        max_val = df[col].max()
        df[col] = (df[col] / max_val * 2047).round().astype(int)

    # Quantization des colonnes numériques sur 11 bits
    scaler = MinMaxScaler(feature_range=(0, 2047))
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols]).round().astype(int)

    # Identifier et traiter les colonnes binaires (après encodage)
    all_feature_cols = categorical_cols + numeric_cols
    for col in all_feature_cols:
        unique_vals = df[col].nunique()
        if unique_vals == 2:
            mu = df[col].mean()
            print(mu)
            stretched_1 = round(mu)
            df[col] = df[col].apply(lambda x: 0 if x == 0 else stretched_1)

    # Séparer features et label
    y = df["income"].apply(lambda x: 1 if ">50K" in str(x) else 0).astype(int)
    X = df.drop(columns=["income"])

    return X, y

def display_caracteristics(df):
    # Afficher les caractéristiques du dataset
    num_features = df.shape[1] - 1
    num_instances = df.shape[0]
    num_classes = df['income'].nunique()

    print(f"Nombre de features: {num_features}")
    print(f"Nombre d'instances: {num_instances}")
    print(f"Nombre de classes: {num_classes}")

In [32]:
# Chargement du dataset depuis 'adult.data'
column_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
                "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", 
                "hours-per-week", "native-country", "income"]

df = pd.read_csv("adult.data", header=None, names=column_names, na_values=" ?", skipinitialspace=True)

# Quantization
X, y = quantize_adult_dataframe(df)

1369.8636405515801


In [33]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,617,1791,91,1228,1638,1365,146,409,2047,1370,45,0,815,1947
1,925,1535,99,1228,1638,682,585,0,2047,1370,0,0,251,1947
2,589,1024,283,1501,1092,0,877,409,2047,1370,0,0,815,1947
3,1009,1024,309,136,819,682,877,0,1024,1370,0,0,815,1947
4,308,1024,453,1228,1638,682,1462,2047,1024,0,0,0,815,250


In [34]:
df_combined = pd.concat([X, y.rename("income")], axis=1)

In [35]:
display_caracteristics(df_combined)

Nombre de features: 14
Nombre d'instances: 32561
Nombre de classes: 2


In [37]:
df_combined.to_csv("adult.csv", index=False)