In [26]:
import pandas as pd
import numpy as np

# Lire le dataset adult.data et le mettre dans un DataFrame

# Définir les noms des colonnes
column_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
                "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", 
                "hours-per-week", "native-country", "income"]

# Lire le fichier CSV
df = pd.read_csv('adult.data', header=None, names=column_names, na_values=' ?')

# Afficher les premières lignes du DataFrame
print(df.head())


   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

In [27]:
# Convertir la colonne 'income' en valeurs binaires
df['income'] = df['income'].apply(lambda x: 0 if x == ' <=50K' else 1)

# Afficher les premières lignes du DataFrame pour vérifier la conversion
print(df.head())


   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States       

In [28]:
# Séparer le dataset en X et y où X est l'ensemble des features et y est la classification
X = df.drop('income', axis=1)
y = df['income']

# Afficher les premières lignes de X et y pour vérifier la séparation
print(X.head())
print(y.head())


# One hot encoding de toutes les variables de X
X_encoded = pd.get_dummies(X)

# Quantization en suivant la règle de Scott
def quantize_column(column):
    bin_width = 3.49 * np.std(column) * (len(column) ** (-1/3))
    bins = np.arange(min(column), max(column) + bin_width, bin_width)
    return np.digitize(column, bins)

# Appliquer la quantization à toutes les colonnes numériques de X
for col in X_encoded.select_dtypes(include=[np.number]).columns:
    X_encoded[col] = quantize_column(X_encoded[col])

# Afficher les premières lignes du DataFrame quantizé
print(X_encoded.head())


   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  
0          2174             0              40   United-States  
1           

In [29]:
# Combiner X_encoded et y pour créer un DataFrame complet
df_combined = pd.concat([X_encoded, y], axis=1)

# Afficher les premières lignes du DataFrame combiné
print(df_combined.head())


   age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0   15       6             43             3             1              29   
1   23       7             43             1             1               9   
2   15      18             29             1             1              29   
3   25      20             22             1             1              29   
4    8      29             43             1             1              29   

   workclass_ Federal-gov  workclass_ Local-gov  workclass_ Never-worked  \
0                       1                     1                        1   
1                       1                     1                        1   
2                       1                     1                        1   
3                       1                     1                        1   
4                       1                     1                        1   

   workclass_ Private  ...  native-country_ Puerto-Rico  \
0                   1

In [30]:
def display_caracteristics(df_encoded, df):
    # Afficher les caractéristiques du dataset
    num_features = df_encoded.shape[1]
    num_instances = df_encoded.shape[0]
    num_classes = df['income'].nunique()

    print(f"Nombre de features: {num_features}")
    print(f"Nombre d'instances: {num_instances}")
    print(f"Nombre de classes: {num_classes}")

# Appeler la fonction avec les DataFrames appropriés
display_caracteristics(df_combined, df)


Nombre de features: 106
Nombre d'instances: 32561
Nombre de classes: 2


In [31]:
# Exporter le DataFrame encodé et quantifié en fichier CSV
df_combined.to_csv('adult_encoded_quantized.csv', index=False)
