# EDA - Problema de clasificación - Adult
### Objetivo
Predecir si los ingresos anuales de una persona son superiores a \$50K.
### Data
https://archive.ics.uci.edu/dataset/2/adult

## 0. Carga de librerías

In [18]:
import sys
from pathlib import Path
import pandas as pd
from sklearn.impute import KNNImputer
import numpy as np

## 1. Carga de Datos

In [2]:
# Make the project root importable so the local config package can be loaded.
# The membership check keeps sys.path free of duplicate entries when this cell
# is executed more than once in the same kernel.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Imported here rather than in the top import cell, presumably because the
# package is only resolvable once PROJECT_ROOT is on sys.path.
from rfh_testdatascience_1.config import RAW_DATA_DIR

# Paths to the raw files
train_path = RAW_DATA_DIR / "adult.data"
test_path = RAW_DATA_DIR / "adult.test"

# First, naive read with pandas defaults: the first line of each file is
# consumed as the header row. The files are re-read further down once the
# right options are known.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

[32m2025-07-29 22:50:58.987[0m | [1mINFO    [0m | [36mrfh_testdatascience_1.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\Usuario\Desktop\Capgemini\RFH-TestDataScience-1\rfh_testdatascience_1[0m


## 2. Revisión inicial

In [3]:
# Preview the first rows of both splits, with a divider line in between
print(
    df_train.head(),
    '------------------------------------------------',
    df_test.head(),
    sep='\n',
)

   39          State-gov   77516   Bachelors   13        Never-married  \
0  50   Self-emp-not-inc   83311   Bachelors   13   Married-civ-spouse   
1  38            Private  215646     HS-grad    9             Divorced   
2  53            Private  234721        11th    7   Married-civ-spouse   
3  28            Private  338409   Bachelors   13   Married-civ-spouse   
4  37            Private  284582     Masters   14   Married-civ-spouse   

         Adm-clerical   Not-in-family   White     Male   2174   0   40  \
0     Exec-managerial         Husband   White     Male      0   0   13   
1   Handlers-cleaners   Not-in-family   White     Male      0   0   40   
2   Handlers-cleaners         Husband   Black     Male      0   0   40   
3      Prof-specialty            Wife   Black   Female      0   0   40   
4     Exec-managerial            Wife   White   Female      0   0   40   

    United-States   <=50K  
0   United-States   <=50K  
1   United-States   <=50K  
2   United-States   <=50K 

### Insights
- Falta incluir las cabeceras -> están en el archivo "adult.names"
- Los nulos están codificados con "?"
- Sobra la primera línea en test

In [4]:
# Column names, in file order (taken from adult.names)
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

# NOTE(review): the missing-value marker is " ?" with a leading space, which
# suggests string fields keep the blank that follows each comma. Consider
# skipinitialspace=True (with na_values="?") if downstream code expects
# stripped labels -- confirm before changing.

# Training split: the file has no header row, so every line is data.
df_train = pd.read_csv(
    train_path,
    names=column_names,
    header=None,
    na_values=" ?",
)

# Test split: header=0 makes pandas consume the file's first line as a header,
# which `names` then replaces with the proper column names.
df_test = pd.read_csv(
    test_path,
    names=column_names,
    header=0,
    na_values=" ?",
)

In [5]:
# Preview the first rows again, now that the columns carry proper names
print(f"{df_train.head()}\n------------------------------------------------\n{df_test.head()}")

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

In [6]:
# Dimensions (rows, columns) of the train and test splits
print(
    df_train.shape,
    '------------------------------------------------',
    df_test.shape,
    sep='\n',
)

(32561, 15)
------------------------------------------------
(16281, 15)


In [7]:
# Column dtypes of the train and test splits
print('\n'.join(map(str, (
    df_train.dtypes,
    '------------------------------------------------',
    df_test.dtypes,
))))

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object
------------------------------------------------
age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object


In [8]:
# Basic descriptive statistics of the numeric columns, train first and then test
print(
    df_train.describe(),
    '------------------------------------------------',
    df_test.describe(),
    sep='\n',
)

                age        fnlwgt  education-num  capital-gain  capital-loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours-per-week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  
------------------------------------------------
              

### Insights
- Resulta un poco raro que train y test tengan exactamente los mismos estadísticos en las columnas numéricas. Imagino que se debe a que es un dataset preparado, aunque por la descripción parece que es una extracción real.
- Hay valores que parecen nulos, como el máximo de hours-per-week (99).
- La distribución de capital-gain y capital-loss (mediana y percentil 75 iguales a 0, con máximos muy altos) sugiere que podrían ser variables erróneas.

## 2. Análisis de la variable objetivo

In [9]:
# Class balance of the target: absolute counts per label in each split
print(f"{df_train['income'].value_counts()}\n------------------------------------------------\n{df_test['income'].value_counts()}")

income
<=50K    24720
>50K      7841
Name: count, dtype: int64
------------------------------------------------
income
<=50K.    12435
>50K.      3846
Name: count, dtype: int64


In [10]:
# Class balance of the target expressed as percentages
print(
    "Train (%):",
    df_train['income'].value_counts(normalize=True) * 100,
    '------------------------------------------------',
    "Test (%):",
    df_test['income'].value_counts(normalize=True) * 100,
    sep='\n',
)

Train (%):
income
<=50K    75.919044
>50K     24.080956
Name: proportion, dtype: float64
------------------------------------------------
Test (%):
income
<=50K.    76.377372
>50K.     23.622628
Name: proportion, dtype: float64


### Insights
- Train y test tienen más o menos la misma distribución en la variable objetivo
- Está ligeramente desbalanceado; habrá que probar si el modelo mejora con algún método que ajuste este desbalanceo

## 3. Análisis de datos faltantes

In [11]:
# Missing values per column, train first and then test
print(
    df_train.isnull().sum(),
    '------------------------------------------------',
    df_test.isnull().sum(),
    sep='\n',
)

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64
------------------------------------------------
age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
income              0
dtype: int64


In [12]:
def missing_summary(df, columns):
    """Summarise missing values for the selected columns of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : list of str
        Column labels to include in the report.

    Returns
    -------
    pandas.DataFrame
        One row per column with 'Nulos' (number of missing values) and
        '% Nulos' (percentage of missing values, rounded to 2 decimals).
    """
    null_mask = df[columns].isnull()
    return pd.DataFrame({
        'Nulos': null_mask.sum(),
        '% Nulos': (null_mask.mean() * 100).round(2),
    })


# Columns that contain missing values
cols = ['occupation', 'workclass', 'native-country']

# Same report for both splits; `missing_df` ends up holding the test summary.
missing_df = missing_summary(df_train, cols)
print(missing_df)

print('----------------------------------------')

missing_df = missing_summary(df_test, cols)
print(missing_df)

                Nulos  % Nulos
occupation       1843     5.66
workclass        1836     5.64
native-country    583     1.79
----------------------------------------
                Nulos  % Nulos
occupation        966     5.93
workclass         963     5.91
native-country    274     1.68


### Insights
- Tenemos alrededor de un 5% de valores nulos en variables categóricas; vamos a analizarlos e intentar imputarlos de alguna forma

#### workclass

In [15]:
# Frequency of each workclass category in train and test
print(
    df_train['workclass'].value_counts(),
    '------------------------------------------------',
    df_test['workclass'].value_counts(),
    sep='\n',
)

workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64
------------------------------------------------
workclass
Private             11210
Self-emp-not-inc     1321
Local-gov            1043
State-gov             683
Self-emp-inc          579
Federal-gov           472
Without-pay             7
Never-worked            3
Name: count, dtype: int64


#### occupation

In [16]:
# Frequency of each occupation category in train and test
print(f"{df_train['occupation'].value_counts()}\n------------------------------------------------\n{df_test['occupation'].value_counts()}")

occupation
Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: count, dtype: int64
------------------------------------------------
occupation
Prof-specialty       2032
Exec-managerial      2020
Craft-repair         2013
Sales                1854
Adm-clerical         1841
Other-service        1628
Machine-op-inspct    1020
Transport-moving      758
Handlers-cleaners     702
Tech-support          518
Farming-fishing       496
Protective-serv       334
Priv-house-serv        93
Armed-Forces            6
Name: count, dtype: int64


#### native-country

In [17]:
# Frequency of each native-country category in train and test
print('\n'.join(map(str, (
    df_train['native-country'].value_counts(),
    '------------------------------------------------',
    df_test['native-country'].value_counts(),
))))

native-country
United-States                 29170
Mexico                          643
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
France                           29
Greece       