# Obtener los datos

In [1]:
import pandas as pd

"""
Leer datos de entrada
"""

# Definir las categorías del dataset
categories = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 
              'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
             'target']

# Definir el dataset con sus categorías
adult_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', 
                         names=categories)

# Definir el conunto de prueba con sus categorías
adult_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', 
                         names=categories)
adult_test.drop(adult_test.index[0], inplace=True)

# Codificar valores no continuos

In [2]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

# Atributos que no son continuos
labels = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 
          'native-country', 'target']

# Utilizamos LabelEncoder para codificar
d1 = defaultdict(LabelEncoder)
d2 = defaultdict(LabelEncoder)
# Codificar los valores, solo si no son continuos
adult_data = adult_data.apply(lambda x: d1[x.name].fit_transform(x) if x.name in labels else x)
adult_test = adult_test.apply(lambda x: d2[x.name].fit_transform(x) if x.name in labels else x)

# Separar los datos en [X, y]

In [3]:
X_train = adult_data.drop('target', 1)
y_train = adult_data.target

X_test = adult_test.drop('target', 1)
y_test = adult_test.target

# Elegir el mejor clasificador

In [4]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as skm

gnb = GaussianNB()
mnb = MultinomialNB()
lr = LogisticRegression()

models = [gnb, mnb, lr]

# Naive Bayes Gaussiano
model = models[0].fit(X_train, y_train)
preds = models[0].predict(X_test)
f1_gnb = skm.f1_score(y_test, preds, average='weighted')

# Naive Bayes Multinomial
model = models[1].fit(X_train, y_train)
preds = models[1].predict(X_test)
f1_mnb = skm.f1_score(y_test, preds, average='weighted')

# Regresión Logística
model = models[2].fit(X_train, y_train)
preds = models[2].predict(X_test)
f1_lr = skm.f1_score(y_test, preds, average='weighted')

# Puntuaciones F1 de cada clasificador
# Generalmente F1 es más útil que la precisión, especialmente con una distribución de clases desigual.
f1_scores = [f1_gnb, f1_mnb, f1_lr]

max_v = 0
index = 0
for i, val in enumerate(f1_scores):
    if val > max_v:
        max_v = val
        index = i

# Elegimos al mejor modelo al que tiene mayor puntuación F1
model = models[index]

# Entrenar clasificador y reportar resultados

In [5]:
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Reporte de resultados
print(skm.classification_report(y_test, preds, target_names=['<=50K', '>50K']))

             precision    recall  f1-score   support

      <=50K       0.81      0.95      0.88     12435
       >50K       0.65      0.29      0.40      3846

avg / total       0.77      0.80      0.76     16281

