In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

In [64]:
# Column name -> dtype for the Adult data file. The file is read with explicit
# column names, so this mapping is the single source for both the names (in
# file order) and the dtypes.
ADULT_COLUMN_DTYPES = {
    'age': np.int64,
    'workclass': 'category',
    'fnlwgt': np.int64,
    'education': 'category',
    'education-num': np.int64,
    'marital-status': 'category',
    'occupation': 'category',
    'relationship': 'category',
    'race': 'category',
    'sex': 'category',
    'capital-gain': np.int64,
    'capital-loss': np.int64,
    'hours-per-week': np.int64,
    'native-country': 'category',
    'income': 'category',
}

# Load the training data and drop every row that has a missing value
# ('?' entries are parsed as NaN).
df = (
    pd.read_csv(
        '../data/adult.data',
        names=list(ADULT_COLUMN_DTYPES),
        dtype=ADULT_COLUMN_DTYPES,
        na_values='?',
        # Fields are separated by a comma followed by a space. A multi-character
        # `sep` is treated as a regex and forces the slow python engine (with a
        # ParserWarning); a plain comma plus skipinitialspace parses the same
        # fields with the default C engine.
        sep=',',
        skipinitialspace=True,
    )
    .dropna()
)
df.head()

  return func(*args, **kwargs)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [65]:
def get_prior_probability(series, value):
    """Return the fraction of entries in ``series`` that are equal to ``value``.

    Parameters
    ----------
    series : pandas.Series
        Series with ``category`` dtype.
    value : scalar
        Category label to count.

    Returns
    -------
    float
        Number of entries equal to ``value`` divided by ``len(series)``.

    Raises
    ------
    AssertionError
        If ``series`` does not have ``category`` dtype.
    ZeroDivisionError
        If ``series`` is empty.
    """
    assert series.dtype.name == 'category', 'series must have category dtype'

    total = len(series)
    if total == 0:
        # Raised explicitly: dividing a NumPy integer by zero would only warn
        # and return NaN instead of failing.
        raise ZeroDivisionError('cannot compute a probability from an empty series')

    # Vectorised count of matches; the built-in sum() would iterate the
    # boolean Series one element at a time in Python.
    return (series == value).sum() / total

# Class priors of the income label, reported to four significant digits.
prior_below = get_prior_probability(df['income'], '<=50K')
prior_above = get_prior_probability(df['income'], '>50K')
print('Prior probability of income <=50K: {0:.4g}'.format(prior_below))
print('Prior probability of income >50K: {0:.4g}'.format(prior_above))


Prior probability of income <=50K: 0.7511
Prior probability of income >50K: 0.2489


In [66]:
def get_naive_bayes_alpha(cat_series, cat_val, class_series, class_val):
    """Return the relative frequency of ``cat_val`` within one class.

    The entries of ``cat_series`` whose corresponding ``class_series`` entry
    equals ``class_val`` are selected, and the fraction of those entries equal
    to ``cat_val`` is returned (computed by ``get_prior_probability``).

    Both series must have ``category`` dtype and the same length; otherwise an
    ``AssertionError`` is raised.
    """
    for candidate in (cat_series, class_series):
        assert candidate.dtype.name == 'category'
    assert len(cat_series) == len(class_series)

    # Boolean mask of the rows that belong to the requested class.
    in_class = class_series == class_val
    return get_prior_probability(cat_series[in_class], cat_val)

# Check against the given solution: per-class relative frequency of every
# workclass value, computed for '<=50K' first and '>50K' second.
workclass_alpha_below, workclass_alpha_above = (
    {
        key: get_naive_bayes_alpha(df['workclass'], key, df['income'], income_class)
        for key in [
            'Private',
            'Self-emp-not-inc',
            'Self-emp-inc',
            'Federal-gov',
            'Local-gov',
            'State-gov',
            'Without-pay',
            'Never-worked',
        ]
    }
    for income_class in ('<=50K', '>50K')
)
print(workclass_alpha_above)
print(workclass_alpha_below)

{'Private': 0.6494405966968567, 'Self-emp-not-inc': 0.09509856153436334, 'Self-emp-inc': 0.07991475759190197, 'Federal-gov': 0.048614810868407035, 'Local-gov': 0.0811134789557805, 'State-gov': 0.04581779435269046, 'Without-pay': 0.0, 'Never-worked': 0.0}
{'Private': 0.768517701068244, 'Self-emp-not-inc': 0.07879403195903593, 'Self-emp-inc': 0.020923457226096936, 'Federal-gov': 0.025514257967687824, 'Local-gov': 0.06435949501191843, 'State-gov': 0.041273064359495015, 'Without-pay': 0.0006179924075218504, 'Never-worked': 0.0}


In [69]:
def get_naive_bayes_gauss_params(cont_series, class_series, class_val):
    """Return the mean and variance of ``cont_series`` within one class.

    Only the entries whose corresponding ``class_series`` entry equals
    ``class_val`` are used. The variance is computed with ``ddof=0``, i.e.
    divided by the number of selected entries.

    ``cont_series`` must not have ``category`` dtype, ``class_series`` must
    have it, and both must have the same length; otherwise an
    ``AssertionError`` is raised.

    Returns a dict with the keys ``'mean'`` and ``'variance'``.
    """
    assert cont_series.dtype.name != 'category'
    assert class_series.dtype.name == 'category'
    assert len(cont_series) == len(class_series)

    # Boolean mask of the rows that belong to the requested class.
    in_class = class_series == class_val
    class_values = cont_series[in_class]

    params = {}
    params['mean'] = class_values.mean()
    params['variance'] = class_values.var(ddof=0)
    return params

# Check against the given solution: mean and variance of age per income
# class, computed for '<=50K' first and '>50K' second.
age_params_below, age_params_above = (
    get_naive_bayes_gauss_params(df['age'], df['income'], income_class)
    for income_class in ('<=50K', '>50K')
)
print(age_params_above)
print(age_params_below)

{'mean': 43.95911028236548, 'variance': 105.4513115152752}
{'mean': 36.60806038668668, 'variance': 181.28829205320872}


In [82]:
# Report the naive Bayes parameters of four features once per income class.
# The report is identical for both classes, so it is produced by a single loop
# rather than two copy-pasted sections; the printed output is unchanged.
marital_status_values = [
    'Married-civ-spouse',
    'Divorced',
    'Never-married',
    'Separated',
    'Widowed',
    'Married-spouse-absent',
    'Married-AF-spouse',
]
race_values = ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black']

for class_index, income_class in enumerate(['<=50K', '>50K']):
    if class_index > 0:
        # Separator between the two class reports.
        print('\n')
    print(income_class)
    print(
        'education-num ({0}):'.format(income_class),
        get_naive_bayes_gauss_params(df['education-num'], df['income'], income_class),
    )
    print(
        'marital-status ({0}):'.format(income_class),
        {
            key: get_naive_bayes_alpha(df['marital-status'], key, df['income'], income_class)
            for key in marital_status_values
        },
    )
    print(
        'race ({0}):'.format(income_class),
        {
            key: get_naive_bayes_alpha(df['race'], key, df['income'], income_class)
            for key in race_values
        },
    )
    print(
        'capital-gain ({0}):'.format(income_class),
        get_naive_bayes_gauss_params(df['capital-gain'], df['income'], income_class),
    )

<=50K
education-num (<=50K): {'mean': 9.629116270857244, 'variance': 5.82518914574656}
marital-status (<=50K): {'Married-civ-spouse': 0.3383949854330361, 'Divorced': 0.16606338836408582, 'Never-married': 0.4085812660015891, 'Separated': 0.03853624084046967, 'Widowed': 0.03297430917277302, 'Married-spouse-absent': 0.01496424472499338, 'Married-AF-spouse': 0.00048556546305288247}
race (<=50K): {'White': 0.842853359230158, 'Asian-Pac-Islander': 0.028560077690474087, 'Amer-Indian-Eskimo': 0.011123863335393308, 'Other': 0.009269886112827757, 'Black': 0.10819281363114681}
capital-gain (<=50K): {'mean': 148.8938377328507, 'variance': 876791.7958717996}


>50K
education-num (>50K): {'mean': 11.606419818859882, 'variance': 5.608680149804337}
marital-status (>50K): {'Married-civ-spouse': 0.8522908897176346, 'Divorced': 0.06020245071923282, 'Never-married': 0.06259989344698988, 'Separated': 0.008790623335109217, 'Widowed': 0.010655301012253596, 'Married-spouse-absent': 0.004128929142248269, 'Marr

In [86]:
# Column name -> dtype for the Adult test file. The file is read with explicit
# column names, so this mapping is the single source for both the names (in
# file order) and the dtypes.
ADULT_COLUMN_DTYPES = {
    'age': np.int64,
    'workclass': 'category',
    'fnlwgt': np.int64,
    'education': 'category',
    'education-num': np.int64,
    'marital-status': 'category',
    'occupation': 'category',
    'relationship': 'category',
    'race': 'category',
    'sex': 'category',
    'capital-gain': np.int64,
    'capital-loss': np.int64,
    'hours-per-week': np.int64,
    'native-country': 'category',
    'income': 'category',
}

# Load the test data and drop every row that has a missing value
# ('?' entries are parsed as NaN).
test_df = (
    pd.read_csv(
        '../data/adult.test',
        names=list(ADULT_COLUMN_DTYPES),
        dtype=ADULT_COLUMN_DTYPES,
        na_values='?',
        # Fields are separated by a comma followed by a space. A multi-character
        # `sep` is treated as a regex and forces the slow python engine (with a
        # ParserWarning); a plain comma plus skipinitialspace parses the same
        # fields with the default C engine.
        sep=',',
        skipinitialspace=True,
        # The first line of the file is skipped - presumably a non-data header
        # line; confirm against the raw file.
        skiprows=1,
    )
    .dropna()
)
# NOTE(review): the displayed rows show income labels with a trailing '.'
# ('<=50K.', '>50K.'), unlike the labels in the training frame ('<=50K',
# '>50K'). Normalise them before comparing against the training classes.
test_df.head()

  return func(*args, **kwargs)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K.
