In [275]:
import pandas as pd
import numpy as np
from math import log

In [276]:
# Load the census-income records from a path relative to the notebook's
# working directory. Later cells select columns by name (e.g. 'GRINST',
# 'INCOME'), so the file is expected to provide a header row with those names.
CENSUS_DATA_PATH = "census/census-income.data"
data = pd.read_csv(CENSUS_DATA_PATH)

In [277]:
# Names of the categorical columns used by the classifier.
# 'INCOME' (the class label) must stay last: later cells iterate over
# categorical_attributes[:-1] to get the feature columns only.
categorical_attributes = (
    "ACLSWKR ADTIND ADTOCC AHGA AHSCOL AMARITL AMJIND AMJOCC ARACE "
    "AREORGN ASEX AUNMEM AUNTYPE AWKSTAT FILESTAT GRINREG GRINST HHDFMX "
    "HHDREL MIGMTR1 MIGMTR3 MIGMTR4 MIGSAME MIGSUN PARENT PEFNTVTY "
    "PEMNTVTY PENATVTY PRCITSHP SEOTR VETQVA VETYN YEAR INCOME"
).split()

In [278]:
# Keep only the categorical columns (plus the INCOME label).
# The explicit copy guarantees that the in-place missing-value imputation
# performed on data_categorical afterwards never writes through to, or warns
# about, the raw `data` frame.
data_categorical = data.loc[:, categorical_attributes].copy()

In [279]:
# Fix missing values to mode
#
# Missing entries are encoded as the string " ?" (note the leading space).
# For each listed column, every " ?" is replaced by the most frequent value
# among the non-missing entries of that column. data_categorical is modified
# in place.
#
# MIGMTR1 replaces what used to be a second, redundant pass over MIGMTR3.
# NOTE(review): MIGSUN is not imputed here; confirm whether it also uses the
# " ?" marker and should be added to this list.
MISSING_MARKER = " ?"
columns_to_impute = [
    "GRINST",
    "MIGMTR1",
    "MIGMTR3",
    "MIGMTR4",
    "MIGSAME",
    "PEFNTVTY",
    "PEMNTVTY",
    "PENATVTY",
]

for column in columns_to_impute:
    is_missing = data_categorical[column] == MISSING_MARKER
    observed_modes = data_categorical.loc[~is_missing, column].mode()
    # Skip columns with nothing to fill, and columns that are entirely
    # missing (no mode exists, so there is nothing sensible to impute).
    if is_missing.any() and not observed_modes.empty:
        data_categorical.loc[is_missing, column] = observed_modes.iloc[0]

In [280]:
# Class priors P(INCOME = label), kept both as plain probabilities and as
# natural logs so they can be added to the per-attribute log-likelihoods.
# NOTE(review): the counts are taken over every row of data_categorical,
# including rows that are later assigned to the test split.
num_total = len(data_categorical.index)
num_less = len(data_categorical[data_categorical["INCOME"] == " - 50000."].index)
num_more = len(data_categorical[data_categorical["INCOME"] == " 50000+."].index)

prob_less, prob_more = num_less / num_total, num_more / num_total
log_prob_less, log_prob_more = log(prob_less), log(prob_more)

In [281]:
# Partition the records by income label: one frame per class, all columns kept.
class_less, class_more = (
    data_categorical[data_categorical["INCOME"] == income_label]
    for income_label in (" - 50000.", " 50000+.")
)

In [282]:
# Per-attribute conditional log-probabilities:
#     probabilities[attribute][income_label][value] = log P(value | income_label)
#
# Estimates use add-one (Laplace) smoothing over the set of values observed
# for the attribute in either class:
#     P(value | label) = (count(value, label) + 1) / (class_size + n_values)
# Without smoothing, a value never seen in one class has no entry for that
# class, and a classifier that skips missing entries effectively scores it as
# probability 1 for the class that never produced it.
probabilities = {}
for categorical_attribute in categorical_attributes[:-1]:
    counts_less = class_less[categorical_attribute].value_counts()
    counts_more = class_more[categorical_attribute].value_counts()

    # Every value seen in at least one class gets an entry in both tables.
    observed_values = counts_less.index.union(counts_more.index)
    n_values = len(observed_values)

    probabilities[categorical_attribute] = {
        sal_class: {
            attribute_val: log((count + 1) / (class_size + n_values))
            for attribute_val, count
            in class_counts.reindex(observed_values, fill_value=0).items()
        }
        for sal_class, class_counts, class_size in (
            (" - 50000.", counts_less, num_less),
            (" 50000+.", counts_more, num_more),
        )
    }

In [299]:
# Shuffle the rows and split them into a training part and a test part.
# A dedicated, seeded generator makes the shuffle reproducible across kernel
# restarts without touching numpy's global random state.
SPLIT_SEED = 0
# Number of rows in the training part; everything after it goes to `test`.
TRAIN_SIZE = 99762

shuffled_positions = np.random.RandomState(SPLIT_SEED).permutation(data_categorical.shape[0])
data_randomised = data_categorical.iloc[shuffled_positions]

# Explicit copies: `train` and `test` are independent frames, so adding
# columns to them later does not trigger SettingWithCopyWarning.
train = data_randomised.iloc[:TRAIN_SIZE].copy()
test = data_randomised.iloc[TRAIN_SIZE:].copy()

In [297]:
# Classify every row of `test` with the naive Bayes model:
#     score(label) = log P(label) + sum over attributes of log P(value | label)
# and predict the label with the larger score (ties go to " - 50000.").
#
# Scoring is vectorised column by column instead of row by row; the terms are
# still accumulated in attribute order with the prior added last.

# Log-probability used when a value has no entry for a class. It is lower than
# any estimate obtainable from a single observation in that class, so an
# unseen value counts against the class instead of being silently skipped
# (skipping adds 0, i.e. treats the value as having probability 1).
unseen_log_prob = {
    " - 50000.": log(1.0 / (num_less + 1)),
    " 50000+.": log(1.0 / (num_more + 1)),
}
log_priors = {
    " - 50000.": log_prob_less,
    " 50000+.": log_prob_more,
}

posteriors = {}
for sal_class in (" - 50000.", " 50000+."):
    log_likelihood = np.zeros(test.shape[0])
    for categorical_attribute in categorical_attributes[:-1]:
        # Series.map with a dict yields NaN for values absent from the table.
        attribute_scores = (
            test[categorical_attribute]
            .map(probabilities[categorical_attribute][sal_class])
            .fillna(unseen_log_prob[sal_class])
            .astype(float)
        )
        log_likelihood = log_likelihood + attribute_scores.values
    posteriors[sal_class] = log_likelihood + log_priors[sal_class]

posterior_less = posteriors[" - 50000."]
posterior_more = posteriors[" 50000+."]

# One predicted label per test row, in row order.
results = np.where(posterior_less >= posterior_more, " - 50000.", " 50000+.").tolist()

# Rebind `test` to a new frame carrying the predictions rather than writing
# into what may be a slice of another frame (avoids SettingWithCopyWarning).
test = test.assign(PREDICTION=results)

[' 50000+.',
 ' - 50000.',
 ' 50000+.',
 ' 50000+.',
 ' - 50000.',
 ' 50000+.',
 ' - 50000.',
 ' 50000+.',
 ' - 50000.',
 ' - 50000.']