In [6]:
# Imports
import pandas as pd
import numpy as np
import warnings

from math import log
from sklearn.metrics import accuracy_score

# Disable warnings from being printed
warnings.filterwarnings('ignore')

In [7]:
# Read the data and get the categorical attrbiutes
data = pd.read_csv("census/census-income.data")
categorical_attributes = ['ACLSWKR','ADTIND','ADTOCC','AHGA','AHSCOL','AMARITL','AMJIND','AMJOCC','ARACE',
                          'AREORGN','ASEX','AUNMEM','AUNTYPE','AWKSTAT','FILESTAT','GRINREG','GRINST','HHDFMX',
                          'HHDREL','MIGMTR1','MIGMTR3','MIGMTR4','MIGSAME','MIGSUN','PARENT','PEFNTVTY',
                          'PEMNTVTY','PENATVTY','PRCITSHP','SEOTR','VETQVA','VETYN','YEAR', 'INCOME']
data_categorical = data.loc[:,categorical_attributes]

In [8]:
# Preprocessing
# Fix missing values to mode as all are categorical variables
# and have one particular value which is very dominatingly occurring.

data_categorical.loc[data_categorical.GRINST == " ?", "GRINST"] = \
data_categorical.loc[data_categorical.GRINST != " ?", "GRINST"].mode().iloc[0]

data_categorical.loc[data_categorical.MIGMTR3 == " ?", "MIGMTR3"] = \
data_categorical.loc[data_categorical.MIGMTR3 != " ?", "MIGMTR3"].mode().iloc[0]

data_categorical.loc[data_categorical.MIGMTR4 == " ?", "MIGMTR4"] = \
data_categorical.loc[data_categorical.MIGMTR4 != " ?", "MIGMTR4"].mode().iloc[0]

data_categorical.loc[data_categorical.MIGSAME == " ?", "MIGSAME"] = \
data_categorical.loc[data_categorical.MIGSAME != " ?", "MIGSAME"].mode().iloc[0]

data_categorical.loc[data_categorical.PEFNTVTY == " ?", "PEFNTVTY"] = \
data_categorical.loc[data_categorical.PEFNTVTY != " ?", "PEFNTVTY"].mode().iloc[0]

data_categorical.loc[data_categorical.PEMNTVTY == " ?", "PEMNTVTY"] = \
data_categorical.loc[data_categorical.PEMNTVTY != " ?", "PEMNTVTY"].mode().iloc[0]

data_categorical.loc[data_categorical.PENATVTY == " ?", "PENATVTY"] = \
data_categorical.loc[data_categorical.PENATVTY != " ?", "PENATVTY"].mode().iloc[0]

# No other attributes have missing values

# As code is a categorical thing. Keeping it float poses problem in using it as dict key.
data_categorical["MIGMTR1"] = str(data_categorical["MIGMTR1"])

# Add relevant continuous attributes with binning. No of bins decided based on distinct values present.
# data_categorical["wage_bins"] = pd.cut(data.AHRSPAY, bins=1000, labels=False)
# categorical_attributes.insert(0, "wage_bins")

# data_categorical["capgain_bins"] = pd.cut(data.CAPGAIN, bins=132, labels=False)
# categorical_attributes.insert(0, "capgain_bins")

# data_categorical["caploss_bins"] = pd.cut(data.CAPLOSS, bins=113, labels=False)
# categorical_attributes.insert(0, "caploss_bins")

In [9]:
def iterate(accuracies):
    
    # Take a random samples of all data
    # and divide it equally into train and test of size 2000 each.
    data_randomised = data_categorical.iloc[np.random.permutation(data_categorical.shape[0])]
    train = data_randomised.iloc[:10000]
    test = data_randomised.iloc[10000:20000]
    
    # Separate the train data classwise.
    class_less = train.loc[data_categorical.INCOME == " - 50000.", :]
    class_more = train.loc[data_categorical.INCOME == " 50000+.", :]

    # Compute number on instances classwise and total.
    num_less = class_less.shape[0]
    num_more = class_more.shape[0]
    num_total = train.shape[0]

    # Compute priors for each class
    prob_less = num_less/num_total
    prob_more = num_more/num_total

    # Use log probabilities to avoid numerical errors
    log_prob_less = log(prob_less)
    log_prob_more = log(prob_more)
    
    # Compute likelihoods and take log of them
    probabilities = {}

    for categorical_attribute in categorical_attributes[:-1]:
        probabilities[categorical_attribute] = {
            " - 50000." : dict(class_less[categorical_attribute].value_counts()/num_less),
            " 50000+." : dict(class_more[categorical_attribute].value_counts()/num_more)
        }

    for categorical_attribute in probabilities.keys():
        for sal_class in probabilities[categorical_attribute].keys():
            for attribute_val in probabilities[categorical_attribute][sal_class].keys():
                probabilities[categorical_attribute][sal_class][attribute_val] = \
                log(probabilities[categorical_attribute][sal_class][attribute_val])
                
    # Predict
    results = []

    for i in range(test.shape[0]):
        record = test.iloc[i:i+1]

        posterior_less = 0
        posterior_more = 0

        for categorical_attribute in categorical_attributes[:-1]:
    
            if record[categorical_attribute].iloc[0] in probabilities[categorical_attribute][" - 50000."].keys():
                posterior_less = posterior_less + \
                probabilities[categorical_attribute][" - 50000."][record[categorical_attribute].iloc[0]]
    
            if record[categorical_attribute].iloc[0] in probabilities[categorical_attribute][" 50000+."].keys():
                posterior_more = posterior_more + \
                probabilities[categorical_attribute][" 50000+."][record[categorical_attribute].iloc[0]]

        posterior_less = posterior_less + log_prob_less
        posterior_more = posterior_more + log_prob_more

        if posterior_less >= posterior_more:
            cur_class = " - 50000."
        else:
            cur_class = " 50000+."
    
        results.append(cur_class)
    
    test['PREDICTION'] = results
    
    # Compute accuracy
    accuracies.append(accuracy_score(test.INCOME, test.PREDICTION))

In [10]:
# Run 10 times to compute mean and standard deviation of accuracy
accuracies = []
for i in range(10):
    iterate(accuracies)
    
print("Mean accuracy: ", np.mean(accuracies))
print("Standard deviation of accuracies: ", np.std(accuracies))

Mean accuracy:  0.71157
Standard deviation of accuracies:  0.00954798931713
