In [1]:
# Colab-only cell: opens the browser file-upload dialog. The recorded output
# below shows adult.csv being saved into the working directory, which is where
# the main cell later reads it from. The return value is stored in `uploaded`
# but is not referenced by the other visible cells.
from google.colab import files

uploaded = files.upload()

Saving adult.csv to adult.csv


In [2]:
import numpy as np
from collections import defaultdict
import re
from math import sqrt, exp, pi, log

In [3]:
# This function should prepare the data by reading it from a file and converting it into a useful format for training and testing
# and implement 90-10 splitting as specified in the project description.
def preprocess(filename):
    """Read a comma-separated data file and split it 90/10 into train and test sets.

    The first non-blank line is treated as the header row and the last column
    as the class label. Rows are kept in file order (no shuffling): the first
    90% of the instances form the training set and the rest the test set.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    attributes : list
        Header names of every column except the last one.
    X_train, y_train : numpy.ndarray
        Attribute values and labels of the training instances (as strings).
    X_test, y_test : numpy.ndarray
        Attribute values and labels of the test instances (as strings).
    whole : numpy.ndarray
        Every parsed row, header included.

    Raises
    ------
    ValueError
        If the file contains no non-blank line.
    """
    rows = []
    # `with` guarantees the handle is closed even if parsing raises.
    with open(filename, 'r') as f:
        for line in f:
            # Collapse ", " separators, then drop the newline / outer whitespace.
            line = re.sub(r', ', ',', line).strip()
            if not line:
                # A blank line (e.g. at end of file) would otherwise become a
                # one-field row and make the array ragged.
                continue
            rows.append(line.split(","))

    if not rows:
        raise ValueError("no data found in %r" % (filename,))
    whole = np.array(rows)

    # Extract attribute values and class labels, and split dataset into training and test sets
    n_instance = whole.shape[0] - 1
    # Row 0 is the header, hence the +1. Integer arithmetic avoids float
    # rounding in 0.9 * n, and the name avoids shadowing the builtin breakpoint().
    split_row = (9 * n_instance) // 10 + 1

    attributes = list(whole[0, :-1])
    X_train = whole[1:split_row, :-1]
    y_train = whole[1:split_row, -1]
    X_test = whole[split_row:, :-1]
    y_test = whole[split_row:, -1]

    return attributes, X_train, y_train, X_test, y_test, whole

In [4]:
# For each nominal attribute, record the frequency of each value, then calculate their conditional probabilities
def nominal_conditional_prob(index_list, X_train, attribute_num, prior):
    """Estimate P(value | class) for one nominal attribute.

    Counts how often each value of column `attribute_num` occurs among the
    rows listed in `index_list`, divides each count by the total number of
    rows in `X_train`, and then divides by `prior`.

    Returns a dict mapping each observed value to that ratio. Values that
    never occur among the selected rows get no entry.
    """
    counts = {}
    for row in index_list:
        value = X_train[row, attribute_num]
        counts[value] = counts.get(value, 0) + 1

    cond_prob = {}
    for value, count in counts.items():
        # joint relative frequency divided by the class prior
        cond_prob[value] = (count / X_train.shape[0]) / prior
    return cond_prob

In [5]:
# Calculate the mean of a vector of values of a numeric attribute
def mean(values):
    """Return the arithmetic mean of `values` as a float."""
    total = sum(values)
    count = float(len(values))
    return total / count

# Calculate the standard deviation of a vector of values of a numeric attribute
def sd(values):
    """Return the sample standard deviation of `values` (n - 1 denominator)."""
    center = mean(values)
    degrees_of_freedom = float(len(values) - 1)
    # Keep the built-in sum() so the accumulation matches the mean() helper.
    squared_deviations = sum(pow(x - center, 2) for x in values)
    return sqrt(squared_deviations / degrees_of_freedom)

# Calculate probability using the Gaussian distribution's probability density function
def gaussian(x, mean, sd):
    """Evaluate the normal probability density with the given mean and sd at x."""
    squared_distance = (x - mean) ** 2
    spread = 2 * sd ** 2
    exponent = exp(-(squared_distance / spread))
    normaliser = 1 / (sqrt(2 * pi) * sd)
    return normaliser * exponent

In [6]:
# This function should calculate prior probabilities and likelihoods (conditional probabilities) from the training data and use
# them to build a naive Bayes model

def train(attributes, X_train, y_train):
    """Fit the naive Bayes model: class priors plus per-attribute likelihoods.

    Parameters
    ----------
    attributes : list
        Column names, in the same order as the columns of `X_train`.
    X_train : numpy.ndarray
        Training attribute values.
    y_train : numpy.ndarray
        Training class labels.

    Returns
    -------
    tuple
        (prior_dict, nom_conditional_pos, nom_conditional_neg,
        num_conditional_pos, num_conditional_neg, pos_index, neg_index).
        "pos" refers to rows labelled '<=50K'; every other row is "neg".
        The nominal dicts map attribute -> {value: probability}; the numeric
        dicts map attribute -> (mean, standard deviation).
    """
    nominal_attrs = ['work class', 'education', 'marital status', 'occupation',
                     'relationship', 'race', 'sex', 'native country (region)']
    numeric_attrs = ['age', 'education num', 'hours per week']

    # Class priors: relative frequency of each of the two labels
    label_counts = defaultdict(int)
    for label in y_train:
        label_counts[label] += 1
    prior_dict = {label: label_counts[label] / len(y_train) for label in ['<=50K', '>50K']}

    # Row indexes per class; anything that is not '<=50K' goes to the negative side
    pos_index, neg_index = [], []
    for i in range(y_train.size):
        bucket = pos_index if y_train[i] == '<=50K' else neg_index
        bucket.append(i)

    # Nominal attributes: one value -> probability table per class
    nom_conditional_pos, nom_conditional_neg = {}, {}
    for attr in nominal_attrs:
        column = attributes.index(attr)
        nom_conditional_pos[attr] = nominal_conditional_prob(pos_index, X_train, column, prior_dict['<=50K'])
        nom_conditional_neg[attr] = nominal_conditional_prob(neg_index, X_train, column, prior_dict['>50K'])

    # Numeric attributes: (mean, sd) of the values observed within each class
    num_conditional_pos, num_conditional_neg = {}, {}
    for attr in numeric_attrs:
        column = attributes.index(attr)
        pos_values = [float(X_train[i, column]) for i in pos_index]
        neg_values = [float(X_train[i, column]) for i in neg_index]
        num_conditional_pos[attr] = (mean(pos_values), sd(pos_values))
        num_conditional_neg[attr] = (mean(neg_values), sd(neg_values))

    return (prior_dict, nom_conditional_pos, nom_conditional_neg,
            num_conditional_pos, num_conditional_neg, pos_index, neg_index)

In [7]:
def calculate_posterior(attributes, prior, nom_conditional, num_conditional, instance):
    """Return the (unnormalised) log-posterior of one instance for one class.

    The result is log(prior) plus the sum of the log-likelihoods of every
    feature. Features of 'age', 'education num' and 'hours per week' are
    scored with a Gaussian density; all other features are looked up in the
    nominal probability tables.

    Parameters
    ----------
    attributes : sequence
        Attribute names, aligned position by position with `instance`.
    prior : float
        Prior probability of the class.
    nom_conditional : dict
        attribute -> {value: conditional probability} for nominal attributes.
    num_conditional : dict
        attribute -> (mean, standard deviation) for numeric attributes.
    instance : iterable
        Feature values of the instance.

    Returns
    -------
    float
        The log-posterior, or -inf when the prior is zero or when any feature
        cannot be scored (unseen nominal value, non-numeric text in a numeric
        column, zero standard deviation, zero or underflowed likelihood).
    """
    numeric_attrs = ('age', 'education num', 'hours per week')
    neg_inf = float("-inf")

    log_posterior = 0.0
    for position, feature in enumerate(instance):
        attr = attributes[position]
        try:
            if attr in numeric_attrs:
                # Named mu/sigma so the module-level mean()/sd() are not shadowed.
                mu = num_conditional[attr][0]
                sigma = num_conditional[attr][1]
                likelihood = gaussian(float(feature), mu, sigma)
            else:
                likelihood = nom_conditional[attr][feature]
            log_posterior += log(likelihood)
        except (KeyError, ValueError, ArithmeticError):
            # Only data-driven failures are mapped to a zero likelihood:
            # KeyError (unseen value), ValueError (unparsable number or log(0))
            # and ArithmeticError (zero sd, overflow). Everything else, such as
            # KeyboardInterrupt or a programming error, propagates.
            log_posterior += neg_inf

    # A class with zero prior has zero posterior; log(0) would raise instead.
    if prior == 0:
        return neg_inf
    return log_posterior + log(prior)

In [8]:
# This function should predict classes for new items in the testing data
def predict(attributes, prior_dict, nom_conditional_pos, nom_conditional_neg, num_conditional_pos, num_conditional_neg, X_test):
    """Classify every row of `X_test` with the trained naive Bayes model.

    Returns three parallel lists: the predicted label of each row, its
    log-posterior under '<=50K', and its log-posterior under '>50K'.
    """
    predicted, pos_logprob, neg_logprob = [], [], []

    for row in X_test:
        # Score the row under each class
        score_pos = calculate_posterior(attributes, prior_dict['<=50K'], nom_conditional_pos, num_conditional_pos, row)
        score_neg = calculate_posterior(attributes, prior_dict['>50K'], nom_conditional_neg, num_conditional_neg, row)
        pos_logprob.append(score_pos)
        neg_logprob.append(score_neg)

        # The class with the larger log-posterior wins; argmax picks the first on ties
        winner = np.argmax((score_pos, score_neg))
        predicted.append('<=50K' if winner == 0 else '>50K')

    return predicted, pos_logprob, neg_logprob

In [9]:
# This function should evaluate the prediction performance by comparing your model’s class outputs to ground
# truth labels, return and output accuracy, confusion matrix and F1 score.

def evaluate(predicted, y_test):
    """Compare predictions with the true labels and report the scores.

    '<=50K' is treated as the positive class. Predictions that are neither
    '<=50K' nor '>50K' are not counted. The confusion matrix, accuracy and
    F-score are printed and returned.

    Parameters
    ----------
    predicted : sequence
        Predicted labels, aligned with `y_test`.
    y_test : sequence
        Ground-truth labels.

    Returns
    -------
    accuracy : float
    confusion_matrix : numpy.ndarray
        [[tp, fn], [fp, tn]].
    fscore : float

    Any ratio whose denominator is zero (no data, no positive predictions, no
    positive instances, or precision + recall == 0) is reported as 0.0 instead
    of raising ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 rather than crashing.
        return numerator / denominator if denominator else 0.0

    tp = tn = fp = fn = 0

    # Generate the confusion matrix
    for i, truth in enumerate(y_test):
        guess = predicted[i]
        if guess == '<=50K':
            if guess == truth:
                tp += 1
            else:
                fp += 1
        elif guess == '>50K':
            if guess == truth:
                tn += 1
            else:
                fn += 1

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    confusion_matrix = np.array([[tp, fn], [fp, tn]], dtype=int)

    # Calculate the f-score
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    fscore = _ratio(2 * precision * recall, precision + recall)

    print(confusion_matrix)
    print("Accuracy: ", accuracy)
    print("F-score: ", fscore)
    return accuracy, confusion_matrix, fscore

In [10]:
# Main cell: runs the functions defined above on the full ADULT data set
# and prints the evaluation scores followed by the requested statistics.


# Step 1: read the data, fit the NB model on the training split and score the test split

filename = 'adult.csv' # load filepath here
attributes, X_train, y_train, X_test, y_test, whole = preprocess(filename)
(prior_dict,
 nom_conditional_pos, nom_conditional_neg,
 num_conditional_pos, num_conditional_neg,
 pos_index, neg_index) = train(attributes, X_train, y_train)
predicted, pos_logprob, neg_logprob = predict(
    attributes, prior_dict,
    nom_conditional_pos, nom_conditional_neg,
    num_conditional_pos, num_conditional_neg,
    X_test)


# Step 2: print the full evaluation results from the evaluate() function

accuracy, confusion_matrix, fscore = evaluate(predicted, y_test)


# Step 3: print data statistics and model predictions
# N is the total number of instances, F the total number of attributes, L the total number of labels
# The "class probabilities" may be unnormalized
# The "predicted class ID" must be in range (0, L)

print("Attribute vectors of instances [0, 1, 2]: ", X_train[:3,])

print("\nNumber of instances (N): ", whole.shape[0]-1)
print("Number of attributes (F): ", len(attributes))
print("Number of labels (L): ", len(set(y_train)))

# One (index, log-probability header, class header) entry per reported test instance
tail_reports = [
    (-3, "\n\nPredicted class log-probabilities for instance N-3: ", "Predicted class ID for instance N-3: "),
    (-2, "\nPredicted class log-probabilities for instance N-2: ", "Predicted class ID for instance N-2: "),
    (-1, "\nPredicted class log-probabilities for instance N-1: ", "Predicted class ID for instance N-1: "),
]
for report_index, logprob_header, class_header in tail_reports:
    print(logprob_header, "log P(c = '<=50K') = ", pos_logprob[report_index], "log P(c = '>50K') = ", neg_logprob[report_index])
    print(class_header, predicted[report_index])


[[69  8]
 [ 6 17]]
Accuracy:  0.86
F-score:  0.9078947368421053
Attribute vectors of instances [0, 1, 2]:  [['68' '?' '1st-4th' '2' 'Divorced' '?' 'Not-in-family' 'White' 'Female'
  '20' 'United-States']
 ['39' 'State-gov' 'Bachelors' '13' 'Never-married' 'Adm-clerical'
  'Not-in-family' 'White' 'Male' '40' 'United-States']
 ['50' 'Self-emp-not-inc' 'Bachelors' '13' 'Married-civ-spouse'
  'Exec-managerial' 'Husband' 'White' 'Male' '13' 'United-States']]

Number of instances (N):  1000
Number of attributes (F):  11
Number of labels (L):  2


Predicted class log-probabilities for instance N-3:  log P(c = '<=50K') =  -20.71689698193305 log P(c = '>50K') =  -19.556273652832147
Predicted class ID for instance N-3:  >50K

Predicted class log-probabilities for instance N-2:  log P(c = '<=50K') =  -25.339070637730188 log P(c = '>50K') =  -22.744589775643146
Predicted class ID for instance N-2:  >50K

Predicted class log-probabilities for instance N-1:  log P(c = '<=50K') =  -16.852794958645738