In [None]:
from random import randrange
from math import sqrt,exp, pi
import pandas as pd, numpy as np, sys
from IPython.display import HTML
import argparse
import os

CONFIG_FILE = '.config_ipynb'

# sys.argv is overwritten so that argparse sees a simulated command line:
# whitespace-separated tokens read from CONFIG_FILE when that file exists,
# otherwise a built-in demo command line.
if os.path.isfile(CONFIG_FILE):
    with open(CONFIG_FILE) as f:
        sys.argv = f.read().split()
else:
    sys.argv = ['test_args.py', 'input_file', '--int_param', '12']

parser = argparse.ArgumentParser()
parser.add_argument("input_file", help="Input image, directory, or npy.")
parser.add_argument("--int_param", type=int, default=4, help="an optional integer parameter.")
# parse_args() with no arguments reads sys.argv[1:], i.e. the tokens set above.
args = parser.parse_args()
p = args.int_param
print(args.input_file, p)



def fit(dataset, nb, decision, *args):
    """Score a classifier on every fold produced by ``cross_validation``.

    Each fold is held out once: the remaining folds are merged into the
    training set, the held-out records are copied with their label blanked,
    and the classifier's predictions are compared with the true labels.

    Args:
        dataset: Collection of records; passed unchanged to
            ``cross_validation``.
        nb: Classifier callable invoked as
            ``nb(X_train, X_test, decision, *args)``; its result is compared
            position by position with the held-out labels.
        decision: Index of the label column inside each record.
        *args: Extra positional arguments forwarded to ``nb``.

    Returns:
        A list with one accuracy value (as computed by ``acc_per``) per fold.
    """
    folds = cross_validation(dataset)
    # Loop-invariant: stack the folds once instead of once per held-out fold.
    stacked = np.array(folds)
    acc_list = []
    for index, held_out in enumerate(folds):
        # Remove the held-out fold, then merge the remaining folds into a
        # single array of training records.
        X_train = np.concatenate(np.delete(stacked, index, axis=0))
        # Test records are copies whose label slot is blanked, so the
        # classifier cannot read the answer.
        X_test = []
        for record in held_out:
            masked = list(record)
            masked[decision] = None
            X_test.append(masked)
        pred = nb(X_train, X_test, decision, *args)
        y_test = [record[decision] for record in held_out]
        acc_list.append(acc_per(y_test, pred))
    return acc_list

def naive_bayes_classifier(X_train, X_test, decision):
    """Predict a label for every test record from per-class Gaussian scores.

    Args:
        X_train: Training records; each holds its label at index
            ``decision``.
        X_test: Records to classify. A record may still contain the label
            slot at index ``decision`` (its value is ignored) or may hold
            feature values only.
        decision: Index of the label column.

    Returns:
        A list with one predicted label per test record, in input order.
        An entry is ``None`` when no class scores are available for it.
    """
    class_stats = cls_std(X_train, decision)
    # cls_std removes the label column from its statistics, so every class
    # holds exactly one (mean, stdev, count) triple per feature column.
    n_features = len(next(iter(class_stats.values()))) if class_stats else None
    predictions = []
    for record in X_test:
        features = list(record)
        # t_prob pairs features[i] with the i-th statistic.  Dropping the
        # label slot keeps the two aligned even when the label is not the
        # last column; without this, features after the label are scored
        # against the wrong statistics and the blanked label is read as a
        # feature value.
        if n_features is not None and len(features) == n_features + 1:
            del features[decision]
        res_prob = t_prob(class_stats, features)
        best_target_class, high_prob = None, -1
        for target_value, probability in res_prob.items():
            if best_target_class is None or probability > high_prob:
                high_prob = probability
                best_target_class = target_value
        predictions.append(best_target_class)
    return predictions


def cross_validation(dataset, num_of_folds=5):
    """Randomly partition ``dataset`` into equally sized folds.

    Records are drawn at random without replacement.  Every fold receives
    ``len(dataset) // num_of_folds`` records; records left over after the
    last fold is full are not placed in any fold, so all folds have the same
    length.

    Args:
        dataset: Collection of records.  It is copied into a list first and
            is not modified.
        num_of_folds: Number of folds to build.  Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        A list of ``num_of_folds`` lists of records.

    Raises:
        ValueError: If ``num_of_folds`` is smaller than 1.
    """
    if num_of_folds < 1:
        raise ValueError("num_of_folds must be a positive integer")
    remaining = list(dataset)
    fold_size = len(remaining) // num_of_folds
    folds = []
    for _ in range(num_of_folds):
        # pop() removes each drawn record from the pool, so a record can
        # never appear in more than one fold.
        folds.append(
            [remaining.pop(randrange(len(remaining))) for _ in range(fold_size)]
        )
    return folds


def cls_std(dataset, decision):
    """Summarise the records of every class, column by column.

    Args:
        dataset: Collection of records supporting ``len()`` and integer
            indexing.
        decision: Index of the label column inside each record.

    Returns:
        A dict mapping each label to a list of ``(mean, stdev, count)``
        tuples, one per column in column order, with the entry that belongs
        to the label column removed.
    """
    grouped = {}
    for row in (dataset[pos] for pos in range(len(dataset))):
        grouped.setdefault(row[decision], []).append(row)

    summaries = {}
    for label, rows in grouped.items():
        # zip(*rows) turns the class's records into one tuple per column.
        per_column = [
            (mean(values), stdev(values), len(values))
            for values in zip(*rows)
        ]
        # The label column is not a feature; discard its statistics.
        del per_column[decision]
        summaries[label] = per_column
    return summaries


def t_prob(sort_data, row):
    """Score ``row`` against every class.

    A class score is its prior (class count divided by the total count over
    all classes) multiplied by one Gaussian density per feature.

    Args:
        sort_data: Dict mapping a label to a list of ``(mean, stdev, count)``
            tuples, one per feature.  The count stored in the first tuple is
            used as the class size.
        row: Indexable feature values; ``row[i]`` is scored against the i-th
            tuple of each class.

    Returns:
        A dict mapping each label to its (unnormalised) score.
    """
    total_count = sum([sort_data[label][0][2] for label in sort_data])
    scores = {}
    for label, feature_stats in sort_data.items():
        score = feature_stats[0][2] / float(total_count)
        for position, (mu, sigma, _) in enumerate(feature_stats):
            # A zero spread would divide by zero below; fall back to 1.
            if sigma == 0.0:
                sigma = 1
            exponent = exp(-((row[position] - mu) ** 2 / (2 * sigma ** 2)))
            density = (1 / (sqrt(2 * pi) * sigma)) * exponent
            score *= density
        scores[label] = score
    return scores

def acc_per(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Args:
        actual: Indexable collection of true labels.
        predicted: Indexable collection of predicted labels; it is read at
            every index of ``actual``.

    Returns:
        Accuracy as a float between 0.0 and 100.0.  An empty ``actual``
        yields 0.0 rather than a division by zero.
    """
    total = len(actual)
    if total == 0:
        # Nothing to score: report 0% instead of raising ZeroDivisionError.
        return 0.0
    correct = sum(1 for i in range(total) if actual[i] == predicted[i])
    return correct / float(total) * 100.0


def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    ``numbers`` must support ``len()``; an empty collection raises
    ``ZeroDivisionError``.
    """
    total = sum(numbers)
    return total / float(len(numbers))


def stdev(numbers):
    """Return the sample standard deviation (n - 1 denominator) of ``numbers``.

    Args:
        numbers: Sized collection of numeric values.

    Returns:
        The standard deviation as a float.  With fewer than two values the
        n - 1 denominator is undefined, so 0.0 is returned instead of raising
        ``ZeroDivisionError``; ``t_prob`` already handles a zero deviation.
    """
    count = len(numbers)
    if count < 2:
        return 0.0
    # Same arithmetic mean as mean(), computed inline.
    avg = sum(numbers) / float(count)
    variance = sum((x - avg) ** 2 for x in numbers) / float(count - 1)
    return sqrt(variance)

In [35]:
if __name__ == "__main__":

    dataset = sys.argv[1]

    # NOTE(review): machine-specific absolute path; `dataset` above is not
    # used to locate the file.  Confirm the intended data location.
    column_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "decision"]
    car_raw = pd.read_csv("/Users/sandyavs/Documents/ML/Programming Project/data/car.data", names=column_names)

    # Preprocessing dataset: map every categorical level to an integer code.
    # A single non-inplace DataFrame.replace with a per-column mapping is
    # used because `df[col].replace(..., inplace=True)` is chained assignment
    # and does not update the frame under pandas Copy-on-Write.
    level_codes = {
        "buying": {"vhigh": 3, "high": 2, "med": 1, "low": 0},
        "maint": {"vhigh": 3, "high": 2, "med": 1, "low": 0},
        "doors": {"5more": 5},
        "persons": {"more": 4},
        "lug_boot": {"small": 0, "med": 1, "big": 2},
        "safety": {"low": 0, "med": 1, "high": 2},
        "decision": {"unacc": 1, "acc": 0, "good": 2, "vgood": 3},
    }
    car_dataset = car_raw.replace(level_codes).astype(int)

    decision = 6  # position of the "decision" (label) column
    n_runs = 10

    # Each run re-draws the random folds and yields one accuracy per fold.
    acc = []
    for _ in range(n_runs):
        acc.append(fit(car_dataset.values, naive_bayes_classifier, decision))

    # Fold accuracies of the final run.
    print('accuracy', acc[-1])
    # Spread over every fold of every run.
    std = np.std(acc)
    print("Naive Bayes Classification standard deviation", std)


accuracy [83.18840579710145, 79.42028985507247, 77.68115942028986, 79.71014492753623, 76.81159420289855]
Naive Bayes Classification standard deviation 2.353144807527193


In [None]:
python test_args.py my_input_file --int_param 12