In [4]:
import csv
import random
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

def prepareTargetData(data):
    """Return a copy of ``data`` keeping only the values at positions 2..11.

    Each output row is a new list of exactly ten values taken from the
    matching input row at indices 2, 3, ..., 11; the first two values of
    every input row are left out (presumably index/ID columns -- confirm
    against the CSV layout). The input rows are not modified.

    Raises:
        IndexError: if any row has fewer than twelve values.
    """
    first_kept = 2
    kept_count = 10
    return [
        [row[first_kept + offset] for offset in range(kept_count)]
        for row in data
    ]

def loadCsvData(filename):
    """Load the CSV at ``filename`` and return its rows as lists of floats.

    The first line of the file is treated as a header and discarded. In
    every remaining row the value at index 11 is mapped from the text
    labels ``'malignant'`` / ``'benign'`` to ``'1'`` / ``'0'`` (any other
    value is left as it is), after which every field of the row is
    converted with ``float``. The numeric rows are finally passed through
    ``prepareTargetData`` and its result is returned.

    An entirely empty file yields ``prepareTargetData([])``.

    Raises:
        IndexError: if a data row has fewer than twelve fields.
        ValueError: if a field cannot be converted to ``float``.
    """
    label_codes = {'malignant': '1', 'benign': '0'}

    # The context manager guarantees the file is closed even if parsing
    # fails; newline="" is what the csv module expects for correct handling
    # of quoted fields containing line breaks.
    with open(filename, "r", newline="") as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header line
        rows = list(reader)

    # Encode the class label (index 11) once, so the float conversion
    # below succeeds on that column too.
    for row in rows:
        row[11] = label_codes.get(row[11], row[11])

    data = [[float(field) for field in row] for row in rows]
    return prepareTargetData(data)


def train_test_split_byclass(data):
    """Group the rows of ``data`` by the value of their last element.

    Despite the name, no train/test split is performed here: the result is
    a dict mapping each distinct last-element value to the list of rows
    (the same row objects, in their original order) that end with it.
    """
    groups = {}
    for row in data:
        groups.setdefault(row[-1], []).append(row)
    return groups

def summarize(data):
    """Return ``(mean, std)`` for every column of ``data`` except the last.

    ``data`` is a sequence of equal-length rows. The statistics are
    computed column by column with ``np.mean`` and ``np.std`` (default
    settings); the pair belonging to the final column is then discarded.

    Raises:
        IndexError: if ``data`` has no columns (e.g. it is empty).
    """
    stats = []
    for column in zip(*data):
        stats.append((np.mean(column), np.std(column)))
    # The last column's statistics are not part of the summary.
    stats.pop()
    return stats


def class_summary(data):
    """Build per-class column statistics for ``data``.

    The rows are grouped with ``train_test_split_byclass`` and each group
    is passed to ``summarize``; the result maps every class value to the
    summary list produced for its rows.
    """
    return {
        label: summarize(rows)
        for label, rows in train_test_split_byclass(data).items()
    }


def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian density with the given ``mean``/``stdev`` at ``x``.

    Args:
        x: the value at which to evaluate the density.
        mean: mean of the distribution.
        stdev: standard deviation of the distribution.

    Returns:
        The normal probability density at ``x``. When the variance is zero
        (``stdev`` is 0, or so small that its square underflows to 0) the
        distribution is treated as a point mass at ``mean``: 1.0 is
        returned if ``x == mean`` and 0.0 otherwise.
    """
    variance = math.pow(stdev, 2)
    if variance == 0:
        # A feature that is constant within a class has zero spread; the
        # regular formula would divide by zero here.
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * variance)))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent


def getClassProbabilities(summaries, inputData):
    """Score ``inputData`` against every class in ``summaries``.

    ``summaries`` maps a class value to a list of ``(mean, stdev)`` pairs.
    For each class the score starts at 1 and is multiplied by
    ``calculateProbability(inputData[k], mean, stdev)`` for the k-th pair;
    no class prior is applied. Elements of ``inputData`` beyond the number
    of pairs are ignored.

    Returns:
        A dict mapping each class value to its score.
    """
    scores = {}
    for label, feature_stats in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(feature_stats):
            likelihood *= calculateProbability(inputData[position], mu, sigma)
        scores[label] = likelihood
    return scores


def predict(summaries, inputData):
    """Return the class value whose score for ``inputData`` is highest.

    Scores come from ``getClassProbabilities``. When several classes share
    the highest score the one encountered first wins; ``None`` is returned
    when ``summaries`` contains no classes.
    """
    scores = getClassProbabilities(summaries, inputData)
    return max(scores, key=scores.get, default=None)


def getNVPredictionValues(summaries, testData):
    """Return the result of ``predict`` for every row of ``testData``, in order."""
    return [predict(summaries, row) for row in testData]


def accuracy_score(testSet, predictions):
    """Return the fraction of rows whose last element equals its prediction.

    ``predictions[k]`` is compared with the last element of ``testSet[k]``.
    The result is a float between 0 and 1.

    Note: this definition replaces the ``accuracy_score`` name imported
    from ``sklearn.metrics`` at the top of the file.

    Raises:
        ZeroDivisionError: if ``testSet`` is empty.
        IndexError: if ``predictions`` is shorter than ``testSet``.
    """
    hits = sum(
        1
        for position, row in enumerate(testSet)
        if row[-1] == predictions[position]
    )
    return hits / float(len(testSet))


def main(filename='C:/Users/subar/Downloads/CMPE-255 Sec 99 - Data Mining/Home Works/MyData.csv'):
    """Train and evaluate the Naive Bayes model on the CSV at ``filename``.

    The data is loaded with ``loadCsvData``, split 70/30 into train and
    test rows with ``train_test_split`` (fixed ``random_state=100``), the
    per-class summaries are built from the training rows, and the accuracy
    on the test rows is printed as a percentage.

    Args:
        filename: path of the CSV file to load. Defaults to the original
            author's local path; pass your own path instead of editing
            the source.
    """
    data = loadCsvData(filename)
    trainData, testData = train_test_split(data, train_size=0.7, random_state=100)
    print("Split {0} rows into train={1} and test={2} rows".format(len(data), len(trainData), len(testData)))

    # Prepare the naive Bayes model: per-class (mean, stdev) of each feature.
    summaries = class_summary(trainData)

    # Predict a class for every test row.
    predictions = getNVPredictionValues(summaries, testData)

    # Report the share of correctly classified test rows.
    print('Accuracy with Naive Bayes Model:', (accuracy_score(testData, predictions) * 100), "%")

# Run the experiment only when this file is executed directly (this is also
# true inside a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Split 683 rows into train=478 and test=205 rows
Accuracy with Naive Bayes Model: 96.09756097560975 %
