# In [None]:  (Jupyter notebook cell marker kept as a comment so the file parses as Python)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from scipy.spatial import distance
from sklearn.decomposition import PCA
import matplotlib.cm as matpltlb
import matplotlib.pyplot as plot
from scipy.stats import norm

def text_file_retrieve(path):
    """Read a whitespace-delimited text file into a list of token lists.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list of
        whitespace-separated tokens on that line (an empty list for a blank
        line).
    """
    # The ``with`` statement closes the file on exit, so no explicit close()
    # is needed afterwards.
    with open(path) as f:
        return [line.split() for line in f]


def data_processing(data):
    """Encode nominal columns and min-max normalise the continuous features.

    A column is treated as nominal when its value in the first row is purely
    alphabetic (``str.isalpha``). Every nominal column is replaced, in place
    in ``data``, by the position of each value among that column's sorted
    unique values. All columns except the last one are then returned as a
    float matrix in which every non-nominal column is rescaled to [0, 1].

    Args:
        data: 2-D NumPy array of strings, one row per sample. The last column
            is excluded from the returned matrix. The array is modified in
            place for nominal columns.

    Returns:
        A tuple ``(datapts, nominal_col)`` where ``datapts`` is an
        ``np.matrix`` of floats with the feature columns and ``nominal_col``
        is the list of column indices that were detected as nominal.
    """
    datacol = data[0]
    nominal_col = []
    codes_by_col = {}
    print(datacol)
    for col in range(datacol.size):
        attr = datacol[col]
        if attr.isalpha():
            _, indices = np.unique(data[:, col], return_inverse=True)
            # ``np.float`` was removed from NumPy; the builtin ``float`` is
            # the same dtype.
            data[:, col] = indices.astype(float)
            codes_by_col[col] = indices
            nominal_col.append(col)

    features = data[:, :-1].astype(float)
    n_features = features.shape[1]
    for col, indices in codes_by_col.items():
        # Take the codes directly instead of relying on the round-trip through
        # ``data``, whose fixed-width string dtype may not hold the full code.
        if col < n_features:
            features[:, col] = indices

    for col in range(n_features):
        if col in codes_by_col:
            continue
        v = features[:, col]
        lowest = v.min()
        span = v.max() - lowest
        # Min-max normalisation; a constant column has zero span and is
        # mapped to 0 instead of producing NaN from 0/0.
        features[:, col] = (v - lowest) / span if span != 0 else 0.0

    return np.asmatrix(features), nominal_col



def cross_validation(datapts):
    """Partition the samples into ten consecutive folds.

    Args:
        datapts: Array-like of samples, split along its first axis.

    Returns:
        The list of ten sub-arrays produced by ``np.array_split``; fold sizes
        differ by at most one when the number of rows is not a multiple of 10.
    """
    return np.array_split(datapts, 10)



def getContinuousData(train,test_data,nominal_col,trainclasslabel,testclasslabel):
    """Drop the nominal columns and split the training rows by class label.

    Args:
        train: 2-D array of training samples.
        test_data: 2-D array of test samples.
        nominal_col: Column indices to remove from both arrays.
        trainclasslabel: Per-row class labels for ``train``.
        testclasslabel: Unused; accepted for a uniform call signature.

    Returns:
        ``(continuoustrain, continuoustest, train0, train1)``: the train and
        test arrays without the nominal columns, followed by the training rows
        labelled 0 and the training rows labelled 1.
    """
    continuoustrain, continuoustest = (
        np.delete(samples, nominal_col, axis=1) for samples in (train, test_data)
    )
    class0_rows = continuoustrain[trainclasslabel == 0.0]
    class1_rows = continuoustrain[trainclasslabel == 1.0]
    return continuoustrain, continuoustest, class0_rows, class1_rows


def Pdf(train0,train1,continuoustest):
    """Evaluate per-class Gaussian densities of the test features.

    The column-wise mean and standard deviation of each class's training rows
    parameterise a normal distribution, which is then evaluated at every test
    value.

    Args:
        train0: Training rows of class 0 (continuous features only).
        train1: Training rows of class 1 (continuous features only).
        continuoustest: Test rows (continuous features only).

    Returns:
        ``(mean, mean1, sd, sd1, continuous0, continuous1)``: class-0 and
        class-1 column means, class-0 and class-1 column standard deviations,
        and the density arrays of the test data under each class.
    """
    def _column_stats(samples):
        # Column-wise mean and (population) standard deviation.
        return np.mean(samples, axis=0), np.std(samples, axis=0)

    mean, sd = _column_stats(train0)
    mean1, sd1 = _column_stats(train1)
    continuous0 = norm.pdf(continuoustest, loc=mean, scale=sd)
    continuous1 = norm.pdf(continuoustest, loc=mean1, scale=sd1)
    return mean, mean1, sd, sd1, continuous0, continuous1



def findProbabilityfor01(ind,train0_nom,train1_nom,test_categoric_data):
    """Return the relative frequency of one test value within each class.

    Args:
        ind: Index of the test value to look up.
        train0_nom: NumPy array of the nominal values of class-0 training rows.
        train1_nom: NumPy array of the nominal values of class-1 training rows.
        test_categoric_data: Indexable collection of test values.

    Returns:
        ``(probgiven_0, probgiven_1)``: the fraction of entries equal to
        ``test_categoric_data[ind]`` in ``train0_nom`` and in ``train1_nom``.
    """
    value = test_categoric_data[ind]

    def _fraction_matching(column):
        # Number of entries equal to the test value over the column length.
        return len(column[column == value]) / len(column)

    return _fraction_matching(train0_nom), _fraction_matching(train1_nom)

def getTrainTestData(index,datasplit,gtsplit):
    """Build the train/test partition for one cross-validation fold.

    Args:
        index: Position of the fold held out as the test set.
        datasplit: Sequence of per-fold sample arrays.
        gtsplit: Sequence of per-fold label arrays, aligned with ``datasplit``.

    Returns:
        ``(train, trainclasslabel, test, testclasslabel)`` as plain ndarrays:
        all folds except ``index`` stacked together with their labels, then
        the held-out fold and its labels.
    """
    def _all_but_held_out(folds):
        # Every fold except the one reserved for testing, in original order.
        return [fold for position, fold in enumerate(folds) if position != index]

    train = np.asarray(np.vstack(_all_but_held_out(datasplit)))
    trainclasslabel = np.asarray(np.concatenate(_all_but_held_out(gtsplit)))
    test = np.asarray(datasplit[index])
    testclasslabel = np.asarray(gtsplit[index])
    return train, trainclasslabel, test, testclasslabel



def naivebayes_for_continuous(train, trainclasslabel, test, testclasslabel,nominal_col):
    """Compute the per-sample likelihood of the continuous features per class.

    The nominal columns are removed, a Gaussian is fitted per class and
    column, and the per-feature densities of each test row are multiplied
    together.

    Args:
        train: 2-D array of training samples.
        trainclasslabel: Class labels of the training samples.
        test: 2-D array of test samples.
        testclasslabel: Class labels of the test samples (forwarded only).
        nominal_col: Indices of the nominal columns to exclude.

    Returns:
        ``(continuous0list, continuous1list)``: two lists with one product of
        densities per test row, for class 0 and class 1 respectively.
    """
    _, continuoustest, train0, train1 = getContinuousData(
        train, test, nominal_col, trainclasslabel, testclasslabel
    )
    _, _, _, _, continuous0, continuous1 = Pdf(train0, train1, continuoustest)

    row_count = len(test)
    continuous0list = [np.prod(continuous0[row]) for row in range(row_count)]
    continuous1list = [np.prod(continuous1[row]) for row in range(row_count)]
    return continuous0list, continuous1list


def naivebayes_for_nominal(train, trainclasslabel, test, testclasslabel,nominal_col,continuous0list,continuous1list):
    """Compute the per-sample posterior factor of the nominal features.

    For every nominal column and every test row, the class-conditional
    frequency of the row's value is multiplied by the class prior and divided
    by the overall frequency of that value in the training data. The factors
    of all nominal columns are multiplied together per test row.

    A conditional frequency of exactly 0 or 1 is replaced by the add-one
    smoothed estimate ``(count + 1) / (class size + number of distinct
    values)``.

    NOTE(review): the class prior is multiplied in once per nominal column and
    not at all when ``nominal_col`` is empty -- confirm this is intended.

    Args:
        train: 2-D array of training samples.
        trainclasslabel: Class labels (0/1) of the training samples.
        test: 2-D array of test samples.
        testclasslabel: Unused.
        nominal_col: Indices of the nominal columns.
        continuous0list: Unused.
        continuous1list: Unused.

    Returns:
        ``(nom0, nom1)``: float arrays with one factor per test row for
        class 0 and class 1.
    """
    nom0 = np.ones(len(test))
    nom1 = np.ones(len(test))
    is_class0 = trainclasslabel == 0.0
    is_class1 = trainclasslabel == 1.0

    for col in nominal_col:
        train_nom = train[:, col]
        test_nom = test[:, col]
        train0_nom = train_nom[is_class0]
        train1_nom = train_nom[is_class1]
        total = len(train_nom)
        prob0 = len(train0_nom) / total
        prob1 = len(train1_nom) / total
        distinct_values = len(set(train_nom))

        for ind, value in enumerate(test_nom):
            count0 = np.count_nonzero(train0_nom == value)
            count1 = np.count_nonzero(train1_nom == value)
            probgiven_0 = count0 / len(train0_nom)
            probgiven_1 = count1 / len(train1_nom)

            # Smooth only the degenerate estimates (exactly 0 or exactly 1).
            if probgiven_0 in (0, 1):
                probgiven_0 = (count0 + 1) / (len(train0_nom) + distinct_values)
            if probgiven_1 in (0, 1):
                probgiven_1 = (count1 + 1) / (len(train1_nom) + distinct_values)

            div = np.count_nonzero(train_nom == value) / total
            nom0[ind] *= (probgiven_0 * prob0) / div
            nom1[ind] *= (probgiven_1 * prob1) / div

    return nom0, nom1



def naiveBayes_totalData(test,nom0,nom1,continuous0list,continuous1list):
    """Combine nominal and continuous factors and predict a class per sample.

    Args:
        test: Test samples; only its length is used.
        nom0: Per-sample nominal factor for class 0.
        nom1: Per-sample nominal factor for class 1.
        continuous0list: Per-sample continuous likelihood for class 0.
        continuous1list: Per-sample continuous likelihood for class 1.

    Returns:
        A list with one predicted label per test sample: 0 when the class-0
        score is strictly greater than the class-1 score, otherwise 1.
    """
    result = []
    p0 = p1 = None
    for i in range(len(test)):
        p0 = nom0[i] * continuous0list[i]
        p1 = nom1[i] * continuous1list[i]
        result.append(0 if p0 > p1 else 1)

    # Debug output of the scores of the last sample. The original statements
    # lacked the comma between label and value (a SyntaxError), and the scores
    # are undefined when there are no test samples.
    if result:
        print("P1:", p1)
        print("P0:", p0)
    return result



# Performance metrics: the functions below compute the confusion matrix,
# accuracy, precision, recall and F1 score.
# True Positive - tp
# True Negative - tn
# False Positive - fp
# False Negative - fn
def ConfusionMatrix(testclasslabel,result):
    """Build the confusion matrix of true labels versus predictions.

    Args:
        testclasslabel: True class labels, one per sample.
        result: Predicted class labels, aligned with ``testclasslabel``.

    Returns:
        A square integer array ``cm`` where ``cm[i, j]`` counts the samples
        whose true label is the i-th class and whose prediction is the j-th
        class. Classes are the sorted distinct values found in either input.
    """
    actual = np.asarray(testclasslabel).ravel()
    predicted = np.asarray(result).ravel()

    # Take the classes from both inputs so that a class that only shows up in
    # the predictions still gets its own row and column.
    classes = np.unique(np.concatenate((actual, predicted)))
    target = len(classes)

    # Map labels to 0..target-1 and encode each (actual, predicted) pair as
    # one flat cell index. Counting from cell 0 (rather than from the smallest
    # observed index) keeps every count in its proper cell even when some
    # cells, such as the true negatives, are empty.
    actual_idx = np.searchsorted(classes, actual)
    predicted_idx = np.searchsorted(classes, predicted)
    cells = actual_idx * target + predicted_idx
    counts = np.bincount(cells, minlength=target * target)
    return counts.reshape(target, target)
def Accuracy(testclasslabel,result):
    """Compute and print the fraction of predictions that match the labels.

    Args:
        testclasslabel: True class labels.
        result: Predicted class labels, aligned with ``testclasslabel``.

    Returns:
        The number of matching positions divided by the number of labels.
    """
    matches = sum(
        1 for position in range(len(result))
        if result[position] == testclasslabel[position]
    )
    accuracy = matches / float(len(testclasslabel))
    print("accuray:",accuracy)
    return accuracy

def Recall(testclasslabel,result):
    """Compute and print the macro-averaged recall.

    Per-class recall is the diagonal of the confusion matrix divided by the
    corresponding row sum (the number of samples that truly belong to the
    class); the per-class values are then averaged.

    Args:
        testclasslabel: True class labels.
        result: Predicted class labels.

    Returns:
        The mean of the per-class recall values.
    """
    cm = ConfusionMatrix(testclasslabel,result)
    actual_totals = np.sum(cm, axis=1).astype(float)
    # A class with no actual samples would give 0/0 = NaN and poison the
    # average; count its recall as 0 instead.
    per_class = np.divide(
        np.diag(cm),
        actual_totals,
        out=np.zeros_like(actual_totals),
        where=actual_totals != 0,
    )
    recall = np.mean(per_class)
    print("recall:",recall)
    return recall

def Precision(testclasslabel,result):
    """Compute and print the macro-averaged precision.

    Per-class precision is the diagonal of the confusion matrix divided by
    the corresponding column sum (the number of samples predicted as the
    class); the per-class values are then averaged.

    Args:
        testclasslabel: True class labels.
        result: Predicted class labels.

    Returns:
        The mean of the per-class precision values.
    """
    cm = ConfusionMatrix(testclasslabel,result)
    predicted_totals = np.sum(cm, axis=0).astype(float)
    # A class that is never predicted would give 0/0 = NaN and poison the
    # average; count its precision as 0 instead.
    per_class = np.divide(
        np.diag(cm),
        predicted_totals,
        out=np.zeros_like(predicted_totals),
        where=predicted_totals != 0,
    )
    precision = np.mean(per_class)
    print("precision:",precision)
    return precision

def fscore(prec,recall):
    """Compute and print the F1 score (harmonic mean of precision and recall).

    Args:
        prec: Precision value.
        recall: Recall value.

    Returns:
        ``2 * prec * recall / (prec + recall)``, or 0.0 when precision and
        recall sum to zero (the harmonic mean is undefined there and the
        division would otherwise fail).
    """
    denominator = prec + recall
    if denominator == 0:
        f1score = 0.0
    else:
        f1score = 2 * (prec * recall) / denominator
    print("f1score:",f1score)
    return f1score



def main():
    """Run 10-fold cross-validated naive Bayes on a user-supplied data file.

    Prompts for a file name, loads and preprocesses the data, and for every
    fold trains on the other folds, predicts the held-out fold and prints its
    accuracy, precision, recall and F1 score. The per-fold metrics are
    averaged over the folds and printed at the end.
    """
    folds = 10  # must match the number of folds produced by cross_validation

    text_file_name = input("Enter the filename: ")
    lines = text_file_retrieve(text_file_name)
    data = np.asarray(lines)
    # preprocess_data - every column except the last (class label) one as
    # floats; nominal_col - indices of the columns detected as nominal
    preprocess_data, nominal_col = data_processing(data)
    # groundtruth - contains class label column
    groundtruth = np.asarray(data[:,-1],dtype=int)
    gtsplit = np.array_split(groundtruth, folds)
    datasplit = cross_validation(preprocess_data)
    avg_accuracy = avg_precision = avg_recall = avg_f1score = 0
    for index in range(folds):
        train, trainclasslabel, test, testclasslabel = getTrainTestData(index,datasplit,gtsplit)

        # continuous data
        continuous0list,continuous1list = naivebayes_for_continuous(train, trainclasslabel, test, testclasslabel,nominal_col)
        # categorical data
        nom0,nom1 = naivebayes_for_nominal(train, trainclasslabel, test, testclasslabel,nominal_col,continuous0list,continuous1list)
        result = naiveBayes_totalData(test,nom0,nom1,continuous0list,continuous1list)
        print(result) #contains final result and is further used to evaluate performance

        # evaluating performance
        accuracy = Accuracy(testclasslabel, result)
        precision = Precision(testclasslabel, result)
        recall = Recall(testclasslabel, result)
        f1score = fscore(precision,recall)

        avg_accuracy += accuracy
        avg_precision += precision
        avg_recall += recall
        avg_f1score += f1score

    # The accumulators hold sums over the folds; divide by the fold count to
    # obtain averages (the original multiplied three of them by 10).
    print("average accuracy:",avg_accuracy / folds)
    print("average pecision:",avg_precision / folds)
    print("average recall:",avg_recall / folds)
    print("average f1score:",avg_f1score / folds)

# Run the experiment only when executed as a script (or notebook cell), not
# when the file is imported as a module.
if __name__ == "__main__":
    main()

# Notebook cell output: SyntaxError: ignored