In [334]:
import arff_parser
import pandas as pd
import numpy as np
from collections import defaultdict

In [335]:
train_parser = arff_parser.ARFF_Parser()
test_parser = arff_parser.ARFF_Parser()

In [336]:
train_parser.parse("lymph_train.arff")
test_parser.parse("lymph_test.arff")
train_data = pd.DataFrame(train_parser.data)
test_data = pd.DataFrame(test_parser.data)

In [337]:
from collections import deque
 
def topsort(graph):

    in_degree = { u : 0 for u in graph }     # determine in-degree 
    for u in graph:                          # of each node
        for v in graph[u]:
            in_degree[v] += 1
 
    Q = deque()                 # collect nodes with zero in-degree
    for u in in_degree:
        if in_degree[u] == 0:
            Q.appendleft(u)
 
    L = []     # list for order of nodes
     
    while Q:                
        u = Q.pop()          # choose node of zero in-degree
        L.append(u)          # and 'remove' it from graph
        for v in graph[u]:
            in_degree[v] -= 1
            if in_degree[v] == 0:
                Q.appendleft(v)
 
    if len(L) == len(graph):
        return L
    else:                    # if there is a cycle,  
        return []            # then return an empty list


In [338]:
tuple([1,2,3])

(1, 2, 3)

In [368]:
from collections import OrderedDict
import itertools
from collections import defaultdict
import numpy as np
from collections import OrderedDict
import itertools
import pandas as pd
class NaiveBayes(object):
    def __init__(self,train_parser,classification_type="n"):
        self.laplace_class_counts = {}
        self.prior_probabilities={}
        self.conditional_probabilities = {}
        self.joint_probabilities = {}
        self.conditional_laplacian_counts = {}
        self.dependent_attribute =None
        self.dependent_attribute_values = None
        self.root_node = None
        self.feature_attributes =None
        self.conditional_mutual_info = defaultdict()
        self.graph=OrderedDict()
        self.tan_conditional_counts ={}
        self.tan_conditional_probabilities ={}
        self.classification_type =classification_type
        self.train_parser = train_parser

            
    def calculate_conditional_mutual_information(self):
        needed_columns =self.feature_attributes 
        for index_1 in xrange(0,len(needed_columns)) :
            for index_2 in xrange(0,len(needed_columns)) :

                    column_1= needed_columns[index_1]
                    column_2= needed_columns[index_2]

                    self.conditional_mutual_info[(column_1,column_2)]=0
                    for col_val_1 in self.train_parser.attribute_val_map[column_1]:
                        for col_val_2 in self.train_parser.attribute_val_map[column_2]:
                            for class_val in self.train_parser.attribute_val_map[self.dependent_attribute]:    
                                self.conditional_mutual_info[(column_1,column_2)] +=\
                                self.joint_probabilities[(column_1,column_2)][(col_val_1,col_val_2,class_val)]*\
                                np.log2(\
                                float(self.conditional_probabilities[(column_1,column_2)][(col_val_1,col_val_2,class_val)])/\
                                (self.conditional_probabilities[column_1][(col_val_1,class_val)]*self.conditional_probabilities[column_2][(col_val_2,class_val)])
                                )

                
    def prims(self):
        costs = [0*len(self.feature_attributes)]
        nodes = self.feature_attributes
        self.root_node = nodes[0]

        queue=nodes[1 :]
        #print "***"
        #print queue
        #print self.root_node
        #print "***"
        mst_edges =[]
        mst_edge_indices =[]
        nodes_in_mst=[self.root_node]
        while (len(queue)>0):
            max_wt=-1000
            max_to_node =None
            max_from_node = None
            max_index =0
            for node_1 in nodes_in_mst:
                for index,node_2 in enumerate(queue):

                    if self.conditional_mutual_info[(node_1,node_2)]>max_wt:
                        max_wt = self.conditional_mutual_info[(node_1,node_2)]
                        max_to_node = node_2
                        max_from_node = node_1
                        max_index = index
            nodes_in_mst.append(max_to_node)
            mst_edges.append((max_from_node,max_to_node))
            mst_edge_indices.append((nodes.index(max_from_node),\
                                   nodes.index(max_to_node)))
            
            queue.remove(max_to_node)
        return {"vertices":nodes_in_mst,"edges":mst_edges}
        
        

    def calculate_tan_conditional_probabilities(self):
        train_data = pd.DataFrame(self.train_parser.data)
        for node in self.graph:
            column_class_count_dict = \
            train_data.groupby([node]+self.graph[node]).size().to_dict()
            column_values =[]
            for column in [node]+self.graph[node]:
                column_values.append(self.train_parser.attribute_val_map[column])
            keys = list(itertools.product(*column_values))
            for key in keys:
                if key in column_class_count_dict:
                    column_class_count_dict[key]+=1
                else:
                    column_class_count_dict[key]=1
            final_key = tuple([node]+self.graph[node])
            
            self.tan_conditional_counts[final_key] = column_class_count_dict
  
        for node in self.graph: 
            final_key = tuple([node]+self.graph[node])
            column_values=[]

            for column in self.graph[node]:
                column_values.append(self.train_parser.attribute_val_map[column])
            keys = list(itertools.product(*column_values))

            conditional_probability_table=defaultdict()
            for key in keys:
                total=0
                
                for val in self.train_parser.attribute_val_map[node]:
                    new_key =(val,)+key
                    total = total+\
                             self.tan_conditional_counts[final_key][new_key]
                        
                for val in self.train_parser.attribute_val_map[node]: 
                    new_key =(val,)+key
                    conditional_probability_table[new_key]=\
                    self.tan_conditional_counts[final_key][new_key]/float(total)
                    self.tan_conditional_probabilities[final_key]=conditional_probability_table
            
            
    def fit(self):

        self.dependent_attribute =self.train_parser.dependent_attribute
        self.feature_attributes = [attr \
                                   for attr in self.train_parser.attributes if attr != self.dependent_attribute]

        #Normal Conditional probabilities
        self.calculate_probabilities(train_data)    
        
        if self.classification_type == "t":
            self.calculate_conditional_mutual_information(self.train_parser)
            result = self.prims()
            edges = result["edges"]
            vertices = result["vertices"]
            adjacency_matrix =defaultdict(list)
            for edge in edges:
                adjacency_matrix[edge[0]].append(edge[1])
            for vertex in vertices: 
                if vertex not in adjacency_matrix:
                    adjacency_matrix[vertex]=[]
   
            
            #Creating the graph of parent relationships
            parental_graph = defaultdict(list)
            for edge in edges:
                parental_graph[edge[1]].append(edge[0])
            for vertex in vertices: 
                if vertex not in parental_graph:
                    parental_graph[vertex]=[]  
            for node in self.feature_attributes:
                self.graph[node] =parental_graph[node]
                self.graph[node].append(self.dependent_attribute)
            for node in self.graph:    
                print " ".join([node]+self.graph[node]) 
            self.calculate_tan_conditional_probabilities(self.train_parser)    
            
        else:
            for attr in self.feature_attributes:
                print " ".join([attr, self.dependent_attribute])
            
                
                   
    def calculate_probabilities(self,training_data):
        
        
        train_data =pd.DataFrame(training_data)
        self.dependent_attribute = self.train_parser.dependent_attribute
        train_sample_size = train_data.shape[0]
        self.dependent_attribute_values = self.train_parser.attribute_val_map[self.dependent_attribute]
        for values in train_data[self.dependent_attribute].unique():
            self.laplace_class_counts[values] =\
                train_data.groupby(self.dependent_attribute).size()[values]+1
        for values in train_data[self.dependent_attribute].unique():    
            self.prior_probabilities[values] =\
                float(self.laplace_class_counts[values])/\
                sum(self.laplace_class_counts.values())

                
        #Conditional laplacian counts
        for column in self.train_parser.attributes :
            #if column !=dependent_attribute:
                column_class_count_dict = train_data.groupby([column,self.dependent_attribute]).size().to_dict()
                for col_val in self.train_parser.attribute_val_map[column]:
                    for class_val in self.train_parser.attribute_val_map[self.dependent_attribute]:
                        if (col_val,class_val) not in column_class_count_dict:
                            column_class_count_dict [(col_val,class_val)]=1
                        else:
                            column_class_count_dict [(col_val,class_val)]+=1         

                self.conditional_laplacian_counts[column]=column_class_count_dict
        
        #Conditional laplacian counts for pairs of  attributes
        for column_1 in self.train_parser.attributes[:-1] :
            for column_2 in self.train_parser.attributes[:-1] :
                column_class_count_dict = train_data.groupby([column_1,column_2,self.dependent_attribute]).size().to_dict()
                for col_val_1 in self.train_parser.attribute_val_map[column_1]:
                    for col_val_2 in self.train_parser.attribute_val_map[column_2]:
                        for class_val in self.train_parser.attribute_val_map[self.dependent_attribute]:
                                if (col_val_1,col_val_2,class_val) not in column_class_count_dict:
                                    column_class_count_dict [(col_val_1,col_val_2,class_val)]=1
                                else:
                                    column_class_count_dict [(col_val_1,col_val_2,class_val)]+=1         

                self.conditional_laplacian_counts[(column_1,column_2)]=column_class_count_dict        
        #Conditional probabilities for  one attribute
        for column in self.train_parser.attributes :
            
                total = defaultdict()
                for class_val in self.train_parser.attribute_val_map[self.dependent_attribute]:
                    for col_val in self.train_parser.attribute_val_map[column]:
                        total[class_val] = total.get(class_val,0)+\
                                            self.conditional_laplacian_counts[column][(col_val,class_val)]
                conditional_probability_dict=defaultdict()
                for class_val in self.train_parser.attribute_val_map[self.dependent_attribute]:
                    for col_val in self.train_parser.attribute_val_map[column]:
                        conditional_probability_dict[(col_val,class_val)] = \
                            float(self.conditional_laplacian_counts[column][(col_val,class_val)])/total[class_val]
                self.conditional_probabilities[column]=conditional_probability_dict            
                
        #conditional probability distribution for pairs of attrinbutes
        needed_columns = self.feature_attributes
        for index_1 in xrange(0,len(needed_columns)) :
            for index_2 in xrange(0,len(needed_columns)) :

                    column_1= needed_columns[index_1]
                    column_2= needed_columns[index_2]
                    total= defaultdict()
                    for class_val in self.train_parser.attribute_val_map[self.dependent_attribute]:
                        total[class_val]=0
                        for col_val_1 in self.train_parser.attribute_val_map[column_1]:
                            for col_val_2 in self.train_parser.attribute_val_map[column_2]:
                                
                                total[class_val] = total.get(class_val,0)+\
                                                self.conditional_laplacian_counts[(column_1,column_2)][(col_val_1,col_val_2,class_val)] 
                    
                    conditional_probability_dict=defaultdict()
                    for class_val in self.train_parser.attribute_val_map[self.dependent_attribute]:
                        for col_val_1 in self.train_parser.attribute_val_map[column_1]:
                            for col_val_2 in self.train_parser.attribute_val_map[column_2]:
                                conditional_probability_dict[(col_val_1,col_val_2,class_val)] = \
                                    float(self.conditional_laplacian_counts[(column_1,column_2)][(col_val_1,col_val_2,class_val)])/total[class_val]
                    self.conditional_probabilities[(column_1,column_2)]= conditional_probability_dict  

                
        #joint probability distribution for pairs of attrinbutes

        needed_columns = self.feature_attributes
        for index_1 in xrange(0,len(needed_columns)) :
            for index_2 in xrange(0,len(needed_columns)) :

                    column_1= needed_columns[index_1]
                    column_2= needed_columns[index_2]
                    total = train_data.shape[0]+\
                    (len(self.train_parser.attribute_val_map[column_1])*\
                     len(self.train_parser.attribute_val_map[column_2])*\
                     len(self.train_parser.attribute_val_map[self.dependent_attribute])
                    )

                    joint_probability_dict=defaultdict()
                    for class_val in self.train_parser.attribute_val_map[self.dependent_attribute]:
                        for col_val_1 in self.train_parser.attribute_val_map[column_1]:
                            for col_val_2 in self.train_parser.attribute_val_map[column_2]:
                                joint_probability_dict[(col_val_1,col_val_2,class_val)] = \
                                    float(self.conditional_laplacian_counts[(column_1,column_2)][(col_val_1,col_val_2,class_val)])/total
                    self.joint_probabilities[(column_1,column_2)]= joint_probability_dict  
        
    def predict(self,test_data):
        print ""
        correctly_classified_instances = 0        
        if self.classification_type =='n':
            for data in test_data:
                final_score=defaultdict()

                for class_val in self.dependent_attribute_values:
                    final_score[class_val] = self.prior_probabilities[class_val]
                    for index in xrange(0,len(self.feature_attributes)):
                        attr_name = self.feature_attributes[index]
                        attr_value = data[index]
                        final_score[class_val] = final_score[class_val]*\
                                                 self.conditional_probabilities[attr_name][(attr_value,class_val)]
                max_score = None             
                final_class = None
                classes = final_score.keys()
                scores = final_score.values()
                max_score = max(scores)
                #print scores
                final_class = classes[scores.index(max_score)]
                if final_class == data[-1]:
                    correctly_classified_instances +=1
                print "{0} {1} {2:.12f}".format(final_class,data[-1],float(max_score)/sum(final_score.values()))
            print "\n{0}".format(correctly_classified_instances)    
        else:            
            for data in test_data:
                final_score=defaultdict()
                for class_val in self.dependent_attribute_values:
                    final_score[class_val] = self.prior_probabilities[class_val]
                for node,parents in self.graph.iteritems():
                    final_key =  tuple([node]+parents)
                    cpt = self.tan_conditional_probabilities[final_key]
                    key=[]
                    #exclude the class_atrtribute
                    for column in [node]+parents[:-1]:
                        attr_index = self.feature_attributes.index(column)
                        attr_value = data[attr_index]
                        key.append(attr_value)

                    for class_val in self.dependent_attribute_values:
                        #Modify the key
                        new_key =tuple(key)+(class_val,)
                        final_score[class_val]=final_score[class_val]*cpt[new_key]
                max_score = None             
                final_class = None
                classes = final_score.keys()
                scores = final_score.values()
                max_score = max(scores)
                #print scores
                final_class = classes[scores.index(max_score)]
                if final_class == data[-1]:
                    correctly_classified_instances +=1
                print "{0} {1} {2:.12f}".format(final_class,data[-1],float(max_score)/sum(final_score.values()))
            print "{0}\n".format(correctly_classified_instances)    

In [369]:
nb_classifier = NaiveBayes(train_parser,sys.argv[3])
nb_classifier.fit(train_data)
nb_classifier.predict(test_parser.data)

TypeError: __init__() takes at least 2 arguments (1 given)

In [367]:
#Crossvalidation
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10)
for train_index, test_index in skf.split(test_parser.data , [1 for i in xrange(0,test_parser.data.shape[0])]):
    train_index, test_index

[ 5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
 30 31 32 33 34 35 36 37 38 39 40 41] [0 1 2 3 4]
[ 0  1  2  3  4 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
 30 31 32 33 34 35 36 37 38 39 40 41] [5 6 7 8 9]
[ 0  1  2  3  4  5  6  7  8  9 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
 29 30 31 32 33 34 35 36 37 38 39 40 41] [10 11 12 13]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 18 19 20 21 22 23 24 25 26 27 28
 29 30 31 32 33 34 35 36 37 38 39 40 41] [14 15 16 17]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 22 23 24 25 26 27 28
 29 30 31 32 33 34 35 36 37 38 39 40 41] [18 19 20 21]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 26 27 28
 29 30 31 32 33 34 35 36 37 38 39 40 41] [22 23 24 25]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 30 31 32 33 34 35 36 37 38 39 40 41] [26 27 28 29]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 3

[42]

IndexError: too many indices for array