In [1]:
import re
import math

In [2]:
def getwords(doc):
    '''
        Extract the distinct words of a document as a feature dictionary.

        The text is split on runs of non-word characters; tokens longer
        than 2 and shorter than 20 characters are kept and lower-cased.

        doc: the string to tokenize.
        Returns a dict mapping each unique word to 1.
    '''
    # Use '\W+' (one or more), not '\W*': a pattern that can match the empty
    # string makes re.split break between every character on Python 3.7+,
    # so no token would ever pass the length filter below.
    splitter = re.compile(r'\W+')
    words = [s.lower() for s in splitter.split(doc) if 2 < len(s) < 20]

    # dict keys are unique, so repeated words collapse to a single entry
    return {w: 1 for w in words}

In [3]:
class classifier:
    '''
        Keeps the counts needed to train a classifier: how often each feature
        was seen in each category, and how many items each category received.

        getfeatures: callable that takes an item and returns an iterable of
                     its features.
        filename:    accepted by the constructor but not used by this class.
    '''
    def __init__(self,getfeatures,filename=None):
        # counts of feature/category combinations: {feature: {category: count}}
        self.fc={}
        # counts of documents in each category: {category: count}
        self.cc={}
        self.getfeatures = getfeatures

    def incf(self,f,cat):
        '''Increase the count of a feature/category pair by one.'''
        self.fc.setdefault(f,{})
        self.fc[f].setdefault(cat,0)
        self.fc[f][cat]+=1

    def incc(self,cat):
        '''Increase the count of a category by one.'''
        self.cc.setdefault(cat,0)
        self.cc[cat]+=1

    def fcount(self,f,cat):
        '''The number of times a feature has appeared in a category (float).

           Returns 0.0 when the feature has never been seen in that category.
        '''
        # fc is nested as {feature: {category: count}}, so the category must
        # be looked up inside the feature's own dict.
        if f in self.fc and cat in self.fc[f]:
            return float(self.fc[f][cat])
        return 0.0

    def catcount(self,cat):
        '''The number of items in a category (float); 0.0 if unknown.'''
        if cat in self.cc:
            return float(self.cc[cat])
        return 0.0

    def totalcount(self):
        '''The total number of items across all categories.'''
        return sum(self.cc.values())

    def categories(self):
        '''All categories that have received at least one item.'''
        return self.cc.keys()

    def train(self,item,cat):
        '''Record one training item for category cat.

           Every feature returned by getfeatures(item) has its count for cat
           incremented, then the category's own item count is incremented.
        '''
        features = self.getfeatures(item)
        # increment count of every feature in this category
        for f in features:
            self.incf(f,cat)

        # increment count of this category
        self.incc(cat)

In [4]:
# Instantiate the base classifier with getwords as its feature extractor.
c1 = classifier(getwords)

# Calculating the probabilities.

In [6]:
# Calculating Probabilities
def fprob(self,f,cat):
    '''Count of feature f in category cat divided by the number of items
       in cat.

       Returns 0 when the category holds no items.
    '''
    items_in_cat = self.catcount(cat)
    # only divide when the category is non-empty
    if items_in_cat != 0:
        return self.fcount(f,cat)/items_in_cat
    return 0
# Attach fprob to classifier as a method so instances can call self.fprob(f, cat).
classifier.fprob = fprob

def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
    '''Weighted average of a feature's basic probability and an assumed one.

       f, cat is the feature/category pair being scored
       prf    is the function used to calculate the basic probability; it is
              called as prf(f, cat)
       weight is the weight given to the assumed probability
       ap     is the assumed probability

       Returns ((weight*ap) + (totals*basicprob)) / (weight + totals), where
       totals is the feature's count summed over every category. If
       weight + totals is zero, ap is returned instead of dividing by zero.
    '''
    # Calculate current probability
    basicprob = prf(f,cat)

    # count the number of times this feature has appeared in all categories
    totals = sum(self.fcount(f,c) for c in self.categories())

    denominator = weight + totals
    if denominator == 0:
        # no observations and no weight on the assumption: nothing to average
        return ap

    # calculate the weighted average
    return ((weight*ap) + (totals*basicprob)) / denominator

# Attach weightedprob to classifier as a method; without this binding any
# call to self.weightedprob(...) on an instance raises AttributeError.
classifier.weightedprob = weightedprob

# Naive Bayes classifier

In [8]:
# creating a subclass classifier
class naivebayes(classifier):
    '''
        Naive Bayes classifier on top of the counts kept by classifier.

        An item's score for a category is the product of the weighted
        probabilities of its features multiplied by the category's share of
        all trained items. Per-category thresholds control how far the best
        category must lead the others before classify() commits to it.
    '''
    def __init__(self,getfeatures):
        classifier.__init__(self,getfeatures)
        # per-category thresholds used by classify(): {category: threshold}
        self.thresholds={}

    def docprob(self,item,cat):
        '''Product of self.weightedprob(f, cat, self.fprob) over every
           feature of item; 1 when the item has no features.
        '''
        features = self.getfeatures(item)

        # Since each word is assumed independent multiply all probabilities
        p=1
        for f in features:
            p*=self.weightedprob(f,cat,self.fprob)
        return p

    def prob(self,item,cat):
        '''docprob(item, cat) multiplied by catcount(cat)/totalcount().

           Returns 0.0 when nothing has been trained yet.
        '''
        total = self.totalcount()
        if total == 0:
            # untrained classifier: avoid dividing by zero
            return 0.0
        catprob = self.catcount(cat)/total
        docprob = self.docprob(item,cat)
        return docprob*catprob

    def setthreshold(self,cat,t):
        '''Set the threshold for category cat to t.'''
        self.thresholds[cat] = t

    def getthreshold(self,cat):
        '''Threshold for category cat; 1.0 when none has been set.'''
        return self.thresholds.get(cat,1.0)

    def classify(self,item,default=None):
        '''Return the category with the highest prob() for item.

           default is returned when no category scores above zero, or when
           some other category's probability times the best category's
           threshold exceeds the best probability.
        '''
        probs={}
        # Find the category with highest probability
        best = None
        tmax = 0.0
        for cat in self.categories():
            probs[cat] = self.prob(item,cat)
            if probs[cat] > tmax:
                tmax = probs[cat]
                best = cat

        # No categories trained, or every probability was zero
        if best is None:
            return default

        # Make sure the probability exceeds threshold*next best
        for cat in probs:
            if cat == best: continue
            if probs[cat]*self.getthreshold(best)>probs[best]: return default
        return best