In [None]:
from pysqlite2 import dbapi2 as sqlite
import re
import math

In [None]:
def getwords(doc):
    """Return the distinct lowercase words of *doc* as a ``{word: 1}`` dict.

    The text is split on runs of non-word characters; only pieces longer
    than 2 and shorter than 20 characters are kept.
    """
    # '\W+' instead of '\W*': a pattern that can match the empty string makes
    # re.split cut between every single character on Python 3.7+, which
    # would leave no word that passes the length filter.
    splitter = re.compile('\\W+')

    # Split the words by non-alpha characters
    words = [s.lower() for s in splitter.split(doc) if 2 < len(s) < 20]

    # Return the unique set of words only
    return dict([(w,1) for w in words])

In [None]:
def sampletrain(cl):
    """Train *cl* on a small fixed set of labelled example sentences.

    cl -- any object exposing ``train(item, category)``
    """
    examples = (
        ('Nobody owns the water.','good'),
        ('the quick rabbit jumps fences','good'),
        ('buy pharmaceuticals now','bad'),
        ('make quick money at the online casino','bad'),
        ('the quick brown fox jumps','good'),
    )
    # Order is kept identical to the listing above
    for text,label in examples:
        cl.train(text,label)

# 分类器基类

In [None]:
class classifier:
    """Base class that keeps the feature/category counts used for training.

    Counts live in in-memory dictionaries while ``dbfile`` is None.  Once a
    database has been set they live in two SQLite tables instead:

    * ``fc(feature, category, count)`` -- times a feature was seen in a category
    * ``cc(category, count)``          -- number of items trained per category
    """

    def __init__(self,getfeatures,filename=None):
        """Create a classifier.

        getfeatures -- callable mapping an item to an iterable of features
        filename    -- optional SQLite database file; when given it is opened
                       immediately through setdb()
        """
        # Counts of feature/category combinations
        self.fc = {}
        # Counts of documents in each category
        self.cc = {}
        self.getfeatures = getfeatures
        # dbfile None selects the in-memory dictionaries above
        self.dbfile = None
        self.con = None
        # Open the connection now; storing only the name would leave
        # self.con unset and every later count lookup would fail.
        if filename is not None:
            self.setdb(filename)

    def setdb(self,dbfile):
        """Store counts in the SQLite database *dbfile*, creating the tables if absent."""
        self.con = sqlite.connect(dbfile)
        self.con.execute('create table if not exists fc(feature,category,count)')
        self.con.execute('create table if not exists cc(category,count)')
        self.dbfile = dbfile

    def incf(self,f,cat):
        """Increase the count of feature *f* in category *cat* by one."""
        if self.dbfile is None:
            per_cat = self.fc.setdefault(f,{})
            per_cat[cat] = per_cat.get(cat,0) + 1
        else:
            count = self.fcount(f,cat)
            # Placeholders keep features containing quotes from breaking the SQL
            if count == 0:
                self.con.execute('insert into fc values (?,?,1)',(f,cat))
            else:
                self.con.execute('update fc set count=? where feature=? and category=?',
                                 (int(count)+1,f,cat))

    def incc(self,cat):
        """Increase the number of items trained for category *cat* by one."""
        if self.dbfile is None:
            self.cc[cat] = self.cc.get(cat,0) + 1
        else:
            count = self.catcount(cat)
            if count == 0:
                self.con.execute('insert into cc values (?,1)',(cat,))
            else:
                self.con.execute('update cc set count=? where category=?',
                                 (int(count)+1,cat))

    def fcount(self,f,cat):
        """Return, as a float, how often feature *f* was seen in category *cat*."""
        if self.dbfile is None:
            if f in self.fc and cat in self.fc[f]:
                return float(self.fc[f][cat])
            return 0.0
        res = self.con.execute('select count from fc where feature=? and category=?',
                               (f,cat)).fetchone()
        if res is None: return 0.0
        return float(res[0])

    def catcount(self,cat):
        """Return, as a float, the number of items trained for category *cat*."""
        if self.dbfile is None:
            if cat in self.cc:
                return float(self.cc[cat])
            return 0.0
        res = self.con.execute('select count from cc where category=?',(cat,)).fetchone()
        if res is None: return 0.0
        return float(res[0])

    def totalcount(self):
        """Return the total number of trained items over all categories."""
        if self.dbfile is None:
            return sum(self.cc.values())
        res = self.con.execute('select sum(count) from cc').fetchone()
        # sum() over an empty table yields one row holding NULL, not "no row"
        if res is None or res[0] is None: return 0
        return res[0]

    def categories(self):
        """Return the list of all categories seen so far."""
        if self.dbfile is None:
            return list(self.cc.keys())
        cur = self.con.execute('select category from cc')
        return [d[0] for d in cur]

    def train(self,item,cat):
        """Record *item* as an example of category *cat*."""
        features = self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incf(f,cat)

        # Increment the count for this category
        self.incc(cat)

        # Persist the new counts when a database is in use
        if self.dbfile is not None:
            self.con.commit()

    def fprob(self,f,cat):
        """Return the fraction of items in *cat* that contain feature *f* (0 if *cat* is empty)."""
        in_cat = self.catcount(cat)
        if in_cat == 0: return 0

        # The total number of times this feature appeared in this
        # category divided by the total number of items in this category
        return self.fcount(f,cat)/in_cat

    def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
        """Blend ``prf(f, cat)`` with an assumed probability.

        prf    -- callable ``(feature, category) -> probability``
        weight -- how many observations the assumed probability counts for
        ap     -- the assumed probability used when little data is available
        """
        # Calculate current probability
        basicprob = prf(f,cat)

        # Count the number of times this feature has appeared in
        # all categories
        totals = sum([self.fcount(f,c) for c in self.categories()])

        # Calculate the weighted average
        bp = ((weight * ap) + (totals * basicprob))/(weight + totals)
        return bp

In [None]:
# Build a classifier that keeps its counts in memory (no database file)
# and uses getwords as the feature extractor.
cl = classifier(getwords)
cl

In [None]:
# Train one document labelled 'good'
cl.train('the quick brown fox jump over the lazy dog','good')

In [None]:
# Train one document labelled 'bad'
cl.train('make quick money in the on1ine casio ','bad')

In [None]:
# How many 'good' documents contained the word 'quick'
cl.fcount('quick' , 'good')

In [None]:
# Number of documents trained per category
cl.catcount('good')

In [None]:
cl.catcount('bad')

In [None]:
cl.catcount('good')

In [None]:
# Number of documents trained over all categories
cl.totalcount()

In [None]:
cl.fcount('the' , 'good')

In [None]:
# Raw feature -> {category: count} dictionary
cl.fc

In [None]:
# Add the five sample documents on top of the two trained above
sampletrain(cl)

In [None]:
# Fraction of 'good' documents that contain 'quick'
cl.fprob('quick','good')

In [None]:
# Same kind of probability, blended with the assumed probability
# (defaults weight=1.0, ap=0.5)
cl.weightedprob('money','good',cl.fprob)

In [None]:
cl.fprob('money','good')

In [None]:
cl.weightedprob('money','good',cl.fprob)

In [None]:
cl.fprob('money','bad')

In [None]:
cl.weightedprob('money','bad',cl.fprob)

# 朴素贝叶斯

In [None]:
class naivebayes(classifier):
    """Naive Bayes classifier on top of the counts kept by ``classifier``."""

    def __init__(self,getfeatures):
        """Create the classifier; *getfeatures* maps an item to its features."""
        classifier.__init__(self,getfeatures)
        # category -> multiplier consulted by classify(); see getthreshold()
        self.thresholds = {}

    def docprob(self,item,cat):
        """Return the product of the weighted probabilities of *item*'s features for *cat*."""
        features = self.getfeatures(item)

        # Multiply the probabilities of all the features together
        p = 1
        for f in features: p *= self.weightedprob(f,cat,self.fprob)
        return p

    def prob(self,item,cat):
        """Return docprob(item, cat) scaled by the share of training items in *cat*.

        Returns 0.0 when nothing has been trained yet instead of dividing
        by zero.
        """
        total = self.totalcount()
        if not total: return 0.0
        catprob = self.catcount(cat)/total
        docprob = self.docprob(item,cat)
        return docprob * catprob

    def setthreshold(self,cat,t):
        """Set the threshold multiplier for category *cat* to *t*."""
        self.thresholds[cat]=t

    def getthreshold(self,cat):
        """Return the threshold multiplier for *cat* (1.0 when none was set)."""
        if cat not in self.thresholds: return 1.0
        return self.thresholds[cat]

    def classify(self,item,default=None):
        """Return the most probable category for *item*, or *default*.

        *default* is returned when no category has a probability above zero
        (including the untrained case), or when some other category's
        probability times the winner's threshold exceeds the winner's own.
        """
        probs = {}
        # Find the category with the highest probability
        found = False
        best = None
        highest = 0.0
        for cat in self.categories():
            probs[cat] = self.prob(item,cat)
            if probs[cat] > highest:
                highest = probs[cat]
                best = cat
                found = True

        # Without a winner there is nothing to compare against
        if not found: return default

        # Make sure the probability exceeds threshold*next best
        for cat in probs:
            if cat == best: continue
            if probs[cat] * self.getthreshold(best) > probs[best]: return default

        return best

In [None]:
# Rebind cl to a new, untrained naive Bayes classifier
cl = naivebayes(getwords)

In [None]:
sampletrain(cl)

In [None]:
# Compare the score of the same text under each category
cl.prob('quick rabbit','good')

In [None]:
cl.prob('quick rabbit','bad')

In [None]:
cl.classify('quick rabbit',default='unknown')

In [None]:
cl.classify('quick money',default='unknown')

In [None]:
# From now on 'bad' is only returned when its probability is at least
# 3 times the probability of every other category
cl.setthreshold('bad',3.0)

In [None]:
cl.classify('quick rabbit',default='unknown')

In [None]:
cl.classify('quick money',default='unknown')

In [None]:
# Train on the sample set ten more times
for i in range(10): sampletrain(cl)

In [None]:
cl.classify('quick money',default='unknown')

# 费舍尔方法

In [None]:
class fisherclassifier(classifier):
    """Classifier that combines per-feature probabilities with the Fisher method."""

    def __init__(self,getfeatures):
        """Create the classifier; *getfeatures* maps an item to its features."""
        classifier.__init__(self,getfeatures)
        # category -> lowest Fisher probability accepted by classify()
        self.minimums = {}

    def cprob(self,f,cat):
        """Return fprob(f, cat) divided by the sum of fprob(f, c) over all categories."""
        # The frequency of this feature in this category
        clf = self.fprob(f,cat)
        if clf==0: return 0

        # The frequency of this feature in all the categories
        freqsum = sum([self.fprob(f,c) for c in self.categories()])

        # The probability is the frequency in this category divided by
        # the overall frequency
        p = clf/(freqsum)

        return p

    def fisherprob(self,item,cat):
        """Return the Fisher probability that *item* belongs to *cat*.

        The logs of the weighted feature probabilities are summed rather
        than logging their product: with many features the product can
        underflow to 0.0, and math.log(0.0) raises ValueError.
        """
        features = self.getfeatures(item)
        logsum = 0.0
        for f in features:
            wp = self.weightedprob(f,cat,self.cprob)
            # A zero factor makes the product zero; report 0.0 instead of
            # failing inside math.log
            if wp <= 0: return 0.0
            logsum += math.log(wp)

        # -2 * ln(product of the probabilities)
        fscore = -2*logsum

        # Use the inverse chi2 function to get a probability
        return self.invchi2(fscore,len(features)*2)

    def invchi2(self,chi, df):
        """Return the inverse chi-square value for *chi* with *df* degrees of freedom, capped at 1.0."""
        m = chi / 2.0
        total = term = math.exp(-m)
        for i in range(1, df//2):
            term *= m / i
            total += term
        return min(total, 1.0)

    def setminimum(self,cat,min):
        """Set the lowest Fisher probability accepted for category *cat*."""
        self.minimums[cat] = min

    def getminimum(self,cat):
        """Return the minimum set for *cat* (0 when none was set)."""
        if cat not in self.minimums: return 0
        return self.minimums[cat]

    def classify(self,item,default=None):
        """Return the category with the highest Fisher probability above its minimum, else *default*."""
        # Loop through looking for the best result
        best = default
        highest = 0.0
        for c in self.categories():
            p = self.fisherprob(item,c)
            # Make sure it exceeds its minimum
            if p > self.getminimum(c) and p > highest:
                best = c
                highest = p
        return best

In [None]:
# Rebind cl to a new, untrained Fisher classifier
cl = fisherclassifier(getwords)

In [None]:
sampletrain(cl)

In [None]:
cl.cprob('quick','good')

In [None]:
cl.cprob('money','good')

In [None]:
cl.cprob('money','bad')

In [None]:
# cprob blended with the assumed probability (defaults weight=1.0, ap=0.5)
cl.weightedprob('money','bad',cl.cprob)

In [None]:
cl.fisherprob('money','good')

In [None]:
cl.fisherprob('money','bad')

In [None]:
cl.classify('quick rabbit')

In [None]:
cl.classify('quick money')

In [None]:
# 'bad' is now only returned when its Fisher probability is above 0.8
cl.setminimum('bad',0.8)

In [None]:
cl.classify('quick money')

In [None]:
# 'good' is now only returned when its Fisher probability is above 0.4
cl.setminimum('good',0.4)

In [None]:
cl.classify('quick rabbit')

In [None]:
# New Fisher classifier whose counts are stored in an SQLite file
cl = fisherclassifier(getwords)

In [None]:
cl.setdb('fisher.db')

In [None]:
# train() commits after each document, so the counts end up in fisher.db
sampletrain(cl)

In [None]:
# A second classifier of a different type ...
c2 = naivebayes(getwords)

In [None]:
# ... pointed at the same database file, so it reads the counts stored above
c2.setdb('fisher.db')

In [None]:
c2.classify('quick money')

In [None]:
cl.classify('quick money')

# 过滤 Feed

In [None]:
import feedparser
import re

In [None]:
def read(feed,classifier,is_entry=False):
    """Interactively classify, then train on, every entry of a blog feed.

    feed       -- filename or URL handed to feedparser.parse
    classifier -- object providing classify(item) and train(item, category)
    is_entry   -- when True the entry itself is given to the classifier;
                  otherwise the combined title/publisher/summary text is
    """
    parsed = feedparser.parse(feed)
    for entry in parsed['entries']:
        # Show the entry to the user
        print('')
        print('-----')
        print('Title:     '+ entry['title'].encode('utf-8'))
        print('Publisher: '+ entry['publisher'].encode('utf-8'))
        print('')
        print(entry['summary'].encode('utf-8'))

        # One text item made of all three fields
        fulltext = '%s\n%s\n%s' % (entry['title'],entry['publisher'],entry['summary'])

        # The same item is used for the guess and for the training step
        if is_entry == True:
            item = entry
        else:
            item = fulltext

        # Current best guess of the classifier
        print('Guess: '+ str(classifier.classify(item)))

        # The category typed by the user becomes the training label
        label = raw_input('Enter category: ')
        classifier.train(item,label)

In [None]:
# Fisher classifier on plain word features, with counts stored in python_feed.db
cl = fisherclassifier(getwords)

In [None]:
cl.setdb('python_feed.db')

In [None]:
# Interactive: read() prompts for a category after every feed entry
read('python_search.xml',cl)

In [None]:
def entryfeatures(entry):
    """Build the ``{feature: 1}`` dict for one feed entry.

    *entry* must provide the keys 'title', 'summary' and 'publisher'.
    Features produced:

    * ``'Title:' + word`` for every title word
    * every summary word and every pair of adjacent summary words
    * ``'Publisher:' + publisher`` as one whole feature
    * ``'UPPERCASE'`` when more than 30% of the summary words are upper case

    Words are lowercased and kept only when longer than 2 and shorter than
    20 characters.
    """
    # '\W+' instead of '\W*': a pattern that can match the empty string makes
    # re.split cut between every single character on Python 3.7+.
    splitter = re.compile('\\W+')
    f = {}

    # Extract the title words and annotate
    titlewords = [s.lower() for s in splitter.split(entry['title']) if 2 < len(s) < 20]
    for w in titlewords: f['Title:'+ w] = 1

    # Extract the summary words; the original case is kept alongside the
    # lowercased form because the upper-case check below needs it
    rawwords = [s for s in splitter.split(entry['summary']) if 2 < len(s) < 20]
    summarywords = [s.lower() for s in rawwords]

    # Count uppercase words
    uc = 0
    for i in range(len(summarywords)):
        f[summarywords[i]] = 1
        if rawwords[i].isupper(): uc += 1

        # Get word pairs in summary as features (this word and the next one)
        if i < len(summarywords)-1:
            twowords = ' '.join(summarywords[i:i+2])
            f[twowords] = 1

    # Keep creator and publisher whole
    f['Publisher:' + entry['publisher']]=1

    # UPPERCASE is a virtual word flagging too much shouting; a summary
    # without any usable word cannot shout and must not divide by zero
    if summarywords and float(uc)/len(summarywords)>0.3: f['UPPERCASE'] = 1

    return f

In [None]:
# Fisher classifier that extracts features from whole feed entries
cl = fisherclassifier(entryfeatures)

In [None]:
# NOTE(review): this is the same file the getwords-based classifier used
# earlier, so both feature vocabularies share its tables - confirm intended
cl.setdb('python_feed.db')

In [None]:
# is_entry=True makes read() pass the entry itself to classify() and train()
read('python_search.xml',cl,True)