In [2]:
import sys
import csv
from itertools import chain, combinations
from collections import defaultdict
import nltk.data
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from stemming.porter2 import stem
import wikipedia
from gensim import corpora, models
import math
from textblob import TextBlob as tb
from nltk.stem import WordNetLemmatizer

In [3]:
def subsets(arr):
    """Return an iterator over every non-empty subset of ``arr``.

    Subsets are produced as tuples, smallest first: all 1-element tuples,
    then all 2-element tuples, and so on up to the full input.  Within one
    size the order is the one ``itertools.combinations`` produces.

    ``arr`` may be any iterable.  It is copied into a tuple before use so
    that one-shot iterators (generators, ``iter(...)`` objects) work:
    enumerating such an input while also passing it to ``combinations``
    would consume it part-way and silently drop subsets.
    """
    pool = tuple(arr)
    return chain.from_iterable(
        combinations(pool, size) for size in range(1, len(pool) + 1)
    )

In [4]:
def returnfrequentitemsets(itemSet, transactionList, minSupport, freqSet):
    """Count candidate itemsets and keep the ones that meet ``minSupport``.

    For every candidate in ``itemSet`` the number of transactions that
    contain it (``candidate.issubset(transaction)``) is counted.  That
    number is added to ``freqSet[candidate]`` - ``freqSet`` is mutated in
    place and accumulates across calls - and is also used to compute the
    candidate's support, i.e. count / ``len(transactionList)``.

    Candidates that occur in no transaction are neither written to
    ``freqSet`` nor returned.

    Returns a new ``set`` holding the candidates whose support is at
    least ``minSupport``.
    """
    hits = defaultdict(int)

    for candidate in itemSet:
        matched = sum(1 for basket in transactionList
                      if candidate.issubset(basket))
        if matched:
            freqSet[candidate] += matched
            hits[candidate] += matched

    return set(
        candidate
        for candidate, occurrences in hits.items()
        if float(occurrences) / len(transactionList) >= minSupport
    )


In [5]:
def joinSet(itemSet, length):
    """Self-join ``itemSet`` and keep the unions of exactly ``length`` items.

    Every ordered pair of members (a member is also paired with itself)
    is unioned; the union is kept only when it has ``length`` elements.
    The result is returned as a new ``set``.
    """
    joined = set()
    for left in itemSet:
        for right in itemSet:
            merged = left.union(right)
            if len(merged) == length:
                joined.add(merged)
    return joined

def getSupport(transactionList, freqSet, item):
    """Return the support of ``item``: its count in ``freqSet`` divided by
    the number of transactions, as a float."""
    occurrences = freqSet[item]
    return float(occurrences) / len(transactionList)


In [11]:
def runApriori(data_iter, minSupport):
    """Find all frequent itemsets in ``data_iter`` level by level.

    ``data_iter`` is an iterable of records, each record an iterable of
    hashable items; every record becomes a ``frozenset`` transaction.
    Starting from the 1-itemsets, each level of frequent itemsets is
    self-joined (``joinSet``) into candidates one item larger, and the
    candidates are filtered by ``returnfrequentitemsets``.  The search
    stops at the first level with no frequent itemset.

    Returns a list of ``(itemset_as_tuple, support)`` pairs covering every
    frequent itemset found, with the support computed by ``getSupport``.
    """
    transactions = [frozenset(record) for record in data_iter]

    # Candidate 1-itemsets: one singleton frozenset per distinct item.
    singletons = set(
        frozenset([element])
        for basket in transactions
        for element in basket
    )

    support_counts = defaultdict(int)   # itemset -> number of containing transactions
    levels = {}                         # itemset size -> frequent itemsets of that size

    frequent = returnfrequentitemsets(singletons, transactions,
                                      minSupport, support_counts)
    size = 1
    while frequent:
        levels[size] = frequent
        size += 1
        # Grow the current level by one item, then keep what is still frequent.
        candidates = joinSet(frequent, size)
        frequent = returnfrequentitemsets(candidates, transactions,
                                          minSupport, support_counts)

    return [
        (tuple(itemset), getSupport(transactions, support_counts, itemset))
        for level in levels.values()
        for itemset in level
    ]

In [7]:
def printfreqitemsets(items, maxlength, writer):
    """Write every itemset of exactly ``maxlength`` items to ``writer``.

    ``items`` is an iterable of ``(itemset, support)`` pairs.  The pairs
    are visited in ascending order of support, and each itemset whose
    length equals ``maxlength`` is written as one row (a list of its
    items) through ``writer.writerows``.  Nothing is returned.
    """
    # NOTE(review): the value returned here was never used.  The call is
    # kept only because countfreqitemsets is defined outside this cell and
    # may have side effects - confirm and drop it if it is a pure counter.
    countfreqitemsets(items, maxlength)

    # Sort on the support component.  An index-based key replaces the old
    # tuple-unpacking lambda, which is a SyntaxError on Python 3.
    for itemset, _support in sorted(items, key=lambda pair: pair[1]):
        if len(itemset) == maxlength:
            writer.writerows([list(itemset)])


In [8]:
# function to parse sentences to words and remove stopwords from sentence
def sentence_to_wordlist(sentence, remove_stopwords=True):
    """Convert one sentence into a list of lower-case words.

    Steps:
      1. Strip HTML markup and keep only the text.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. If ``remove_stopwords`` is true (the default), drop the words in
         NLTK's English stop-word list.

    Returns the remaining words as a list, in their original order.
    """
    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser happens to be installed, so results could differ
    # between machines.  "html.parser" ships with Python.
    text = BeautifulSoup(sentence, "html.parser").get_text()

    text = re.sub("[^a-zA-Z]", " ", text)
    words = text.lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))   # set => O(1) membership tests
        words = [w for w in words if w not in stops]

    return words

In [9]:
# function to split text into sentences with the supplied tokenizer and turn each sentence into a word list
def text_to_sentences(text, tokenizer, remove_stopwords=True):
    """Split ``text`` into sentences and convert each one to a word list.

    ``text`` is stripped of surrounding whitespace and handed to
    ``tokenizer.tokenize``.  Empty sentences are skipped; every other
    sentence is passed to ``sentence_to_wordlist`` together with
    ``remove_stopwords``.

    Returns a list with one entry per sentence (each entry being whatever
    ``sentence_to_wordlist`` returns, i.e. a list of words).
    """
    return [
        sentence_to_wordlist(chunk, remove_stopwords)
        for chunk in tokenizer.tokenize(text.strip())
        if len(chunk) > 0
    ]

In [17]:
# Load NLTK's pre-trained English Punkt model; it splits raw text into
# sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Download the Wikipedia page that the title 'barack' resolves to and turn
# its text into one word list per sentence.
content = wikipedia.page('barack').content
sentences_raw = text_to_sentences(content, tokenizer)

# NOTE(review): p_stemmer is created but not used in this cell - the
# stemming below goes through stemming.porter2.stem.  It is left in place
# in case a later cell relies on the name; remove it if none does.
p_stemmer = PorterStemmer()

# Stem every word so inflected forms count as the same item.
sentences = [[stem(word) for word in sentence] for sentence in sentences_raw]

# Treat each sentence as a transaction and keep the itemsets that appear
# in at least half of them.
minsupport = 0.5
items = runApriori(sentences, minsupport)
print(items)

[((u'obama',), 0.5600858369098712)]
