In [43]:
from itertools import chain, combinations
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [65]:
def prepare(corpus):
    """Yield one cleaned token list per line of a text file.

    Each line has ``http...`` URLs removed and runs of spaces collapsed, is
    tokenized with ``word_tokenize``, lower-cased, and filtered to drop
    English stop words and a fixed set of punctuation / topic tokens.
    Duplicates within a line are removed while keeping first-seen order.

    This is a generator function: nothing is read until the result is first
    iterated, and the result can be consumed only once.

    Args:
        corpus: Path of the text file to read, passed to ``open``.

    Yields:
        list: The unique, lower-cased tokens kept for each line, in order of
        first appearance. A line with no surviving tokens yields ``[]``.
    """
    # Read the whole file inside a context manager so the handle is closed
    # before the first item is yielded, even if the consumer stops early.
    with open(corpus) as handle:
        lines = handle.read().splitlines()

    stop = set(stopwords.words('english'))
    # Tokens dropped in addition to the stop words.
    ignored = {'.', ',', '#', 'earthquake', 'nepal', 'nepalearthquake'}

    for line in lines:
        line = re.sub(r"http\S+", "", line)
        line = re.sub(' +', ' ', line)
        tokens = [token.lower() for token in word_tokenize(line)]
        kept = []
        for word in tokens:
            if word in stop or word in ignored or word in kept:
                continue
            kept.append(word)
        yield kept

In [45]:
def itemset_from_data(data):
    """Build the candidate 1-itemsets and the transaction list from rows.

    Args:
        data: Iterable of rows, each row an iterable of hashable items.

    Returns:
        tuple: ``(itemset, transaction_list)`` where ``itemset`` is a set of
        single-element frozensets (one per distinct truthy item) and
        ``transaction_list`` is a list holding one frozenset per row.
    """
    transactions = []
    singletons = set()
    for row in data:
        transactions.append(frozenset(row))
        # Falsy items (e.g. empty strings) stay in the transaction but are
        # not offered as candidate items.
        singletons.update(frozenset([entry]) for entry in row if entry)
    return singletons, transactions

In [46]:
def joinset(itemset, k):
    """Return every pairwise union of members of ``itemset`` that has size ``k``.

    Args:
        itemset: Re-iterable collection of frozensets (a set, or a dict keyed
            by frozensets).
        k: Required number of elements in each returned union.

    Returns:
        set: The distinct unions ``i | j`` with exactly ``k`` elements.
    """
    joined = set()
    for left in itemset:
        for right in itemset:
            candidate = left.union(right)
            if len(candidate) == k:
                joined.add(candidate)
    return joined


def subsets(itemset):
    """Iterate over all non-empty subsets of ``itemset`` as tuples.

    Subsets are produced in order of increasing size, from 1 up to the
    number of elements in ``itemset`` (so the full set comes last).

    Args:
        itemset: Collection of items to combine.

    Returns:
        itertools.chain: Iterator of tuples, one per non-empty subset.
    """
    per_size = []
    for size, _ in enumerate(itemset, 1):
        per_size.append(combinations(itemset, size))
    return chain.from_iterable(per_size)
    

def itemset_support(transaction_list, itemset, min_support=0):
    """Compute the support of each candidate and keep those meeting a threshold.

    Support is the fraction of transactions that contain every element of
    the candidate.

    Args:
        transaction_list: Sized collection of transactions (sets/frozensets).
        itemset: Iterable of candidate frozensets.
        min_support: Minimum support (inclusive) a candidate needs to be kept.

    Returns:
        dict: Maps each kept candidate to its support as a float.
    """
    total = len(transaction_list)
    supported = {}
    for item in itemset:
        hits = sum(1 for row in transaction_list if item.issubset(row))
        # With no transactions nothing can be observed; treat the support as
        # 0.0 instead of dividing by zero.
        support = float(hits) / total if total else 0.0
        if support >= min_support:
            supported[item] = support
    return supported


def freq_itemset(transaction_list, c_itemset, min_support):
    """Collect frequent itemsets level by level, growing the size by one each pass.

    Starting from the given candidates, each pass keeps the candidates whose
    support is at least ``min_support`` and joins the survivors into the
    next, one-element-larger, candidates. The search stops at the first
    level where nothing survives.

    Args:
        transaction_list: Transactions passed to ``itemset_support``.
        c_itemset: Initial candidate itemsets (frozensets).
        min_support: Minimum support for an itemset to count as frequent.

    Returns:
        dict: Maps every frequent itemset found to its support.
    """
    frequent = {}
    size = 1
    survivors = itemset_support(transaction_list, c_itemset, min_support)
    while survivors:
        frequent.update(survivors)
        size += 1
        candidates = joinset(survivors, size)
        survivors = itemset_support(transaction_list, candidates, min_support)
    return frequent


def apriori(itemset, transaction_list, min_support, min_confidence):
    """Mine frequent itemsets and derive association rules from them.

    For each frequent itemset with more than one element, every split into
    a non-empty antecedent ``A`` and non-empty consequent ``B`` is scored by
    ``confidence = support(A | B) / support(A)``.

    Args:
        itemset: Candidate 1-itemsets (frozensets), as produced by
            ``itemset_from_data``.
        transaction_list: List of transactions (frozensets).
        min_support: Minimum support for an itemset to be frequent.
        min_confidence: Minimum confidence (inclusive) for a rule to be kept.

    Returns:
        tuple: ``(rules, f_itemset)`` where ``rules`` is a list of
        ``(A, B, confidence)`` tuples and ``f_itemset`` maps each frequent
        itemset to its support.
    """
    f_itemset = freq_itemset(transaction_list, itemset, min_support)

    rules = []
    for item, support in f_itemset.items():
        if len(item) < 2:
            continue
        for antecedent in subsets(item):
            consequent = item.difference(antecedent)
            if not consequent:
                # The antecedent is the whole itemset; there is no rule.
                continue
            antecedent = frozenset(antecedent)
            antecedent_support = f_itemset[antecedent]
            if not antecedent_support:
                # Only reachable when min_support allows zero-support
                # itemsets; the confidence is undefined (0/0), so skip the
                # rule instead of raising ZeroDivisionError.
                continue
            # antecedent | consequent is `item` itself, so its support is
            # the value already at hand from the loop.
            confidence = float(support) / antecedent_support
            if confidence >= min_confidence:
                rules.append((antecedent, consequent, confidence))
    return rules, f_itemset


def print_report(rules, f_itemset):
    """Print the frequent itemsets and the rules, each sorted ascending by score.

    Args:
        rules: Iterable of ``(A, B, confidence)`` tuples.
        f_itemset: Dict mapping each frequent itemset to its support.
    """
    print('--Frequent Itemset--')
    # Index into the pair instead of unpacking it in the lambda signature:
    # tuple parameters are a SyntaxError on Python 3.
    for item, support in sorted(f_itemset.items(), key=lambda pair: pair[1]):
        print('[I] {} : {}'.format(tuple(item), round(support, 4)))

    print('')
    print('--Rules--')
    for A, B, confidence in sorted(rules, key=lambda rule: rule[2]):
        print('[R] {} => {} : {}'.format(tuple(A), tuple(B), round(confidence, 4)))

In [102]:
# prepare() is a generator function, so this only creates a lazy, single-use
# generator; "output.txt" is not opened until the generator is first iterated.
tweets = prepare("output.txt")

In [104]:
# Consumes the `tweets` generator. Re-running this cell without first
# re-creating `tweets` returns an empty itemset and transaction list.
itemset, transaction_list = itemset_from_data(tweets)

In [106]:
# min_support = 0.005, min_confidence = 0.40.
# NOTE: this rebinds `itemset` from the candidate 1-itemsets to the dict of
# frequent itemsets returned by apriori(), so the cell is not idempotent.
rules, itemset = apriori(itemset, transaction_list, 0.005, 0.40)

In [101]:
# Display the mined rules; each entry is (antecedent, consequent, confidence).
# NOTE(review): the saved output lists confidences below the 0.40 threshold
# used above, so it appears to come from an earlier run — re-run to confirm.
rules
#itemset

[(frozenset({'affected'}), frozenset({'thoughts'}), 0.23689320388349513),
 (frozenset({'thoughts'}), frozenset({'affected'}), 0.24646464646464647),
 (frozenset({'avalanche'}), frozenset({'everest'}), 0.8278145695364238),
 (frozenset({'everest'}), frozenset({'avalanche'}), 0.38819875776397517),
 (frozenset({'safe'}), frozenset({'stay'}), 0.23324396782841822),
 (frozenset({'stay'}), frozenset({'safe'}), 0.5576923076923077),
 (frozenset({'!'}), frozenset({'prayers'}), 0.13145539906103287),
 (frozenset({'go'}), frozenset({'prayers'}), 0.49246231155778897),
 (frozenset({'prayers'}), frozenset({'go'}), 0.10816777041942605),
 (frozenset({'bless'}), frozenset({'god'}), 0.803921568627451),
 (frozenset({'god'}), frozenset({'bless'}), 0.24848484848484848),
 (frozenset({'death'}), frozenset({'rises', 'toll'}), 0.21645021645021645),
 (frozenset({'rises'}), frozenset({'death', 'toll'}), 0.9090909090909091),
 (frozenset({'toll'}), frozenset({'death', 'rises'}), 0.21367521367521367),
 (frozenset({'dea