In [1]:
def KnuthMorrisPratt(text, pattern):

    '''Yields all starting positions of copies of the pattern in the text.
Calling conventions are similar to string.find, but its arguments can be
lists or iterators, not just strings, it returns all matches, not just
the first one, and it does not need the whole text in memory at once.
Whenever it yields, it will have read the text exactly up to and including
the match that caused the yield.'''

    # allow indexing into pattern and protect against change during yield
    pattern = list(pattern)

    # build table of shift amounts
    shifts = [1] * (len(pattern) + 1)
    shift = 1
    for pos in range(len(pattern)):
        while shift <= pos and pattern[pos] != pattern[pos-shift]:
            shift += shifts[pos-shift]
        shifts[pos+1] = shift

    print(shifts)
    # do the actual search
    startPos = 0
    matchLen = 0
    for c in text:
        while matchLen == len(pattern) or \
              matchLen >= 0 and pattern[matchLen] != c:
            startPos += shifts[matchLen]
            matchLen -= shifts[matchLen]
        matchLen += 1
        if matchLen == len(pattern):
            yield startPos

def search_list(text, pattern):
    matchLen = len(pattern)
    for i in range(len(text) - matchLen + 1):
        startPos = 0
        while(startPos < matchLen and text[i + startPos] == pattern[startPos]):
            startPos += 1
        #if (text[i:i+matchLen] == pattern): return i
        if (startPos == matchLen): return i
    
    return -1

In [2]:
vocab_dict = {}
vocab_list = []
review_lines = []

i = 0
#with open('reviews_test.txt') as f:
with open('reviews_sample.txt') as f: 
    for line in f.read().splitlines():
        vocab_index = ""
        for word in line.split(' '):
            if word not in vocab_dict:                
                vocab_dict[word] = i
                word_index = i
                vocab_list.append(word)
                i += 1
            else:
                word_index = vocab_dict[word]
            
            vocab_index += ":" + str(word_index)
        
        vocab_index += ":"
        review_lines.append(vocab_index)

In [3]:
from tqdm import tqdm

MIN_SUPPORT = len(review_lines) * 0.01

def get_support(all_reviews, pattern):
    str_pattern = ""
    for i in pattern:
        str_pattern += ":" + str(i)
    str_pattern += ":"
    
    support = 0
    
    for review in all_reviews:
        #if search_list(review, pattern) >= 0:
        if review.find(str_pattern) >= 0:
            support += 1
    
    return support

L = []
S = []
L1 = []
S1 = []
for vocab in tqdm(range(len(vocab_list))):
    s = get_support(review_lines, [vocab])    
    if(s >= MIN_SUPPORT):
        L1.append([vocab])
        S1.append(s)

L.append(L1)
S.append(S1)

100%|██████████| 22104/22104 [02:32<00:00, 145.00it/s]


In [4]:
len(L1)

977

In [5]:
 def apriori_gen(all_lines, LK_1, min_support):
    LK = []
    S = []
    for l1 in tqdm(LK_1):
        for l2 in LK_1:
            if l1[:-1] == l2[:-1]: #and l1[-1] < l2[-1]:
                l = l1 + [l2[-1]]
                support = get_support(all_lines, l)
                if support >= min_support:
                    #print(l)
                    #print(support)
                    #print([vocab_list[i] for i in l])
                    LK.append(l)
                    S.append(support)
    
    return LK, S

In [6]:
LK_1 = L1
while len(LK_1) > 0:
    print("Working on %d-Itemsets" % len(LK_1[0]))
    LK_1, SK_1 = apriori_gen(review_lines, LK_1, MIN_SUPPORT)
    if (len(LK_1) > 0):
        L.append(LK_1)
        S.append(SK_1)

  0%|          | 0/977 [00:00<?, ?it/s]

Working on 1-Itemsets


100%|██████████| 977/977 [1:44:34<00:00,  6.29s/it]
  8%|▊         | 5/63 [00:00<00:01, 49.31it/s]

Working on 2-Itemsets


100%|██████████| 63/63 [00:00<00:00, 94.87it/s]


In [7]:
len(L[1])

63

In [8]:
with open('patterns.txt', 'w') as f:
    for i in range(len(L)):
        for j in range(len(L[i])):
            f.write("%d:"%(S[i][j]))
            for k in range(len(L[i][j]) - 1):
                f.write("%s;"%(vocab_list[L[i][j][k]]))
            f.write("%s\n"%(vocab_list[L[i][j][len(L[i][j]) - 1]]))