In [156]:
def KnuthMorrisPratt(text, pattern):
    '''Yield every starting position at which `pattern` occurs in `text`.

    Calling conventions are similar to str.find, except that:
      * both arguments may be lists or iterators, not just strings;
      * all matches (including overlapping ones) are produced, not just
        the first;
      * `text` is consumed one element at a time, so it never has to be
        held in memory as a whole.

    Whenever a position is yielded, `text` has been read exactly up to and
    including the last element of the match that caused the yield.

    Args:
        text: iterable of elements to search through.
        pattern: iterable of elements to look for; it is copied into a list.

    Yields:
        int: zero-based index in `text` where a copy of `pattern` starts.
    '''
    # Copy the pattern: this allows indexing into it and protects the search
    # against the caller mutating it between yields.
    pattern = list(pattern)

    # shifts[n] is how far the candidate start position moves forward after
    # a mismatch that follows n successfully matched elements.
    shifts = [1] * (len(pattern) + 1)
    shift = 1
    for pos in range(len(pattern)):
        while shift <= pos and pattern[pos] != pattern[pos - shift]:
            shift += shifts[pos - shift]
        shifts[pos + 1] = shift

    # Scan the text.  matchLen may temporarily drop to -1, meaning "no prefix
    # of the pattern matches here"; the increment below brings it back to 0.
    startPos = 0
    matchLen = 0
    for c in text:
        while matchLen == len(pattern) or (
                matchLen >= 0 and pattern[matchLen] != c):
            startPos += shifts[matchLen]
            matchLen -= shifts[matchLen]
        matchLen += 1
        if matchLen == len(pattern):
            yield startPos

def search_list(text, pattern):
    """Return the index of the first occurrence of `pattern` in `text`, or -1.

    Both arguments are indexable sequences; elements are compared one by one,
    so the two sequences do not have to be of the same type.
    """
    width = len(pattern)
    last_start = len(text) - width
    for start in range(last_start + 1):
        # all() stops at the first mismatch, like the hand-written scan would.
        if all(text[start + offset] == pattern[offset] for offset in range(width)):
            return start
    return -1

In [33]:
vocab_dict = {}     # word -> integer id, handed out in order of first appearance
vocab_list = []     # integer id -> word (the inverse of vocab_dict)
review_lines = []   # one encoded string per input line, shaped like ":3:17:4:"

# Alternative input used during development: 'reviews_test.txt'
with open('reviews_sample.txt') as f:
    for line in f.read().splitlines():
        encoded_ids = []
        for word in line.split(' '):
            word_index = vocab_dict.get(word)
            if word_index is None:
                # First time this word is seen: the next free id is the
                # current vocabulary size.
                word_index = len(vocab_list)
                vocab_dict[word] = word_index
                vocab_list.append(word)
            encoded_ids.append(str(word_index))

        # Every id is wrapped in ':' on both sides so that a substring search
        # for ":7:" cannot accidentally match inside ":17:".
        review_lines.append(":" + ":".join(encoded_ids) + ":")

In [34]:
from tqdm import tqdm

# A pattern counts as frequent when it occurs in at least this fraction of
# the encoded reviews.
MIN_SUPPORT_RATIO = 0.01
MIN_SUPPORT = len(review_lines) * MIN_SUPPORT_RATIO

def get_support(all_reviews, pattern):
    """Count the reviews in which `pattern` occurs.

    Args:
        all_reviews: iterable of encoded review strings such as ":3:17:4:".
        pattern: sequence of ids; it is encoded the same way (":a:b:") and
            looked up as a substring, so the ids must appear next to each
            other and in the given order.

    Returns:
        Number of reviews containing the encoded pattern (each review is
        counted at most once).
    """
    needle = "".join(":" + str(item) for item in pattern) + ":"
    return sum(1 for review in all_reviews if review.find(needle) >= 0)

# Level 1: keep every vocabulary id whose single-word pattern reaches
# MIN_SUPPORT.  L1[n] is the pattern and S1[n] its support.
L1, S1 = [], []
for vocab_id in tqdm(range(len(vocab_list))):
    occurrences = get_support(review_lines, [vocab_id])
    if occurrences >= MIN_SUPPORT:
        L1.append([vocab_id])
        S1.append(occurrences)

# L and S collect one entry per pattern length; index 0 is the level above.
L = [L1]
S = [S1]

100%|███████████████████████████████████████████████████████████████████████████| 22104/22104 [02:21<00:00, 156.42it/s]


In [35]:
# Number of single-word patterns whose support reached MIN_SUPPORT.
len(L1)

977

In [46]:
 def apriori_gen(all_lines, LK_1, min_support, verbose=True):
     '''Build the frequent (k+1)-id patterns from the frequent k-id patterns.

     get_support only counts a pattern when its ids occur next to each other
     and in order, so a (k+1)-pattern can be frequent only if both its first
     k ids and its last k ids are themselves frequent k-patterns.  Candidates
     are therefore formed by overlapping two frequent patterns: l1 is
     extended with the last id of every l2 for which l1[1:] == l2[:-1].
     (Joining on a shared prefix, l1[:-1] == l2[:-1], is the rule for
     unordered itemsets; here it would skip e.g. [a, b, c] unless the
     unrelated pattern [a, c] happened to be frequent too.)

     Args:
         all_lines: encoded reviews, passed through to get_support.
         LK_1: frequent patterns of one common length, each a list of ids.
         min_support: minimum support a candidate needs to be kept.
         verbose: when true (default), print every accepted pattern, its
             support and its words (looked up in the global vocab_list).

     Returns:
         (LK, supports): the accepted patterns and their supports, aligned
         by index.  Both are empty when no candidate is frequent.
     '''
     # Group the patterns by everything but their last id, so the partners
     # of l1 are found by one lookup instead of a scan over all of LK_1.
     by_head = {}
     for l2 in LK_1:
         by_head.setdefault(tuple(l2[:-1]), []).append(l2)

     LK = []
     supports = []
     for l1 in tqdm(LK_1):
         for l2 in by_head.get(tuple(l1[1:]), ()):
             l = l1 + [l2[-1]]
             support = get_support(all_lines, l)
             if support >= min_support:
                 if verbose:
                     print(l)
                     print(support)
                     print([vocab_list[i] for i in l])
                 LK.append(l)
                 supports.append(support)

     return LK, supports

In [47]:
# Grow the frequent patterns one id at a time until a level comes back empty.
#
# L[0] / S[0] hold the single-id level.  Anything after that was appended by
# an earlier execution of this cell, so drop it first; otherwise every re-run
# stacks duplicate levels onto L and S.
del L[1:]
del S[1:]

LK_1 = L1
while len(LK_1) > 0:
    print("Working on %d-Itemsets" % len(LK_1[0]))
    LK_1, SK_1 = apriori_gen(review_lines, LK_1, MIN_SUPPORT)
    if len(LK_1) > 0:
        L.append(LK_1)
        S.append(SK_1)

Working on 1-Itemsets


  0%|▎                                                                               | 4/977 [00:22<1:31:13,  5.63s/it]

[7, 8]
170
['year', 'ago']


  1%|▊                                                                              | 10/977 [00:56<1:32:52,  5.76s/it]

[16, 32]
246
['food', 'good']
[16, 39]
118
['food', 'service']
[16, 45]
163
['food', 'great']


  2%|█▊                                                                             | 23/977 [02:10<1:29:29,  5.63s/it]

[32, 16]
205
['good', 'food']
[32, 39]
147
['good', 'service']
[32, 46]
109
['good', 'place']
[32, 77]
108
['good', 'thing']


  3%|██                                                                             | 26/977 [02:26<1:29:06,  5.62s/it]

[38, 39]
209
['customer', 'service']


  3%|██▏                                                                            | 27/977 [02:32<1:28:48,  5.61s/it]

[39, 32]
128
['service', 'good']
[39, 45]
135
['service', 'great']


  3%|██▌                                                                            | 32/977 [03:00<1:27:02,  5.53s/it]

[45, 16]
151
['great', 'food']
[45, 39]
153
['great', 'service']
[45, 46]
273
['great', 'place']


  3%|██▋                                                                            | 33/977 [03:05<1:27:24,  5.56s/it]

[46, 5]
104
['place', 'like']
[46, 250]
131
['place', 'get']


  4%|███                                                                            | 38/977 [03:33<1:27:22,  5.58s/it]

[54, 93]
154
['staff', 'friendly']


  4%|███▍                                                                           | 42/977 [03:55<1:26:27,  5.55s/it]

[58, 309]
211
['make', 'sure']


  5%|███▊                                                                           | 47/977 [04:23<1:25:58,  5.55s/it]

[65, 66]
220
['ice', 'cream']


  5%|███▉                                                                           | 49/977 [04:34<1:25:05,  5.50s/it]

[67, 32]
274
['really', 'good']
[67, 91]
111
['really', 'nice']


  6%|████▉                                                                          | 61/977 [05:40<1:23:58,  5.50s/it]

[89, 16]
110
['whole', 'food']


  7%|█████▎                                                                         | 65/977 [06:02<1:24:22,  5.55s/it]

[93, 54]
101
['friendly', 'staff']


  7%|█████▋                                                                         | 70/977 [06:30<1:22:43,  5.47s/it]

[101, 128]
130
['would', 'definitely']
[101, 177]
133
['would', 'recommend']


  7%|█████▋                                                                         | 71/977 [06:36<1:23:57,  5.56s/it]

[102, 46]
178
['love', 'place']


 10%|███████▌                                                                       | 93/977 [08:46<1:28:27,  6.00s/it]

[128, 391]
140
['definitely', 'back']


 10%|███████▌                                                                       | 94/977 [08:52<1:28:29,  6.01s/it]

[129, 391]
237
['come', 'back']


 11%|████████▎                                                                     | 104/977 [09:52<1:27:22,  6.00s/it]

[140, 32]
355
['pretty', 'good']
[140, 141]
141
['pretty', 'much']


 11%|████████▍                                                                     | 105/977 [09:58<1:27:06,  5.99s/it]

[141, 122]
162
['much', 'better']


 11%|████████▌                                                                     | 107/977 [10:10<1:26:57,  6.00s/it]

[143, 1500]
125
['hot', 'dog']


 12%|█████████                                                                     | 113/977 [10:46<1:26:17,  5.99s/it]

[151, 152]
107
['fish', 'sandwich']


 12%|█████████▌                                                                    | 120/977 [11:28<1:25:32,  5.99s/it]

[158, 15]
201
['beer', 'selection']


 14%|██████████▊                                                                   | 136/977 [13:03<1:23:45,  5.98s/it]

[176, 177]
215
['highly', 'recommend']


 14%|███████████▎                                                                  | 141/977 [13:32<1:21:49,  5.87s/it]

[183, 108]
323
['first', 'time']


 18%|██████████████▎                                                               | 180/977 [17:20<1:18:10,  5.88s/it]

[248, 391]
152
['going', 'back']


 20%|███████████████▍                                                              | 194/977 [18:42<1:16:06,  5.83s/it]

[275, 368]
253
['even', 'though']


 21%|████████████████▏                                                             | 202/977 [19:29<1:14:59,  5.81s/it]

[290, 5]
175
['look', 'like']


 22%|████████████████▊                                                             | 211/977 [20:20<1:13:14,  5.74s/it]

[313, 5601]
153
['strip', 'district']


 23%|█████████████████▊                                                            | 223/977 [21:30<1:12:17,  5.75s/it]

[333, 86]
139
['last', 'night']
[333, 108]
155
['last', 'time']


 25%|███████████████████▊                                                          | 248/977 [23:54<1:10:32,  5.81s/it]

[374, 780]
113
['across', 'street']


 26%|████████████████████                                                          | 251/977 [24:11<1:10:01,  5.79s/it]

[386, 20]
186
['one', 'best']
[386, 826]
135
['one', 'favorite']


 26%|████████████████████▏                                                         | 253/977 [24:23<1:09:46,  5.78s/it]

[390, 204]
108
['reasonable', 'price']


 26%|████████████████████▎                                                         | 255/977 [24:34<1:09:58,  5.81s/it]

[392, 108]
230
['every', 'time']


 26%|████████████████████▌                                                         | 258/977 [24:52<1:09:30,  5.80s/it]

[404, 54]
128
['wait', 'staff']


 31%|███████████████████████▉                                                      | 300/977 [28:54<1:05:13,  5.78s/it]

[486, 108]
125
['long', 'time']


 33%|█████████████████████████▌                                                    | 320/977 [30:48<1:03:33,  5.80s/it]

[523, 86]
110
['saturday', 'night']


 34%|██████████████████████████▎                                                   | 329/977 [31:40<1:02:28,  5.78s/it]

[536, 5635]
122
['giant', 'eagle']


 35%|████████████████████████████                                                    | 342/977 [32:55<59:23,  5.61s/it]

[556, 273]
163
['happy', 'hour']


 41%|████████████████████████████████▌                                               | 398/977 [38:14<54:37,  5.66s/it]

[674, 5]
113
['looked', 'like']


 45%|████████████████████████████████████                                            | 441/977 [42:19<50:43,  5.68s/it]

[760, 5]
230
['feel', 'like']


 52%|█████████████████████████████████████████▎                                      | 504/977 [48:16<44:58,  5.71s/it]

[923, 391]
101
['coming', 'back']


 52%|█████████████████████████████████████████▎                                      | 505/977 [48:22<45:07,  5.74s/it]

[926, 927]
103
['reasonably', 'priced']


 58%|██████████████████████████████████████████████                                  | 562/977 [53:57<40:01,  5.79s/it]

[1117, 5]
118
['felt', 'like']


 60%|████████████████████████████████████████████████▎                               | 590/977 [56:42<38:10,  5.92s/it]

[1192, 166]
120
['french', 'fry']


 63%|██████████████████████████████████████████████████▌                             | 617/977 [59:27<36:57,  6.16s/it]

[1268, 108]
227
['next', 'time']


 70%|██████████████████████████████████████████████████████▌                       | 684/977 [1:06:15<29:21,  6.01s/it]

[1562, 142]
133
['parking', 'lot']


 74%|██████████████████████████████████████████████████████████                    | 727/977 [1:10:35<25:14,  6.06s/it]

[1735, 430]
120
['grocery', 'store']


 88%|████████████████████████████████████████████████████████████████████▌         | 859/977 [1:23:32<11:29,  5.84s/it]

[2601, 5]
103
['tasted', 'like']


 92%|███████████████████████████████████████████████████████████████████████▋      | 898/977 [1:27:20<07:31,  5.71s/it]

[3047, 7363]
100
['pasta', 'trio']


100%|██████████████████████████████████████████████████████████████████████████████| 977/977 [1:34:58<00:00,  5.73s/it]


Working on 2-Itemsets


100%|█████████████████████████████████████████████████████████████████████████████████| 63/63 [00:00<00:00, 112.16it/s]


In [65]:
# L[0] holds the single-word patterns, so the two-word patterns are at index 1.
L[1]

[[7, 8],
 [16, 32],
 [16, 39],
 [16, 45],
 [32, 16],
 [32, 39],
 [32, 46],
 [32, 77],
 [38, 39],
 [39, 32],
 [39, 45],
 [45, 16],
 [45, 39],
 [45, 46],
 [46, 5],
 [46, 250],
 [54, 93],
 [58, 309],
 [65, 66],
 [67, 32],
 [67, 91],
 [89, 16],
 [93, 54],
 [101, 128],
 [101, 177],
 [102, 46],
 [128, 391],
 [129, 391],
 [140, 32],
 [140, 141],
 [141, 122],
 [143, 1500],
 [151, 152],
 [158, 15],
 [176, 177],
 [183, 108],
 [248, 391],
 [275, 368],
 [290, 5],
 [313, 5601],
 [333, 86],
 [333, 108],
 [374, 780],
 [386, 20],
 [386, 826],
 [390, 204],
 [392, 108],
 [404, 54],
 [486, 108],
 [523, 86],
 [536, 5635],
 [556, 273],
 [674, 5],
 [760, 5],
 [923, 391],
 [926, 927],
 [1117, 5],
 [1192, 166],
 [1268, 108],
 [1562, 142],
 [1735, 430],
 [2601, 5],
 [3047, 7363]]

In [50]:
# Write one line per frequent pattern in the form "<support>:<word>;<word>;...".
with open('patterns.txt', 'w') as out:
    for level, level_patterns in enumerate(L):
        for position, id_pattern in enumerate(level_patterns):
            words = [vocab_list[word_id] for word_id in id_pattern]
            out.write("%d:" % (S[level][position]) + ";".join(words) + "\n")

In [67]:
# Spot check: the word stored under vocabulary id 8.
vocab_list[8]

'ago'