# Finding frequent pairs

In [37]:
import pandas as pd
import numpy as np
import re

In [38]:
data = pd.read_csv('clean_data/books_data_clean.csv')
ratings = pd.read_csv('clean_data/books_rating_clean.csv')

In [39]:
print(ratings.columns)

Index(['Unnamed: 0', 'title', 'user_id', 'helpfulness', 'score', 'summary',
       'text', 'helpfulness_count', 'helpfulness_pct'],
      dtype='object')


In [40]:
print(ratings['title'].value_counts(sort=True))

title
wuthering heights                                                                        8048
pride and prejudice                                                                      7166
the picture of dorian gray                                                               4716
little women or meg jo beth and amy                                                      3294
emma                                                                                     3135
                                                                                         ... 
staying sober a guide for relapse prevention based upon the cenaps model of treatment       1
the world rushed in the california gold rush experience                                     1
the naive and sentimental lover                                                             1
charms and charm bracelets the complete guide                                               1
smalltalk best practice patterns                      

In [41]:
print(ratings['title'].value_counts(sort=False))

title
eyewitness travel guide to europe                      3
voices from the farm adventures in community living    1
tess and the highlander                                4
communicating with orcas  the whales perspective       1
the impatient gardeners lawn book                      1
                                                      ..
rain village signed                                    4
spiders and their kin                                  3
killing mister watson                                  9
very bad deaths library edition                        9
the idea of history                                    2
Name: count, Length: 32101, dtype: int64


In [42]:
print(ratings['title'].value_counts()[0:100])

title
wuthering heights                            8048
pride and prejudice                          7166
the picture of dorian gray                   4716
little women or meg jo beth and amy          3294
emma                                         3135
                                             ... 
a farewell to arms                            395
a journey to the center of the earth          383
nineteen eightyfour                           379
george orwell 1984                            378
zen and the art of motorcycle maintenance     368
Name: count, Length: 100, dtype: int64


In [43]:
print(ratings.head())

   Unnamed: 0                                              title  \
0         140                  eyewitness travel guide to europe   
1         141                  eyewitness travel guide to europe   
2         142                  eyewitness travel guide to europe   
3         182  voices from the farm adventures in community l...   
4         217                            tess and the highlander   

          user_id helpfulness  score  \
0  A281NPSIMI1C2R       19/19    5.0   
1  A2TAPL67U2A5HM       12/13    5.0   
2   AT9YSY20RJUDX       11/13    4.0   
3  A1ER5AYS3FQ9O3         6/7    5.0   
4  A2VCGJLKGK2WJJ       17/18    5.0   

                                             summary  \
0    The Major Sights in Twenty Countries - Amazing!   
1   Europe (Eyewitness Travel Guides) by Dan Colwell   
2              Helpful guide on tours through Europe   
3  Intense memoir told in brief segments by forme...   
4  My new favorite book from the Avon True Romanc...   

             

A-priori algorithm

In [44]:
print(data.columns)
print(ratings.columns)

Index(['Unnamed: 0', 'title', 'description', 'authors', 'published_date',
       'categories', 'n_ratings', 'avg_rating'],
      dtype='object')
Index(['Unnamed: 0', 'title', 'user_id', 'helpfulness', 'score', 'summary',
       'text', 'helpfulness_count', 'helpfulness_pct'],
      dtype='object')


In [45]:
# get a list with books that each user reviewed 
group_by_user = ratings.groupby('user_id')['title']

In [46]:
baskets = [list(set(a[1]['title'].tolist())) for a in list(ratings.groupby('user_id'))]

In [47]:
print(baskets[0])

['20 000 leagues under the sea', 'the annotated jules verne twenty thousand leagues under the sea', 'the mysterious island', 'a journey to the center of the earth', 'twenty thousand leagues under the sea', 'journey to the centre of the earth', 'journey to the center of the earth', 'a journey to the centre of the earth']


In [48]:
### unique #items
items= ratings['title'].unique()
print(len(items))

32102


In [49]:
### hash all singletons
df_item_hash = pd.DataFrame(range(len(items)), index = list(items), columns =['hashcode'], dtype=int)
df_item_hash

Unnamed: 0,hashcode
eyewitness travel guide to europe,0
voices from the farm adventures in community living,1
tess and the highlander,2
communicating with orcas the whales perspective,3
the impatient gardeners lawn book,4
...,...
rain village signed,32097
spiders and their kin,32098
killing mister watson,32099
very bad deaths library edition,32100


In [50]:
### count the items, store the count into the hashed array index

# item_count = pd.DataFrame(np.zeros((len(items),1)), index = list(items), columns =['count'], dtype=int)
item_count_arr = np.zeros((len(items),1))

for b in baskets:
    for item in b:
        idx = df_item_hash.loc[item,'hashcode']
        item_count_arr[idx] += 1
            
### find frequent items with support > s1 (here s1 = 0.02), and hash back from array index to items           
freq_items  = [df_item_hash[df_item_hash['hashcode']==x].index[0] for x in np.where(item_count_arr > 0.02*len(baskets))[0]] 
freq_items

# item_count_arr[item_count['count']>0.02*len(baskets)]
#freq_items['hashcode'] = list(range(1,len(freq_items)+1))

['the scarlet letter a romance',
 '1984',
 'fahrenheit 451',
 'prey',
 'of mice and men',
 'mere christianity',
 'lord of the flies',
 'emma',
 'ulysses',
 'the stranger',
 'monte cristo',
 'dune',
 'jane eyre',
 'the hobbitt or there and back again illustrated by the author',
 'little women',
 'great expectations',
 'a christmas carol in prose',
 'robert louis stevensons treasure island',
 'persuasion',
 'east of eden',
 'the scarlet letter',
 'brave new world',
 'little women or meg jo beth and amy',
 'the giver',
 'the old man and the sea',
 'frankenstein',
 'a confederacy of dunces',
 'the picture of dorian gray',
 'the great gatsby',
 'slaughterhousefive  or the childrens crusade',
 'the hitchhikers guide to the galaxy',
 'in cold blood a true account of a multiple murder and its consequences',
 'moby dick',
 'guns germs and steel the fates of human societies',
 'night',
 'the adventures of huckleberry finn',
 'a christmas carol',
 'pride and prejudice',
 'wuthering heights',
 'th

In [51]:
### hash the frequent items (starting from 1)

df_freq_item_hash = pd.DataFrame(range(1,len(freq_items)+1), index=freq_items, columns=['hashcode'])
df_freq_item_hash

Unnamed: 0,hashcode
the scarlet letter a romance,1
1984,2
fahrenheit 451,3
prey,4
of mice and men,5
...,...
the corrections,137
a tale of two cities literary touchstone edition,138
under the banner of heaven,139
of mice and men hb,140


In [52]:
### count the pairs using only frequent items, store the count into the (triangular) matrix.

# pair_mat = pd.DataFrame(np.zeros((len(freq_items.index),len(freq_items.index))), 
#                         columns=freq_items.index, index=freq_items.index,
#                        dtype=int)

pair_mat_hashed = np.zeros((len(freq_items)+1,len(freq_items)+1))
# n = len(freq_items)
# triangular_arr = np.zeros((n*n,))


for b in baskets:
    cand_list = [item for item in b if item in freq_items]
    if len(cand_list)<2:
        continue
    for idx, item1 in enumerate(cand_list):
        for item2 in cand_list[idx+1:]:
            i = df_freq_item_hash.loc[item1,'hashcode'] 
            j = df_freq_item_hash.loc[item2,'hashcode'] 
            #triangular_arr[triangular_encode(i,j,n)] +=1
            #pair_mat.loc[item1, item2] += 1
            pair_mat_hashed[max(i,j),min(i,j)]+=1

# pair_mat
pair_mat_hashed

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., 28.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  4., 13., ...,  0.,  0.,  0.],
       [ 0., 26., 37., ..., 11.,  0.,  0.],
       [ 0., 10., 24., ...,  3., 10.,  0.]])

In [53]:
### extract frequent pairs that exceed support s2 (assume s2 = 0.02), and hash back.

freq_pairs = [[df_freq_item_hash[df_freq_item_hash['hashcode']==x].index[0], df_freq_item_hash[df_freq_item_hash['hashcode']==y].index[0]] for x, y in zip(*np.where(pair_mat_hashed > 0.02*len(baskets)))]
freq_pairs
# freq_pairs = [[freq_itemset[x], freq_itemset[y]] for x, y in zip(*np.where(pair_mat.values > 0.02*len(baskets)))]

[['the scarlet letter', 'the scarlet letter a romance'],
 ['little women or meg jo beth and amy', 'little women'],
 ['a christmas carol', 'a christmas carol in prose'],
 ['the hobbit',
  'the hobbitt or there and back again illustrated by the author'],
 ['great gatsby', 'the great gatsby'],
 ['the fellowship of the rings', 'the fellowship of the ring'],
 ['adventures of huckleberry finn', 'the adventures of huckleberry finn'],
 ['jane eyre complete and unabridged', 'jane eyre'],
 ['huckleberry finn', 'the adventures of huckleberry finn'],
 ['huckleberry finn', 'adventures of huckleberry finn'],
 ['jonathan strange and mr norrell', 'jonathan strange and mr norrell signed'],
 ['heart of darkness and the secret sharer', 'heart of darkness'],
 ['slaughterhouse five', 'slaughterhousefive  or the childrens crusade'],
 ['treasure island', 'robert louis stevensons treasure island'],
 ['the lord of the rings the fellowship of the ring',
  'the fellowship of the ring'],
 ['the lord of the rings 

### Build-in tools

In [54]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [55]:
te = TransactionEncoder()
te_ary = te.fit(baskets).transform(baskets)
df_one_hot = pd.DataFrame(te_ary, columns=te.columns_)
df_one_hot

TypeError: '<' not supported between instances of 'float' and 'str'

In [None]:
frq_items = apriori(df_one_hot, min_support = 0.02, use_colnames = True)
frq_items['length'] = frq_items['itemsets'].apply(lambda x: len(x))
frq_items

Unnamed: 0,support,itemsets,length
0,0.036072,(lord of the rings),1
1,0.021562,(manhattan stories from the heart of a great c...,1
2,0.027847,(pride and prejudice),1
3,0.021671,(the great gatsby),1
4,0.055491,(the hobbit),1
5,0.025158,(to kill a mocking bird),1
6,0.021562,"(the great gatsby, manhattan stories from the ...",2


In [None]:
### reformat a little, to put the frozenset into lists

ml_freq_items = []
for i in frq_items[frq_items['length']==1].itemsets.values:
    ml_freq_items.extend(list(i))
    
ml_freq_pairs = []
for i in frq_items[frq_items['length']==2].itemsets.values:
    ml_freq_pairs.append(list(i))

### check if the frequent itemsets found by ourselves and mlxtend are the same
for i in ml_freq_items:
    if i not in freq_items:
        print(i)
        
len(ml_freq_pairs)==len(freq_pairs)

True

In [None]:
association_rules(frq_items, metric="confidence", min_threshold=0.6)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(the great gatsby),(manhattan stories from the heart of a great c...,0.021671,0.021562,0.021562,0.994949,46.1443,0.021094,193.730784,1.0
1,(manhattan stories from the heart of a great c...,(the great gatsby),0.021562,0.021671,0.021562,1.0,46.1443,0.021094,inf,0.999888


In [None]:
association_rules(frq_items, metric="lift", min_threshold=1.2)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(the great gatsby),(manhattan stories from the heart of a great c...,0.021671,0.021562,0.021562,0.994949,46.1443,0.021094,193.730784,1.0
1,(manhattan stories from the heart of a great c...,(the great gatsby),0.021562,0.021671,0.021562,1.0,46.1443,0.021094,inf,0.999888
