# Finding frequent pairs

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
data = pd.read_csv('clean_data/books_data_clean.csv')
ratings = pd.read_csv('clean_data/books_rating_clean.csv')

In [3]:
print(ratings.columns)

Index(['Unnamed: 0', 'title', 'user_id', 'helpfulness', 'score', 'summary',
       'text', 'helpfulness_count', 'helpfulness_pct'],
      dtype='object')


In [4]:
print(ratings['title'].value_counts(sort=True))

title
pride and prejudice                                                                                    5537
wuthering heights                                                                                      4922
the hobbit                                                                                             4567
the picture of dorian gray                                                                             3241
wuthering heights                                                                                      3126
                                                                                                       ... 
freelance writing                                                                                         1
whales dolphins  porpoises                                                                                1
the everyday writer                                                                                       1
assertive discipline f

In [5]:
ratings['title'] = ratings['title'].map(lambda s: s.lower() if type(s) == str else s)
ratings['title'] = ratings['title'].replace(r"\(.*\)","", regex=True)

In [6]:
print(ratings['title'].value_counts(sort=False))

title
eyewitness travel guide to europe                      3
voices from the farm adventures in community living    1
tess and the highlander                                4
communicating with orcas  the whales perspective       1
the impatient gardeners lawn book                      1
                                                      ..
the wild palms                                         6
spiders and their kin                                  3
killing mister watson                                  9
very bad deaths library edition                        9
the idea of history                                    2
Name: count, Length: 32920, dtype: int64


In [7]:
ratings['title'] = ratings['title'].map(lambda s: 'the hobbit' if 'hobbit' in s else s)
ratings['title'] = ratings['title'].map(lambda s: 'lord of the rings' if 'lord of the rings' in s else s)
ratings['title'] = ratings['title'].map(lambda s: 'pride and prejudice' if 'pride & prejudice' in s else s)
ratings['title'] = ratings['title'].map(lambda s: 'pride and prejudice' if 'pride and prejudice' in s else s)
ratings['title'] = ratings['title'].map(lambda s: 'the great gatsby' if 'great gatsby' in s else s)
ratings['title'] = ratings['title'].map(lambda s: 'to kill a mocking bird' if 'mocking bird' in s else s)
ratings['title'] = ratings['title'].map(lambda s: 'to kill a mocking bird' if 'mockingbird' in s else s)

TypeError: argument of type 'float' is not iterable

In [None]:
print(ratings['title'].value_counts()[0:100])

title
the hobbit                                                         32761
pride and prejudice                                                23405
lord of the rings                                                  14505
to kill a mocking bird                                              9947
wuthering heights                                                   8660
                                                                   ...  
the quiet american                                                   888
the giver                                                            886
alice's adventures in wonderland and through the looking glass       882
harry potter and the sorcerer's stone                                870
wuthering heights.                                                   868
Name: count, Length: 100, dtype: int64


In [None]:
print(ratings.head())

   Unnamed: 0                               title         user_id helpfulness  \
0         413  night world: daughters of darkness  A2KBHSK5BS35BH         0/0   
1         414  night world: daughters of darkness   AAGXEK8SQP7FB         0/0   
2         437  night world: daughters of darkness  A18WQCGU74XHDR         0/0   
3         446  night world: daughters of darkness  A31HGIGKNKCC2C         0/0   
4         450  night world: daughters of darkness  A2WZON0QPX7C9X         0/0   

   score                               summary  \
0    1.0  Night World 2: Daughters of Darkness   
1    5.0                           a good read   
2    5.0                               Perfect   
3    4.0                Night World - Book Two   
4    5.0                            Ash rules!   

                                                text  helpfulness_count  \
0  Night World #2: Daughters of Darkness, by L. J...                  0   
1  I loved the first book. the secondone I read i...          

A-priori algorithm

In [None]:
print(data.columns)
print(ratings.columns)

Index(['Unnamed: 0', 'title', 'description', 'authors', 'published_date',
       'categories', 'n_ratings', 'avg_rating'],
      dtype='object')
Index(['Unnamed: 0', 'title', 'user_id', 'helpfulness', 'score', 'summary',
       'text', 'helpfulness_count', 'helpfulness_pct'],
      dtype='object')


In [None]:
# get a list with books that each user reviewed 
group_by_user = ratings.groupby('user_id')['title']

In [None]:
baskets = [list(set(a[1]['title'].tolist())) for a in list(ratings.groupby('user_id'))]

In [None]:
print(baskets[0])

['the richest man in babylon', 'the richest man in babylon ', 'attitude 101']


In [None]:
### unique #items
items= ratings['title'].unique()
print(len(items))

12311


In [None]:
### hash all singletons
df_item_hash = pd.DataFrame(range(len(items)), index = list(items), columns =['hashcode'], dtype=int)
df_item_hash

Unnamed: 0,hashcode
night world: daughters of darkness,0
death dream,1
the food of love,2
the scarletti curse,3
cruel and unusual,4
...,...
deliverance from evil spirits: a practical manual,12306
"cracking da vinci's code: you've read the fiction, now read the facts",12307
the wild palms,12308
killing mister watson,12309


In [None]:
### count the items, store the count into the hashed array index

# item_count = pd.DataFrame(np.zeros((len(items),1)), index = list(items), columns =['count'], dtype=int)
item_count_arr = np.zeros((len(items),1))

for b in baskets:
    for item in b:
        idx = df_item_hash.loc[item,'hashcode']
        item_count_arr[idx] += 1
            
### find frequent items with support > s1 (here s1 = 0.02), and hash back from array index to items           
freq_items  = [df_item_hash[df_item_hash['hashcode']==x].index[0] for x in np.where(item_count_arr > 0.001*len(baskets))[0]] 
freq_items

# item_count_arr[item_count['count']>0.02*len(baskets)]
#freq_items['hashcode'] = list(range(1,len(freq_items)+1))

['economics in one lesson',
 'the scarlet letter a romance',
 'prodigal son ',
 "tess of the d'urbervilles: a pure woman ",
 'the dharma bums',
 'foundation',
 'romeo and juliet',
 'king rat',
 'helter skelter',
 'small gods',
 'hard times',
 'the clan of the cave bear',
 '1984',
 'fahrenheit 451',
 'finnegans wake',
 'one hundred years of solitude',
 'the canterbury tales',
 'prey',
 'hound of the baskervilles ',
 'leaves of grass',
 'walden',
 'third secret: a novel of suspense ',
 'candide and other writings',
 'ransom',
 'we',
 'the daughter of time',
 'in the heart of the sea: the tragedy of the whaleship essex',
 'life and death of the mayor of casterbridge ',
 'of mice and men ',
 'lord of light',
 'twenty thousand leagues under the sea ',
 'live from new york: an uncensored history of saturday night live',
 'up from slavery: an autobiography',
 'winter solstice',
 'the greatest generation',
 'mayor of casterbridge, the',
 'witch of blackbird pond ',
 'a princess of mars',
 'til

In [None]:
### hash the frequent items (starting from 1)

df_freq_item_hash = pd.DataFrame(range(1,len(freq_items)+1), index=freq_items, columns=['hashcode'])
df_freq_item_hash

Unnamed: 0,hashcode
economics in one lesson,1
the scarlet letter a romance,2
prodigal son,3
tess of the d'urbervilles: a pure woman,4
the dharma bums,5
...,...
oryx and crake,1491
alice in wonderland,1492
six days of war: june 1967 and the making of the modern middle east,1493
lost world,1494


In [None]:
### count the pairs using only frequent items, store the count into the (triangular) matrix.

# pair_mat = pd.DataFrame(np.zeros((len(freq_items.index),len(freq_items.index))), 
#                         columns=freq_items.index, index=freq_items.index,
#                        dtype=int)

pair_mat_hashed = np.zeros((len(freq_items)+1,len(freq_items)+1))
# n = len(freq_items)
# triangular_arr = np.zeros((n*n,))


for b in baskets:
    cand_list = [item for item in b if item in freq_items]
    if len(cand_list)<2:
        continue
    for idx, item1 in enumerate(cand_list):
        for item2 in cand_list[idx+1:]:
            i = df_freq_item_hash.loc[item1,'hashcode'] 
            j = df_freq_item_hash.loc[item2,'hashcode'] 
            #triangular_arr[triangular_encode(i,j,n)] +=1
            #pair_mat.loc[item1, item2] += 1
            pair_mat_hashed[max(i,j),min(i,j)]+=1

# pair_mat
pair_mat_hashed

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 4., ..., 0., 0., 0.],
       [0., 0., 3., ..., 0., 0., 0.],
       [0., 1., 2., ..., 3., 0., 0.]])

In [None]:
### extract frequent pairs that exceed support s2 (assume s2 = 0.02), and hash back.

freq_pairs = [[df_freq_item_hash[df_freq_item_hash['hashcode']==x].index[0], df_freq_item_hash[df_freq_item_hash['hashcode']==y].index[0]] for x, y in zip(*np.where(pair_mat_hashed > 0.02*len(baskets)))]
freq_pairs
# freq_pairs = [[freq_itemset[x], freq_itemset[y]] for x, y in zip(*np.where(pair_mat.values > 0.02*len(baskets)))]

[['manhattan stories from the heart of a great city', 'the great gatsby']]

### Build-in tools

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
te = TransactionEncoder()
te_ary = te.fit(baskets).transform(baskets)
df_one_hot = pd.DataFrame(te_ary, columns=te.columns_)
df_one_hot

Unnamed: 0,Unnamed: 1,"""a"" is for alibi","""c"" is for corpse","""d"" is for deadbeat","""f"" is for fugitive: a kinsey millhone mystery","""hey, whipple, squeeze this"": a guide to creating great ads","""i, the jury""","""more more more,"" said the baby board book","""stand back,"" said the elephant, ""i'm going to sneeze!""","'an essay concerning human understanding,'",...,zero: the biography of a dangerous idea,zig ziglar's secrets of closing the sale,zin! zin! zin! a violin,"zipporah, wife of moses: a novel",zlata's diary,zodiac,zondervan niv study bible,zorba the greek,zorro - a novel,zuleika dobson
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63951,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
63952,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
63953,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
63954,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
frq_items = apriori(df_one_hot, min_support = 0.02, use_colnames = True)
frq_items['length'] = frq_items['itemsets'].apply(lambda x: len(x))
frq_items

Unnamed: 0,support,itemsets,length
0,0.036072,(lord of the rings),1
1,0.021562,(manhattan stories from the heart of a great c...,1
2,0.027847,(pride and prejudice),1
3,0.021671,(the great gatsby),1
4,0.055491,(the hobbit),1
5,0.025158,(to kill a mocking bird),1
6,0.021562,"(the great gatsby, manhattan stories from the ...",2


In [None]:
### reformat a little, to put the frozenset into lists

ml_freq_items = []
for i in frq_items[frq_items['length']==1].itemsets.values:
    ml_freq_items.extend(list(i))
    
ml_freq_pairs = []
for i in frq_items[frq_items['length']==2].itemsets.values:
    ml_freq_pairs.append(list(i))

### check if the frequent itemsets found by ourselves and mlxtend are the same
for i in ml_freq_items:
    if i not in freq_items:
        print(i)
        
len(ml_freq_pairs)==len(freq_pairs)

True

In [None]:
association_rules(frq_items, metric="confidence", min_threshold=0.6)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(the great gatsby),(manhattan stories from the heart of a great c...,0.021671,0.021562,0.021562,0.994949,46.1443,0.021094,193.730784,1.0
1,(manhattan stories from the heart of a great c...,(the great gatsby),0.021562,0.021671,0.021562,1.0,46.1443,0.021094,inf,0.999888


In [None]:
association_rules(frq_items, metric="lift", min_threshold=1.2)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(the great gatsby),(manhattan stories from the heart of a great c...,0.021671,0.021562,0.021562,0.994949,46.1443,0.021094,193.730784,1.0
1,(manhattan stories from the heart of a great c...,(the great gatsby),0.021562,0.021671,0.021562,1.0,46.1443,0.021094,inf,0.999888
