In [1]:
import pandas as pd
import numpy as np

### Import the data in csv format

In [2]:
data=pd.read_csv('online_retail.csv')
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description
0,562583,35637A,IVORY STRING CURTAIN WITH POLE
1,562583,35638A,PINK AND BLACK STRING CURTAIN
2,562583,84927F,PSYCHEDELIC TILE HOOK
3,562583,22425,ENAMEL COLANDER CREAM
4,562583,16008,SMALL FOLDING SCISSOR(POINTED EDGE)


### Checking the dataset
As it can be seen that, each row is sale of an item with similar Invoice No. So, we have to group all the items brought by the customer on the same invoice number. Lets keep StockCode for mining the rule then we can replace the StockCode with the Desciption of the item.

In [3]:
new=data.groupby('InvoiceNo')['StockCode'].apply(list)
transanctions=list(new)
#print(transanctions)

Total number of unique items in the dataset are:

In [4]:
len(data.StockCode.unique())

3353

### Movie Lense dataset
Lets import and see the movie lense dataset.

In [5]:
movie_data=pd.read_csv('movielens_movies.csv')
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


So the assoication in the movie dataset would be done on the genres. So, we have to add generes as the tranasaction data for better analysis.

In [6]:
#extract on the basis of the | character.
movie_transanctions=movie_data['genres'].apply(lambda t: t.split('|'))
#convert this as the list
movie_transanctions=list(movie_transanctions)

Lets print and see the first transaction

In [7]:
print(movie_transanctions[0])

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']


### Preparing the dataset for Association Rule Mining

### Online Retail Dataset

In [8]:
#The online retail dataset
#import the package for creation of one-hot encoder
from mlxtend.preprocessing import TransactionEncoder
#Get unique values from the dataset for encoding
encoderRetail=TransactionEncoder().fit(transanctions)
#Create one hot encoder
oneHotRetail=encoderRetail.transform(transanctions)
#Create the dataframe
oneHotRetail=pd.DataFrame(oneHotRetail, columns=encoderRetail.columns_)
print(oneHotRetail)

      10080  10120  10124A  10124G  10125  10133  10135  11001  15030  15034  \
0     False  False   False   False  False  False  False  False  False  False   
1     False  False   False   False  False  False  False  False  False  False   
2     False  False   False   False  False  False  False  False  False  False   
3     False  False   False   False  False  False  False  False  False  False   
4     False  False   False   False  False  False  False  False  False  False   
...     ...    ...     ...     ...    ...    ...    ...    ...    ...    ...   
9704  False  False   False   False  False  False  False  False  False  False   
9705  False  False   False   False  False  False  False  False  False  False   
9706  False  False   False   False  False  False  False  False  False  False   
9707  False  False   False   False  False  False  False  False  False  False   
9708  False  False   False   False  False  False  False  False  False  False   

      ...  DCGSSBOY  DCGSSGIRL    DOT  

Calculate the support value

In [9]:
oneHotRetail.mean()

10080           0.001545
10120           0.001545
10124A          0.000103
10124G          0.000206
10125           0.001133
                  ...   
POST            0.007519
S               0.000721
gift_0001_10    0.000412
gift_0001_20    0.000103
gift_0001_30    0.000412
Length: 3353, dtype: float64

### Movie Lense Dataset

In [10]:
encoderMovie=TransactionEncoder().fit(movie_transanctions)
oneHotMovie=encoderMovie.transform(movie_transanctions)
oneHotMovie=pd.DataFrame(oneHotMovie, columns=encoderMovie.columns_)
print(oneHotMovie)

       (no genres listed)  Action  Adventure  Animation  Children  Comedy  \
0                   False   False       True       True      True    True   
1                   False   False       True      False      True   False   
2                   False   False      False      False     False    True   
3                   False   False      False      False     False    True   
4                   False   False      False      False     False    True   
...                   ...     ...        ...        ...       ...     ...   
27273               False   False      False      False     False    True   
27274               False   False      False      False     False    True   
27275               False   False       True      False     False   False   
27276                True   False      False      False     False   False   
27277               False   False       True      False     False   False   

       Crime  Documentary  Drama  Fantasy  Film-Noir  Horror   IMAX  Musica

Computing the support value

In [11]:
oneHotMovie.mean()

(no genres listed)    0.009018
Action                0.129042
Adventure             0.085380
Animation             0.037649
Children              0.041755
Comedy                0.306987
Crime                 0.107743
Documentary           0.090586
Drama                 0.489185
Fantasy               0.051763
Film-Noir             0.012098
Horror                0.095718
IMAX                  0.007185
Musical               0.037979
Mystery               0.055503
Romance               0.151294
Sci-Fi                0.063898
Thriller              0.153164
War                   0.043772
Western               0.024782
dtype: float64

Calculate comfidence value of Drama and Comedy

In [12]:
def confidence_fun(antecedent, consequent):
    #find support of antecedent and consequent individual
    supportA=antecedent.mean()
    supportC=consequent.mean()
    #find support for both combined
    supportAC=np.logical_and(antecedent,consequent).mean()

    #calculate the confidence value
    return supportAC/supportA

#print the value
print(confidence_fun(oneHotMovie['Drama'], oneHotMovie['Comedy']))

0.19079736211031173


Also create the lift value

In [13]:
def lift_fun(antecedent, consequent):
    #find support of antecedent and consequent individual
    supportA=antecedent.mean()
    supportC=consequent.mean()
    #find support for both combined
    supportAC=np.logical_and(antecedent,consequent).mean()

    return supportAC/(supportA*supportC)

print(lift_fun(oneHotMovie['Drama'],oneHotMovie['Comedy']))

0.6215154578033297


As the value of lift is less than 1 than we have to say that this is not a good association rule. Now lets find leverage and conviction

In [14]:
def leverage_fun(antecedent, consequent):
    #find support of antecedent and consequent individual
    supportA=antecedent.mean()
    supportC=consequent.mean()
    #find support for both combined
    supportAC=np.logical_and(antecedent,consequent).mean()

    #leverage of comedy to drama
    return supportAC - (supportA*supportC)

print(leverage_fun(oneHotMovie['Drama'],oneHotMovie['Comedy']))
#Calculate the conviction, we have the combined support of Drama and Comedy
def conviction_fun(antecedent, consequent):
    #find support of antecedent and consequent individual
    supportA=antecedent.mean()
    supportC=consequent.mean()
    #find support for both combined
    supportAC=np.logical_and(antecedent,consequent).mean()

    #compute support for not drama
    supportnA=1-supportA
    #support for comedy but not drama
    supportCnA=supportC-supportAC

    #compute conviction
    return (supportC*supportnA)/supportCnA

print(conviction_fun(oneHotMovie['Drama'],oneHotMovie['Comedy']))


-0.05683843177077659
0.7339672714750785


### Zhang metric
Zhang in 2000 derived a new metric which not only calculates the association, it does find the amount of disassociation between antecedent and consequent.
Lets create a function fpr zhang metric and find their values.

In [15]:
def zhang(antecedent, consequent):
    #compute support for anecedent and consequent
    supportA=antecedent.mean()
    supportC=consequent.mean()

    #compute support for both
    supportAC=np.logical_and(antecedent, consequent).mean()

    #find numerator
    num= supportAC- (supportA*supportC)

    #find demonator
    dem=max(supportAC * (1-supportA), supportA*(supportC-supportAC))

    return num/dem

In [16]:
#lets create the itemset
movie_items= list(oneHotMovie.columns)
#remove the no genre listed
movie_items.remove('(no genres listed)')

#create permutations
from itertools import permutations

movie_items=list(permutations(movie_items,2))
print(len(movie_items))

342


So we have 342 different rules from our movie dataset, lets create zhang index for these rules.
Before that create a new dataframe with all values

In [17]:
rules=pd.DataFrame(movie_items, columns=['antecedent', 'consequent'])
print(rules)

    antecedent consequent
0       Action  Adventure
1       Action  Animation
2       Action   Children
3       Action     Comedy
4       Action      Crime
..         ...        ...
337    Western    Mystery
338    Western    Romance
339    Western     Sci-Fi
340    Western   Thriller
341    Western        War

[342 rows x 2 columns]


In [18]:
# Define an empty list for all metrics
rules=pd.DataFrame(movie_items, columns=['antecedent', 'consequent'])
support=[]
support_consequent=[]
confidence=[]
lift=[]
leverage=[]
conviction=[]
zhangs = []

for itemset in movie_items:
    antecedent=oneHotMovie[itemset[0]]
    consequent=oneHotMovie[itemset[1]]
    #add metrics
    support.append(antecedent.mean())
    support_consequent.append(consequent.mean())
    zhangs.append(zhang(antecedent, consequent))
    confidence.append(confidence_fun(antecedent, consequent))
    lift.append(lift_fun(antecedent, consequent))
    leverage.append(leverage_fun(antecedent, consequent))
    conviction.append(conviction_fun(antecedent, consequent))
    
# Print results
rules['support_antecedent']= support
rules['support_consequent']=support_consequent
rules['zhang'] = zhangs
rules['confidence']= confidence
rules['lift']=lift
rules['leverage']=leverage
rules['conviction']=conviction

#print the rules dataframe
print(rules.head(5))


  antecedent consequent  support_antecedent  support_consequent     zhang  \
0     Action  Adventure            0.129042            0.085380  0.793154   
1     Action  Animation            0.129042            0.037649  0.379671   
2     Action   Children            0.129042            0.041755 -0.336121   
3     Action     Comedy            0.129042            0.306987 -0.366056   
4     Action      Crime            0.129042            0.107743  0.580437   

   confidence      lift  leverage  conviction  
0    0.276136  3.234198  0.024616    1.494813  
1    0.056250  1.494048  0.002400    1.078980  
2    0.028977  0.693979 -0.001649    0.956626  
3    0.204261  0.665374 -0.013256    0.952764  
4    0.217898  2.022393  0.014215    1.178520  


### Multi criteria assoication rule mining

In [19]:
# Select the subset of rules with antecedent support greater than 0.05
rules = rules[rules['support_antecedent'] > 0.05]

# Select the subset of rules with a consequent support greater than 0.02
rules = rules[rules['support_consequent'] > 0.02]

# Select the subset of rules with a conviction greater than 1.01
rules = rules[rules['confidence']>0.1]

# Set the lift threshold to 1.5
rules = rules[rules['lift'] > 1.5]

# Set the conviction threshold to 1.0
rules = rules[rules['conviction']>1]

# Set the threshold for Zhang's rule to 0.65
rules = rules[rules['zhang']>0.65]

# Print rule
print(rules[['antecedent','consequent']])

    antecedent consequent
0       Action  Adventure
14      Action     Sci-Fi
18   Adventure     Action
19   Adventure  Animation
20   Adventure   Children
25   Adventure    Fantasy
32   Adventure     Sci-Fi
102      Crime    Mystery
105      Crime   Thriller
145    Fantasy  Adventure
146    Fantasy  Animation
147    Fantasy   Children
192     Horror    Mystery
194     Horror     Sci-Fi
195     Horror   Thriller
239    Mystery      Crime
244    Mystery     Horror
249    Mystery   Thriller
270     Sci-Fi     Action
271     Sci-Fi  Adventure
280     Sci-Fi     Horror
293   Thriller      Crime
298   Thriller     Horror
301   Thriller    Mystery


### Apriori Algorithm

In [24]:
# Import Apriori algorithm
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
# Compute frequent itemsets
frequent_itemsets = apriori(oneHotMovie, min_support = 0.0005, max_len = 4, use_colnames = True)
# Print number of itemsets
print(len(frequent_itemsets))

532


In [22]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.009018,((no genres listed))
1,0.129042,(Action)
2,0.085380,(Adventure)
3,0.037649,(Animation)
4,0.041755,(Children)
...,...,...
527,0.000807,"(Horror, Thriller, Sci-Fi, Drama)"
528,0.001210,"(Romance, Thriller, Drama, Mystery)"
529,0.001100,"(Thriller, Sci-Fi, Drama, Mystery)"
530,0.000623,"(Horror, Thriller, Fantasy, Mystery)"


In [26]:
# Compute association rules
rules = association_rules(frequent_itemsets, metric = "support", min_threshold = 0.001)
print(rules)

           antecedents                  consequents  antecedent support  \
0             (Action)                  (Adventure)            0.129042   
1          (Adventure)                     (Action)            0.085380   
2             (Action)                  (Animation)            0.129042   
3          (Animation)                     (Action)            0.037649   
4           (Children)                     (Action)            0.041755   
...                ...                          ...                 ...   
1849  (Drama, Mystery)           (Thriller, Sci-Fi)            0.026432   
1850        (Thriller)     (Sci-Fi, Drama, Mystery)            0.153164   
1851          (Sci-Fi)   (Thriller, Drama, Mystery)            0.063898   
1852           (Drama)  (Thriller, Sci-Fi, Mystery)            0.489185   
1853         (Mystery)    (Thriller, Sci-Fi, Drama)            0.055503   

      consequent support   support  confidence      lift  leverage  conviction  
0               0.

In [27]:
print(rules.columns)

Index(['antecedents', 'consequents', 'antecedent support',
       'consequent support', 'support', 'confidence', 'lift', 'leverage',
       'conviction'],
      dtype='object')


In [29]:
rules = association_rules(frequent_itemsets, metric = "antecedent support", min_threshold = 0.002)
print(len(rules))

3303


In [30]:
rules = association_rules(frequent_itemsets, metric = "lift", min_threshold = 1)
print(len(rules))

2358


In [34]:
filtered_rules = rules[(rules['antecedent support'] > 0.001) & (rules['support'] > 0.009) & (rules['confidence'] > 0.25) & (rules['lift'] > 1.00)]
print(len(filtered_rules))

47


In [37]:
filtered_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Action),(Adventure),0.129042,0.08538,0.035633,0.276136,3.234198,0.024616,1.263525
1,(Adventure),(Action),0.08538,0.129042,0.035633,0.417347,3.234198,0.024616,1.494813
4,(Crime),(Action),0.107743,0.129042,0.028118,0.260973,2.022393,0.014215,1.17852
11,(Sci-Fi),(Action),0.063898,0.129042,0.023499,0.367757,2.849906,0.015253,1.377568
12,(Thriller),(Action),0.153164,0.129042,0.040655,0.265438,2.056994,0.020891,1.185684
13,(Action),(Thriller),0.129042,0.153164,0.040655,0.315057,2.056994,0.020891,1.23636
15,(War),(Action),0.043772,0.129042,0.010961,0.250419,1.940603,0.005313,1.161926
18,(Animation),(Adventure),0.037649,0.08538,0.012904,0.342746,4.01435,0.00969,1.391577
20,(Children),(Adventure),0.041755,0.08538,0.016497,0.395083,4.627344,0.012932,1.511977
22,(Fantasy),(Adventure),0.051763,0.08538,0.018733,0.361898,4.238666,0.014313,1.433344
