Association Rule Mining on the Radio Dataset
#### By: Saurabh Gupta

In [1]:
import pandas as pd
import numpy as np

In [2]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [3]:
# read the raw user/artist records from lastfm.csv (path is relative to the working directory)
lastfm1=pd.read_csv("./lastfm.csv")

In [4]:
# keep just the user/artist pairs, add a constant freq column of 1,
# and store the user ids as strings
lastfm1 = (
    lastfm1[['user', 'artist']]
    .assign(freq=1)
    .astype({'user': 'str'})
)
lastfm1

Unnamed: 0,user,artist,freq
0,1,red hot chili peppers,1
1,1,the black dahlia murder,1
2,1,goldfrapp,1
3,1,dropkick murphys,1
4,1,le tigre,1
...,...,...,...
289950,19718,bob dylan,1
289951,19718,pixies,1
289952,19718,the clash,1
289953,19718,a tribe called quest,1


In [5]:
# Pivot to one row per user and one column per artist. Each cell holds the
# summed freq for that (user, artist) pair; pairs that never occur come out
# of unstack() as NaN and are filled with 0.
lastfmgrp = (
    lastfm1
    .groupby(['user', 'artist'])['freq']
    .sum()
    .unstack()   # 'user' stays as the row index, artists become the columns
    .fillna(0)
)

In [6]:
# show the user-by-artist matrix built in the previous cell
lastfmgrp

artist,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,311,36 crazyfists,44,50 cent,65daysofstatic,[unknown],...,wilco,within temptation,wolfgang amadeus mozart,wu-tang clan,yann tiersen,yeah yeah yeahs,yellowcard,yo la tengo,zero 7,Édith piaf
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10002,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# an encoding function to represent above data as boolean
def encode_freq(x):
    """Collapse a listen count into a 0/1 presence flag.

    Parameters
    ----------
    x : int or float
        Number of times a user/artist pair occurs.

    Returns
    -------
    int
        1 when ``x`` is strictly positive, otherwise 0.

    Notes
    -----
    Every numeric input yields 0 or 1, so the function never returns
    ``None``: fractional values in (0, 1) count as present and NaN counts
    as absent.
    """
    # NaN compares False against everything, so it lands in the 0 branch.
    return 1 if x > 0 else 0

In [8]:
# Binarise the count matrix for apriori: 1 where the user/artist pair occurs
# at least once, 0 otherwise. Every cell is either 0 or a positive count, so
# this comparison yields the same flags as running encode_freq on each cell,
# without one Python call per cell and without the deprecated
# DataFrame.applymap.
freq_set = lastfmgrp.gt(0).astype('int64')

In [9]:
# inspect the 0/1 encoded matrix
freq_set

artist,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,311,36 crazyfists,44,50 cent,65daysofstatic,[unknown],...,wilco,within temptation,wolfgang amadeus mozart,wu-tang clan,yann tiersen,yeah yeah yeahs,yellowcard,yo la tengo,zero 7,Édith piaf
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# mine frequent itemsets with apriori, using a minimum support of 0.01 and
# labelling the itemsets with the artist column names instead of column indices
freq_item = apriori(
    freq_set,
    min_support=0.01,
    use_colnames=True,
)

In [14]:
# derive association rules with a lift of at least 1,
# then show the five rules with the highest lift
rules = association_rules(
    freq_item,
    metric="lift",
    min_threshold=1,
)
rules.sort_values('lift', ascending=False).head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1031,(nas),(jay-z),0.024867,0.030333,0.0106,0.426273,14.052971,0.009846,1.69012
1030,(jay-z),(nas),0.030333,0.024867,0.0106,0.349451,14.052971,0.009846,1.498938
1783,(the pussycat dolls),(rihanna),0.018,0.043067,0.0104,0.577778,13.415893,0.009625,2.266421
1782,(rihanna),(the pussycat dolls),0.043067,0.018,0.0104,0.241486,13.415893,0.009625,1.294637
243,(rihanna),(beyoncé),0.043067,0.029733,0.013933,0.323529,10.881034,0.012653,1.434307


In [12]:
del freq_item  # drop the frequent itemsets now that `rules` has been derived from them; the frame consumes a lot of memory

## Observations and Conclusions:

### With a minimum support of 0.01, we examined the top 5 rules ranked by lift.
### Among those, <i>(the pussycat dolls) → (rihanna)</i> is the best choice: it has the highest confidence of the five (about 0.58) while its lift (about 13.4) is close to the maximum.