## Association rule mining for FM artist data

## Given set of users and the corresponding FM artist they listen to, use association rule mining to generate rules of which artists are listened together. This can be used to improved artists and song recommendations to users in future. 

In [45]:
import pandas as pd
import os
import numpy as np
from mlxtend.frequent_patterns import apriori as ap, association_rules

In [2]:
import os
os.chdir("C:\\Users\\satish\\Downloads")

In [4]:
# READ DATA: consists of user and artist the user has purchased/listened
lastfm = pd.read_csv("lastfm.csv")
print(lastfm.head(50))
print(lastfm.shape)

    user                   artist sex         country
0      1    red hot chili peppers   f         Germany
1      1  the black dahlia murder   f         Germany
2      1                goldfrapp   f         Germany
3      1         dropkick murphys   f         Germany
4      1                 le tigre   f         Germany
5      1               schandmaul   f         Germany
6      1                    edguy   f         Germany
7      1             jack johnson   f         Germany
8      1                eluveitie   f         Germany
9      1              the killers   f         Germany
10     1             judas priest   f         Germany
11     1               rob zombie   f         Germany
12     1               john mayer   f         Germany
13     1                  the who   f         Germany
14     1               guano apes   f         Germany
15     1       the rolling stones   f         Germany
16     3         devendra banhart   m   United States
17     3         boards of c

In [6]:
lastfm = lastfm[['user','artist']]
lastfm = lastfm.drop_duplicates()
lastfm.shape

(289953, 2)

In [7]:
# form list of lists for artists of each user
records = []
for i in lastfm['user'].unique():
    records.append(list(lastfm[lastfm['user'] == i]['artist'].values)) # subset to match user, then select user's artists

# we care only about artists, not the users

In [8]:
len(records)

15000

### Support A -> B: P(A & B)
### Confidence A -> B: P(B | A)
### Lift A -> B: confidence(A -> B)/support(B) ; we want this to be more than 1, meaning keeping together is better than keeping B alone; if it's less than 1, means keeping B alone is better

## 1. Using apyori and list format

In [20]:
from apyori import apriori

association_rules = apriori(records, min_support = 0.01, min_confidence = 0.2, min_lift = 2, min_length = 2)
association_results = list(association_rules)

In [21]:
print("There are {} Relation derived.".format(len(association_results)))


There are 619 Relation derived.


In [22]:
association_results

[RelationRecord(items=frozenset({'3 doors down', 'linkin park'}), support=0.011066666666666667, ordered_statistics=[OrderedStatistic(items_base=frozenset({'3 doors down'}), items_add=frozenset({'linkin park'}), confidence=0.3577586206896552, lift=3.6431631434791774)]),
 RelationRecord(items=frozenset({'linkin park', '30 seconds to mars'}), support=0.012533333333333334, ordered_statistics=[OrderedStatistic(items_base=frozenset({'30 seconds to mars'}), items_add=frozenset({'linkin park'}), confidence=0.38211382113821135, lift=3.8911794413259813)]),
 RelationRecord(items=frozenset({'muse', 'a perfect circle'}), support=0.011333333333333334, ordered_statistics=[OrderedStatistic(items_base=frozenset({'a perfect circle'}), items_add=frozenset({'muse'}), confidence=0.308529945553539, lift=2.7048212643501377)]),
 RelationRecord(items=frozenset({'nine inch nails', 'a perfect circle'}), support=0.010333333333333333, ordered_statistics=[OrderedStatistic(items_base=frozenset({'a perfect circle'}),

In [23]:
# Display some of the rules
for i in range(0, 100):
    print(association_results[i][0])

frozenset({'3 doors down', 'linkin park'})
frozenset({'linkin park', '30 seconds to mars'})
frozenset({'muse', 'a perfect circle'})
frozenset({'nine inch nails', 'a perfect circle'})
frozenset({'radiohead', 'a perfect circle'})
frozenset({'tool', 'a perfect circle'})
frozenset({'aerosmith', 'ac/dc'})
frozenset({'black sabbath', 'ac/dc'})
frozenset({'iron maiden', 'ac/dc'})
frozenset({'led zeppelin', 'ac/dc'})
frozenset({'metallica', 'ac/dc'})
frozenset({'ac/dc', 'pink floyd'})
frozenset({'ac/dc', 'queen'})
frozenset({'aerosmith', 'metallica'})
frozenset({'red hot chili peppers', 'aerosmith'})
frozenset({'björk', 'air'})
frozenset({'boards of canada', 'air'})
frozenset({'daft punk', 'air'})
frozenset({'goldfrapp', 'air'})
frozenset({'massive attack', 'air'})
frozenset({'moby', 'air'})
frozenset({'portishead', 'air'})
frozenset({'radiohead', 'air'})
frozenset({'röyksopp', 'air'})
frozenset({'the chemical brothers', 'air'})
frozenset({'amy winehouse', 'feist'})
frozenset({'radiohead', 'an

In [24]:
# RelationRecord(items=frozenset({'radiohead', 'broken social scene'}), support=0.015066666666666667, 
# ordered_statistics=[OrderedStatistic(items_base=frozenset({'broken social scene'}), items_add=frozenset({'radiohead'}), 
# confidence=0.5472154963680388, lift=3.0355889221599788)]),

# RelationRecord(items=frozenset({'led zeppelin', 'radiohead', 'pink floyd'}), support=0.0112, ordered_statistics=
# [OrderedStatistic(items_base=frozenset({'led zeppelin', 'radiohead'}), items_add=frozenset({'pink floyd'}), 
# confidence=0.4528301886792453, lift=4.315408405456594)])

# https://stackoverflow.com/questions/47134237/understanding-apyoris-output
# items_base is antecedant, items_add is consequent

for item in association_results:
    # first index of the inner list
    # Contains base item and add item
    pair = item[0]
    # extract each artist, make csv
    antecedant = ','.join([x for x in item[2][0][0]])
    consequent = ','.join([x for x in item[2][0][1]])
    
    print("Rule-> Antecedant: " +  antecedant + " ; Consequent: " + consequent)

    # second index of the inner list
    print("Support: " + str(item[1]))

    # third index of the list located at 0th
    # of the third index of the inner list

    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================")

Rule-> Antecedant: 3 doors down ; Consequent: linkin park
Support: 0.011066666666666667
Confidence: 0.3577586206896552
Lift: 3.6431631434791774
Rule-> Antecedant: 30 seconds to mars ; Consequent: linkin park
Support: 0.012533333333333334
Confidence: 0.38211382113821135
Lift: 3.8911794413259813
Rule-> Antecedant: a perfect circle ; Consequent: muse
Support: 0.011333333333333334
Confidence: 0.308529945553539
Lift: 2.7048212643501377
Rule-> Antecedant: a perfect circle ; Consequent: nine inch nails
Support: 0.010333333333333333
Confidence: 0.2813067150635209
Lift: 4.232297618809241
Rule-> Antecedant: a perfect circle ; Consequent: radiohead
Support: 0.013466666666666667
Confidence: 0.3666061705989111
Lift: 2.033688076547214
Rule-> Antecedant: a perfect circle ; Consequent: tool
Support: 0.016266666666666665
Confidence: 0.44283121597096187
Lift: 8.717149920688225
Rule-> Antecedant: aerosmith ; Consequent: ac/dc
Support: 0.011
Confidence: 0.3254437869822485
Lift: 5.3003874101343404
Rule-> A

Rule-> Antecedant: modest mouse ; Consequent: the white stripes
Support: 0.0106
Confidence: 0.20463320463320464
Lift: 2.9571272345838824
Rule-> Antecedant: mogwai ; Consequent: radiohead
Support: 0.0156
Confidence: 0.4474187380497132
Lift: 2.4819826445065454
Rule-> Antecedant: mogwai ; Consequent: sigur rós
Support: 0.011466666666666667
Confidence: 0.32887189292543023
Lift: 4.738788082498995
Rule-> Antecedant: morrissey ; Consequent: the smiths
Support: 0.011266666666666666
Confidence: 0.465564738292011
Lift: 8.896141496025688
Rule-> Antecedant: my chemical romance ; Consequent: muse
Support: 0.0114
Confidence: 0.2714285714285714
Lift: 2.379560824914419
Rule-> Antecedant: nine inch nails ; Consequent: muse
Support: 0.015666666666666666
Confidence: 0.23570712136409228
Lift: 2.066397907925999
Rule-> Antecedant: oasis ; Consequent: muse
Support: 0.0164
Confidence: 0.28018223234624146
Lift: 2.456302446051211
Rule-> Antecedant: muse ; Consequent: placebo
Support: 0.028066666666666667
Confid

### From the above, we can see which of the artists are bought together among users: Antecedant -> Consequent and shows the corresponding support, confidence and lift. For example, if the lift is 4, then the consequent is 4 times more probable to be purchased together with antecedant than when it's kept alone. One usecase of this is we can when a user buys an antecedeant in the future, we can reccomend him/her the consequents with high confidence and lift, as it's likely to be purchased together 

## 3. Using mlextend and matrix format
### But here we only use subset of data as it's very slow on big data

In [27]:
lastfm = lastfm.head(500)


In [38]:
# form list of lists for artists of each user
records = []
for i in lastfm['user'].unique():
    records.append(list(lastfm[lastfm['user'] == i]['artist'].values)) # subset to match user, then select user's artists

# we care only about artists, not the users

len(records)

24

In [32]:
# get unique artists
artist = lastfm['artist'].unique()

In [33]:
lastfm['user'].unique()

array([ 1,  3,  4,  5,  6,  7,  9, 12, 13, 14, 18, 19, 20, 22, 23, 24, 25,
       28, 29, 31, 32, 33, 34, 35], dtype=int64)

In [34]:
# form empty dataframe with columns

res = pd.DataFrame(columns=artist)

In [35]:
artist

array(['red hot chili peppers', 'the black dahlia murder', 'goldfrapp',
       'dropkick murphys', 'le tigre', 'schandmaul', 'edguy',
       'jack johnson', 'eluveitie', 'the killers', 'judas priest',
       'rob zombie', 'john mayer', 'the who', 'guano apes',
       'the rolling stones', 'devendra banhart', 'boards of canada',
       'cocorosie', 'aphex twin', 'animal collective', 'atmosphere',
       'joanna newsom', 'air', 'portishead', 'massive attack',
       'broken social scene', 'arcade fire', 'plaid', 'prefuse 73', 'm83',
       'the flashbulb', 'pavement', 'amon tobin', 'sage francis',
       'four tet', 'max richter', 'autechre', 'radiohead',
       'neutral milk hotel', 'beastie boys', 'aesop rock', 'mf doom',
       'the books', 'tv on the radio', 'tool', 'kyuss', 'dj shadow',
       'a tribe called quest', 'the cinematic orchestra', 'beck',
       'bon iver', 'röyksopp', 'bonobo', 'the decemberists',
       'snow patrol', 'battles', 'the prodigy', 'pink floyd', 'rjd2',
  

In [36]:
res.head()

Unnamed: 0,red hot chili peppers,the black dahlia murder,goldfrapp,dropkick murphys,le tigre,schandmaul,edguy,jack johnson,eluveitie,the killers,...,elvis presley,archive,enigma,amy macdonald,robbie williams,loreena mckennitt,damien rice,louis armstrong,black rebel motorcycle club,lcd soundsystem


In [39]:
# Form binary matrix: mark artists of user in each row as 1 
row = 0

for values in records:
    for artist in values:
        res.at[row, artist] = 1     
    row+=1

In [40]:
res.shape


(24, 346)

In [42]:
# replace null values with 0
res = res.fillna(0)
res

Unnamed: 0,red hot chili peppers,the black dahlia murder,goldfrapp,dropkick murphys,le tigre,schandmaul,edguy,jack johnson,eluveitie,the killers,...,elvis presley,archive,enigma,amy macdonald,robbie williams,loreena mckennitt,damien rice,louis armstrong,black rebel motorcycle club,lcd soundsystem
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# build fequent itemset using apriori
freq_itemset = ap(res, min_support = 0.1, use_colnames = True)
freq_itemset

Unnamed: 0,support,itemsets
0,0.166667,(the who)
1,0.25,(the rolling stones)
2,0.166667,(cocorosie)
3,0.125,(animal collective)
4,0.125,(portishead)
5,0.25,(radiohead)
6,0.125,(tool)
7,0.125,(beck)
8,0.125,(snow patrol)
9,0.208333,(pink floyd)


In [47]:
# form association rules from the frequent itemset
rules = association_rules(freq_itemset, min_threshold = 0.5)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(the who),(the rolling stones),0.166667,0.25,0.166667,1.0,4.0,0.125,inf
1,(the rolling stones),(the who),0.25,0.166667,0.166667,0.666667,4.0,0.125,2.5
2,(simon & garfunkel),(the rolling stones),0.125,0.25,0.125,1.0,4.0,0.09375,inf
3,(the rolling stones),(simon & garfunkel),0.25,0.125,0.125,0.5,4.0,0.09375,1.75
4,(radiohead),(cocorosie),0.25,0.166667,0.125,0.5,3.0,0.083333,1.666667
5,(cocorosie),(radiohead),0.166667,0.25,0.125,0.75,3.0,0.083333,3.0
6,(animal collective),(portishead),0.125,0.125,0.125,1.0,8.0,0.109375,inf
7,(portishead),(animal collective),0.125,0.125,0.125,1.0,8.0,0.109375,inf
8,(joy division),(radiohead),0.125,0.25,0.125,1.0,4.0,0.09375,inf
9,(radiohead),(joy division),0.25,0.125,0.125,0.5,4.0,0.09375,1.75


In [48]:
# write results 
res.to_csv("res5k.csv", index=False)

### From the above, we can see which of the artists are bought together among users: Antecedant -> Consequent and shows the corresponding support, confidence and lift. For example, if the lift is 4, then the consequent is 4 times more probable to be purchased together with antecedant than when it's kept alone. One usecase of this is we can when a user buys an antecedeant in the future, we can reccomend him/her the consequents with high confidence and lift, as it's likely to be purchased together 