In [2]:
import pandas as pd
import numpy as np
from timeit import default_timer

In [4]:
# df = pd.read_csv('test.csv')
# user_id_col = 'user_id'
# item_id_col = 'item_id'
df = pd.read_csv('sub_df.csv')
user_id_col = 'user_id'
item_id_col = 'song'
len(df)
df

Unnamed: 0.1,Unnamed: 0,user_id,song,listen_count
0,225526,38767872c514c1b43bab5c7b213b6702980d87c0,..Come Around - Collie Buddz,401
1,534979,2e4cdf4a4b0e26a1577f218440830ac83ce07122,(Antichrist Television Blues) - Arcade Fire,84
2,939527,faa6962ada2b39cd900d8fb144b2e15e2ca3f72a,..Come Around - Collie Buddz,79
3,1236731,d34d523fb05593e9e68e42b28fe4022f0f05eb9d,..Come Around - Collie Buddz,66


In [5]:
all_users = list(df[user_id_col].unique())
all_items = list(df[item_id_col].unique())
print(all_users)
print(all_items)

['38767872c514c1b43bab5c7b213b6702980d87c0', '2e4cdf4a4b0e26a1577f218440830ac83ce07122', 'faa6962ada2b39cd900d8fb144b2e15e2ca3f72a', 'd34d523fb05593e9e68e42b28fe4022f0f05eb9d']
['..Come Around - Collie Buddz', '(Antichrist Television Blues) - Arcade Fire']


In [6]:
item_users_df = df.groupby([item_id_col]).agg({user_id_col: (lambda x: list(x.unique()))})
item_users_df = item_users_df.rename(columns={user_id_col: 'users'}).reset_index()
item_users_df

Unnamed: 0,song,users
0,(Antichrist Television Blues) - Arcade Fire,[2e4cdf4a4b0e26a1577f218440830ac83ce07122]
1,..Come Around - Collie Buddz,"[38767872c514c1b43bab5c7b213b6702980d87c0, faa..."


In [7]:
user_items_df = df.groupby([user_id_col]).agg({item_id_col: (lambda x: list(x.unique()))})
user_items_df = user_items_df.rename(columns={item_id_col: 'items'}).reset_index()
user_items_df

Unnamed: 0,user_id,items
0,2e4cdf4a4b0e26a1577f218440830ac83ce07122,[(Antichrist Television Blues) - Arcade Fire]
1,38767872c514c1b43bab5c7b213b6702980d87c0,[..Come Around - Collie Buddz]
2,d34d523fb05593e9e68e42b28fe4022f0f05eb9d,[..Come Around - Collie Buddz]
3,faa6962ada2b39cd900d8fb144b2e15e2ca3f72a,[..Come Around - Collie Buddz]


In [8]:
def get_users(item_id):
    """Get unique users for a given item"""
    item_data = item_users_df[item_users_df[item_id_col] == item_id]
    item_users = (item_data['users'].values)[0]
    return item_users

def construct_cooccurence_matrix(items):
    """Construct cooccurence matrix"""
    # Initialize the item cooccurence matrix
    len_items = len(items)
    cooccurence_matrix = np.matrix(np.zeros(shape=(len_items, len_items)), float)

    # Calculate similarity between item pairs for upper triangular elements
    for i, item_i in enumerate(items):
        # Get unique users of item_i
        users_i = set(get_users(item_i))

        for j, item_j in enumerate(items):
            if i == j:
                cooccurence_matrix[i, j] = 1.0
                continue
            if i > j:
                continue #same result as corresponding j, i

            # Get unique users of item_j
            users_j = set(get_users(item_j))

            # Calculate intersection of users of items i and j
            users_intersection = users_i.intersection(users_j)
            no_of_common_users = len(users_intersection)
            # Calculate cooccurence_matrix[i,j] as Jaccard Index
            if no_of_common_users != 0:
                # Calculate union of users of items i and j
                users_union = users_i.union(users_j)
                no_of_all_users = len(users_union)
                if no_of_all_users != 0:
                    cooccurence_matrix[i, j] = float(
                        no_of_common_users) / float(no_of_all_users)
                    cooccurence_matrix[j, i] = cooccurence_matrix[i, j]
    return cooccurence_matrix

In [9]:
start_time = default_timer()
cooccurence_matrix = construct_cooccurence_matrix(all_items)
end_time = default_timer()
print("Time Taken : {} sec".format(end_time-start_time))
#cooccurence_matrix

Time Taken : 0.005232876001173281 sec


In [10]:
#https://stackoverflow.com/questions/31518937/convert-two-column-data-frame-to-occurrence-matrix-in-pandas
#pd.get_dummies(df.item_id)
uim_df = pd.get_dummies(df[item_id_col]).groupby(df[user_id_col]).apply(max)
uim_df.head()

Unnamed: 0_level_0,(Antichrist Television Blues) - Arcade Fire,..Come Around - Collie Buddz
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2e4cdf4a4b0e26a1577f218440830ac83ce07122,1.0,0.0
38767872c514c1b43bab5c7b213b6702980d87c0,0.0,1.0
d34d523fb05593e9e68e42b28fe4022f0f05eb9d,0.0,1.0
faa6962ada2b39cd900d8fb144b2e15e2ca3f72a,0.0,1.0


In [11]:
uim = uim_df.as_matrix()
uim

array([[ 1.,  0.],
       [ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.]])

In [12]:
non_zero_count = np.count_nonzero(uim)
count = uim.size
density = non_zero_count/count
print(non_zero_count, count, density)

4 8 0.5


In [13]:
uim.T

array([[ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  1.,  1.]])

In [14]:
item_item_intersection = np.dot(uim.T, uim)
item_item_intersection

array([[ 1.,  0.],
       [ 0.,  3.]])

In [43]:
item_item_intersection_df = pd.DataFrame(item_item_intersection, columns=uim_df.columns, index=uim_df.columns)
item_item_intersection_df.to_csv('item_item_intersection.csv')

In [15]:
#item1 = "'Round Midnight - Miles Davis"
#item2 = "16 Candles - The Crests"
item1 = "(Antichrist Television Blues) - Arcade Fire"
item2 = "..Come Around - Collie Buddz"
users_item1 = set(df[df[item_id_col] == item1][user_id_col].values)
users_item2 = set(df[df[item_id_col] == item2][user_id_col].values)
print(users_item1)
print(users_item2)
common_users = users_item1.intersection(users_item2)
union_users = users_item1.union(users_item2)
print(common_users)
print(union_users)
print(len(common_users), len(union_users))

{'2e4cdf4a4b0e26a1577f218440830ac83ce07122'}
{'faa6962ada2b39cd900d8fb144b2e15e2ca3f72a', 'd34d523fb05593e9e68e42b28fe4022f0f05eb9d', '38767872c514c1b43bab5c7b213b6702980d87c0'}
set()
{'2e4cdf4a4b0e26a1577f218440830ac83ce07122', '38767872c514c1b43bab5c7b213b6702980d87c0', 'd34d523fb05593e9e68e42b28fe4022f0f05eb9d', 'faa6962ada2b39cd900d8fb144b2e15e2ca3f72a'}
0 4


In [16]:
flip_uim = 1-uim
flip_uim

array([[ 0.,  1.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])

In [17]:
flip_uim.T

array([[ 0.,  1.,  1.,  1.],
       [ 1.,  0.,  0.,  0.]])

In [18]:
users_left_out_of_union = np.dot(flip_uim.T, flip_uim)
users_left_out_of_union

array([[ 3.,  0.],
       [ 0.,  1.]])

In [19]:
items = list(uim_df.columns)
no_of_items = len(items)
users = list(uim_df.index)
no_of_users = len(users)
print("No of Items : ", no_of_items)
print("No of Users : ", no_of_users)

No of Items :  2
No of Users :  4


In [20]:
item_item_union = no_of_users - users_left_out_of_union
item_item_union

array([[ 1.,  4.],
       [ 4.,  3.]])

In [52]:
item_item_union_df = pd.DataFrame(item_item_union, columns=uim_df.columns, index=uim_df.columns)
item_item_union_df.to_csv('item_item_union.csv')

In [21]:
item_item_jaccard = item_item_intersection/item_item_union
item_item_jaccard

array([[ 1.,  0.],
       [ 0.,  1.]])

In [24]:
item_item_jaccard_df = pd.DataFrame(item_item_jaccard, columns=uim_df.columns, index=uim_df.columns)
#item_item_jaccard_df.to_csv('item_item_jaccard.csv')

In [23]:
cooccurence_matrix_df = pd.DataFrame(cooccurence_matrix, columns=uim_df.columns, index=uim_df.columns)
#cooccurence_matrix_df.to_csv('cooccurence_matrix.csv')

In [25]:
item_item_jaccard_df.head()

Unnamed: 0,(Antichrist Television Blues) - Arcade Fire,..Come Around - Collie Buddz
(Antichrist Television Blues) - Arcade Fire,1.0,0.0
..Come Around - Collie Buddz,0.0,1.0


In [26]:
cooccurence_matrix_df.head()

Unnamed: 0,(Antichrist Television Blues) - Arcade Fire,..Come Around - Collie Buddz
(Antichrist Television Blues) - Arcade Fire,1.0,0.0
..Come Around - Collie Buddz,0.0,1.0


In [27]:
item_item_jaccard_df.equals(cooccurence_matrix_df)
#item_item_jaccard_df.index.names == cooccurence_matrix_df.index.names

True

In [28]:
#https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.equal.html
#https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.all.html#numpy.all
res = np.equal(cooccurence_matrix, item_item_jaccard)#.all()
res

matrix([[ True,  True],
        [ True,  True]], dtype=bool)

In [73]:
res_df = pd.DataFrame(res, columns=uim_df.columns, index=uim_df.columns)
res_df.to_csv('res.csv')

matrix([[ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        ..., 
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True],
        [ True,  True,  True, ...,  True,  True,  True]], dtype=bool)

In [74]:
res_df

Unnamed: 0,#40 - DAVE MATTHEWS BAND,'Round Midnight - Miles Davis,'Til We Die (Album Version) - Slipknot,'Till I Collapse - Eminem / Nate Dogg,(Antichrist Television Blues) - Arcade Fire,(I've Had) The Time Of My Life - Bill Medley & Jennifer Warnes,(Nice Dream) - Radiohead,(They Long To Be) Close To You - Carpenters,(You Drive Me) Crazy (The Stop Remix!) - Britney Spears,(iii) - The Gerbils,...,Your Rocky Spine - Great Lake Swimmers,Your Woman - White Town,Yours Alone - Dum Dum Girls,Youth Against Fascism - Sonic Youth,Youth Of The Nation (2006 Remastered Album Version) - P.O.D.,Yön hetket - Yölintu,Zero - Yeah Yeah Yeahs,paranoid android - Christopher O'Riley,re:stacks - Bon Iver,Época - Gotan Project
#40 - DAVE MATTHEWS BAND,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
'Round Midnight - Miles Davis,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
'Til We Die (Album Version) - Slipknot,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
'Till I Collapse - Eminem / Nate Dogg,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
(Antichrist Television Blues) - Arcade Fire,True,True,True,True,True,True,True,True,True,False,...,True,True,True,True,True,True,True,True,True,True
(I've Had) The Time Of My Life - Bill Medley & Jennifer Warnes,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
(Nice Dream) - Radiohead,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
(They Long To Be) Close To You - Carpenters,True,True,True,True,True,True,True,True,True,True,...,False,True,True,True,True,True,True,True,True,True
(You Drive Me) Crazy (The Stop Remix!) - Britney Spears,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
(iii) - The Gerbils,True,True,True,True,False,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [83]:
item1 = "(Antichrist Television Blues) - Arcade Fire"
item2 = "..Come Around - Collie Buddz"

# item1 = "'Round Midnight - Miles Davis"
# item2 = "16 Candles - The Crests"
users_item1 = get_users(item1)
users_item2 = get_users(item2)
print(users_item1)
print(users_item2)
print(item_item_jaccard_df[item1][item2])
print(cooccurence_matrix_df[item1][item2])
print(res_df[item1][item2])

['2e4cdf4a4b0e26a1577f218440830ac83ce07122']
['38767872c514c1b43bab5c7b213b6702980d87c0', 'faa6962ada2b39cd900d8fb144b2e15e2ca3f72a', 'd34d523fb05593e9e68e42b28fe4022f0f05eb9d']
0.0
0.00621118012422
False


In [87]:
sub_df = df[(df[item_id_col] == item1) | (df[item_id_col] == item2)]
sub_df

Unnamed: 0.1,Unnamed: 0,user_id,song,listen_count
25,225526,38767872c514c1b43bab5c7b213b6702980d87c0,..Come Around - Collie Buddz,401
1383,534979,2e4cdf4a4b0e26a1577f218440830ac83ce07122,(Antichrist Television Blues) - Arcade Fire,84
1583,939527,faa6962ada2b39cd900d8fb144b2e15e2ca3f72a,..Come Around - Collie Buddz,79
2456,1236731,d34d523fb05593e9e68e42b28fe4022f0f05eb9d,..Come Around - Collie Buddz,66
