In [1]:
import pandas as pd
import numpy as np
from timeit import default_timer

## Load Data

In [2]:
# Column names are kept in variables so the rest of the notebook can be pointed
# at a different interaction log without further edits.
user_id_col, item_id_col = 'user_id', 'item_id'
df = pd.read_csv('test.csv')

# Alternative dataset (disabled):
# df = pd.read_csv('../songs_recommender/preprocessed_data/user_songs.csv')
# df = df[df['listen_count'] > 50]
# user_id_col = 'user_id'
# item_id_col = 'song'
df

Unnamed: 0,user_id,item_id,view_count
0,u2,i2,1
1,u2,i3,1
2,u3,i2,1
3,u3,i3,1
4,u3,i4,1
5,u1,i2,1
6,u1,i1,1
7,,i5,0


In [3]:
def get_unique_non_na(series):
    """Return the distinct non-missing values of ``series`` as a ``set``."""
    present = series.dropna()
    return set(present)

# Distinct, sorted ids; missing ids are excluded by get_unique_non_na.
all_users = sorted(get_unique_non_na(df[user_id_col]))
all_items = sorted(get_unique_non_na(df[item_id_col]))
no_of_users, no_of_items = len(all_users), len(all_items)
print(all_users, no_of_users)
print(all_items, no_of_items)

['u1', 'u2', 'u3'] 3
['i1', 'i2', 'i3', 'i4', 'i5'] 5


In [4]:
# One row per item; the 'users' cell holds the set of non-missing user ids
# seen with that item.
item_users_df = (
    df.groupby([item_id_col])
      .agg({user_id_col: get_unique_non_na})
      .rename(columns={user_id_col: 'users'})
)
item_users_df

Unnamed: 0_level_0,users
item_id,Unnamed: 1_level_1
i1,{u1}
i2,"{u1, u2, u3}"
i3,"{u2, u3}"
i4,{u3}
i5,{}


In [5]:
# One row per user; the 'items' cell holds the set of non-missing item ids
# seen with that user.
user_items_df = (
    df.groupby([user_id_col])
      .agg({item_id_col: get_unique_non_na})
      .rename(columns={item_id_col: 'items'})
)
user_items_df

Unnamed: 0_level_0,items
user_id,Unnamed: 1_level_1
u1,"{i1, i2}"
u2,"{i2, i3}"
u3,"{i2, i3, i4}"


In [6]:
def get_users(item_id):
    """Look up ``item_id`` in ``item_users_df`` and return the first cell of its row.

    With the frame built earlier in this notebook that cell is the set of
    unique users for the item.
    """
    return item_users_df.loc[item_id].values[0]

def get_items(user_id):
    """Look up ``user_id`` in ``user_items_df`` and return the first cell of its row.

    With the frame built earlier in this notebook that cell is the set of
    unique items for the user.
    """
    return user_items_df.loc[user_id].values[0]

## Item Similarity

### Naive Way of Computing the Co-occurrence Matrix

In [7]:
def get_cooccurence_matrix_naive(items):
    """Build the item-item Jaccard similarity matrix with explicit set operations.

    For every unordered pair of distinct items ``(i, j)`` the entry is
    ``len(users_i & users_j) / len(users_i | users_j)``, where the user sets
    come from ``get_users``. Pairs with no common user, and the diagonal,
    are 0.0.

    Parameters
    ----------
    items : sequence
        Item ids accepted by ``get_users``. Position ``k`` in ``items``
        corresponds to row/column ``k`` of the result.

    Returns
    -------
    numpy.matrix
        Symmetric ``len(items) x len(items)`` float matrix. ``np.matrix`` is
        kept so existing callers see the same return type.
    """
    len_items = len(items)
    # Starts as all zeros, so the diagonal and "no common user" pairs need no
    # further writes.
    cooccurence_matrix = np.matrix(np.zeros(shape=(len_items, len_items)), float)

    # Fetch each item's user set once instead of once per pair.
    users_per_item = [get_users(item) for item in items]

    # Only the upper triangle is computed; the result is mirrored because the
    # Jaccard index is symmetric.
    for i in range(len_items):
        users_i = users_per_item[i]
        for j in range(i + 1, len_items):
            users_j = users_per_item[j]

            no_of_common_users = len(users_i.intersection(users_j))
            if no_of_common_users == 0:
                continue

            # A non-empty intersection implies a non-empty union, so the
            # division below cannot be by zero.
            no_of_all_users = len(users_i.union(users_j))
            similarity = float(no_of_common_users) / float(no_of_all_users)
            cooccurence_matrix[i, j] = similarity
            cooccurence_matrix[j, i] = similarity
    return cooccurence_matrix

### Using Matrix Multiplication to Compute the Co-occurrence Matrix

#### Debug

In [34]:
df

Unnamed: 0,user_id,item_id,view_count
0,u2,i2,1
1,u2,i3,1
2,u3,i2,1
3,u3,i3,1
4,u3,i4,1
5,u1,i2,1
6,u1,i1,1
7,,i5,0


In [33]:
pd.get_dummies(df[item_id_col])

Unnamed: 0,i1,i2,i3,i4,i5
0,0,1,0,0,0
1,0,0,1,0,0
2,0,1,0,0,0
3,0,0,1,0,0
4,0,0,0,1,0
5,0,1,0,0,0
6,1,0,0,0,0
7,0,0,0,0,1


In [36]:
pd.get_dummies(df[item_id_col]).groupby(df[user_id_col])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff7b1bcd750>

In [8]:
# Binary user-item matrix: one-hot encode the item of every interaction row,
# then collapse the rows of each user with a column-wise max, so a cell is 1
# when the user has at least one row for that item.
# Approach: https://stackoverflow.com/questions/31518937/convert-two-column-data-frame-to-occurrence-matrix-in-pandas
# The groupby reducer .max() is used instead of .apply(max): passing the Python
# builtin only works because pandas substitutes its own reducer for it, and
# newer pandas versions deprecate that substitution.
uim_df = pd.get_dummies(df[item_id_col]).groupby(df[user_id_col]).max()
uim_df

Unnamed: 0_level_0,i1,i2,i3,i4,i5
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
u1,1,1,0,0,0
u2,0,1,1,0,0
u3,0,1,1,1,0


In [9]:
temp_df = uim_df[['i2', 'i3', 'i1']]
temp_df

Unnamed: 0_level_0,i2,i3,i1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
u1,1,0,1
u2,1,1,0
u3,1,1,0


In [10]:
items_sorted = sorted(temp_df.columns)
uim_df = temp_df[items_sorted]
uim_df

Unnamed: 0_level_0,i1,i2,i3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
u1,1,1,0
u2,0,1,1
u3,0,1,1


In [11]:
uim = uim_df.values
uim

array([[1, 1, 0],
       [0, 1, 1],
       [0, 1, 1]], dtype=uint8)

In [12]:
non_zero_count = np.count_nonzero(uim)
count = uim.size
density = non_zero_count/count
print(non_zero_count, count, density)

6 9 0.6666666666666666


In [13]:
uim.T

array([[1, 0, 0],
       [1, 1, 1],
       [0, 1, 1]], dtype=uint8)

In [14]:
item_item_intersection = np.dot(uim.T, uim)
item_item_intersection

array([[1, 1, 0],
       [1, 3, 2],
       [0, 2, 2]], dtype=uint8)

In [15]:
item_item_intersection_df = pd.DataFrame(item_item_intersection, columns=uim_df.columns, index=uim_df.columns)
item_item_intersection_df

Unnamed: 0,i1,i2,i3
i1,1,1,0
i2,1,3,2
i3,0,2,2


In [16]:
# Spot-check a single item pair with plain set algebra.
item1, item2 = 'i2', 'i5'

users_item1, users_item2 = get_users(item1), get_users(item2)
print(users_item1)
print(users_item2)

intersection_users = users_item1 & users_item2
union_users = users_item1 | users_item2
print("users who interact with both items    : ", intersection_users)
print("users who interact with either items  : ", union_users)
print(len(intersection_users), len(union_users))

{'u1', 'u2', 'u3'}
set()
users who interact with both items    :  set()
users who interact with either items  :  {'u1', 'u2', 'u3'}
0 3


In [17]:
flip_uim = 1-uim
flip_uim

array([[0, 0, 1],
       [1, 0, 0],
       [1, 0, 0]], dtype=uint8)

In [18]:
flip_uim.T

array([[0, 1, 1],
       [0, 0, 0],
       [1, 0, 0]], dtype=uint8)

In [19]:
users_left_out_of_union = np.dot(flip_uim.T, flip_uim)
users_left_out_of_union

array([[2, 0, 0],
       [0, 0, 0],
       [0, 0, 1]], dtype=uint8)

In [20]:
items = list(uim_df.columns)
no_of_items = len(items)
users = list(uim_df.index)
no_of_users = len(users)
print("No of Items : ", no_of_items)
print("No of Users : ", no_of_users)

No of Items :  3
No of Users :  3


In [21]:
item_item_union = no_of_users - users_left_out_of_union
item_item_union

array([[1, 3, 3],
       [3, 3, 3],
       [3, 3, 2]], dtype=uint8)

In [22]:
item_item_union_df = pd.DataFrame(item_item_union, columns=uim_df.columns, index=uim_df.columns)
item_item_union_df

Unnamed: 0,i1,i2,i3
i1,1,3,3
i2,3,3,3
i3,3,3,2


In [23]:
item_item_jaccard_df = item_item_intersection_df.div(item_item_union_df)
# item_item_jaccard_df.to_csv('item_item_jaccard.csv')
item_item_jaccard_df

Unnamed: 0,i1,i2,i3
i1,1.0,0.333333,0.0
i2,0.333333,1.0,0.666667
i3,0.0,0.666667,1.0


In [24]:
item_item_jaccard_df.values

array([[1.        , 0.33333333, 0.        ],
       [0.33333333, 1.        , 0.66666667],
       [0.        , 0.66666667, 1.        ]])

#### Function

In [25]:
def get_cooccurence_matrix_mult(df, user_col=None, item_col=None):
    """Build the item-item Jaccard similarity matrix with matrix products.

    With ``U`` the binary user-item matrix (rows = users, columns = items in
    sorted order):

    * ``U.T @ U`` counts, per item pair, the users who have both items;
    * ``(1 - U).T @ (1 - U)`` counts the users who have neither, so
      ``n_users - that`` is the size of the union.

    The Jaccard index is intersection / union. Pairs with an empty union and
    the diagonal are set to 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction log with one row per (user, item) interaction. Rows whose
        user id is missing are dropped by the groupby.
    user_col, item_col : str, optional
        Column names for the user and item ids. When omitted, the
        module-level ``user_id_col`` / ``item_id_col`` are used.

    Returns
    -------
    numpy.ndarray
        Symmetric float array of shape ``(n_items, n_items)``, with
        rows/columns ordered by sorted item id.
    """
    if user_col is None:
        user_col = user_id_col
    if item_col is None:
        item_col = item_id_col

    # .max() rather than .apply(max): passing the Python builtin relies on
    # pandas substituting its own reducer, which newer versions deprecate.
    uim_df = pd.get_dummies(df[item_col]).groupby(df[user_col]).max()
    uim_df = uim_df[sorted(uim_df.columns)]

    # get_dummies yields uint8 or bool depending on the pandas version. uint8
    # products wrap around past 255 users and bool products stay boolean, so
    # the arithmetic is done in int64.
    uim = uim_df.values.astype(np.int64)
    # Take the user count from the matrix itself rather than from notebook
    # state, so the result does not depend on which cells ran before.
    n_users = uim.shape[0]

    item_item_intersection = np.dot(uim.T, uim)

    flip_uim = 1 - uim
    users_left_out_of_union = np.dot(flip_uim.T, flip_uim)
    item_item_union = n_users - users_left_out_of_union

    # Divide only where the union is non-empty; the remaining cells keep the
    # 0.0 they were initialised with (an item pair nobody interacted with).
    item_item_jaccard = np.zeros(item_item_intersection.shape, dtype=float)
    np.divide(item_item_intersection, item_item_union,
              out=item_item_jaccard, where=item_item_union != 0)
    np.fill_diagonal(item_item_jaccard, 0.0)
    return item_item_jaccard

### Compare

In [26]:
# Time one run of each implementation.
start_time = default_timer()
cooccurence_matrix_naive = get_cooccurence_matrix_naive(all_items)
end_time = default_timer()
naive_elapsed = end_time - start_time
print("Naive Approach,       Time Taken : {} sec".format(naive_elapsed))

start_time = default_timer()
cooccurence_matrix_mult = get_cooccurence_matrix_mult(df)
end_time = default_timer()
mult_elapsed = end_time - start_time
print("Matrix-Mult Approach, Time Taken : {} sec".format(mult_elapsed))

Naive Approach,       Time Taken : 0.0014174629999956778 sec
Matrix-Mult Approach, Time Taken : 0.0062663430000071685 sec


#### Inspect Values

In [27]:
cooccurence_matrix_naive

matrix([[0.        , 0.33333333, 0.        , 0.        , 0.        ],
        [0.33333333, 0.        , 0.66666667, 0.33333333, 0.        ],
        [0.        , 0.66666667, 0.        , 0.5       , 0.        ],
        [0.        , 0.33333333, 0.5       , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [28]:
cooccurence_matrix_mult

array([[0.        , 0.33333333, 0.        , 0.        , 0.        ],
       [0.33333333, 0.        , 0.66666667, 0.33333333, 0.        ],
       [0.        , 0.66666667, 0.        , 0.5       , 0.        ],
       [0.        , 0.33333333, 0.5       , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

#### Equivalent ?

In [30]:
# cooccurence_matrix_naive == cooccurence_matrix_mult

In [31]:
# The two matrices are floats produced by different code paths, so compare
# them with a tolerance (np.allclose) instead of exact element-wise equality.
# The shape check comes first because allclose would otherwise broadcast.
res = (cooccurence_matrix_naive.shape == cooccurence_matrix_mult.shape
       and np.allclose(cooccurence_matrix_naive, cooccurence_matrix_mult))
res

True