In [1]:
import pandas as pd
import numpy as np
import pickle
from scipy.sparse import csr_matrix, csc_matrix
from pandas.api.types import CategoricalDtype

### Ineraction Matrix
----

In [2]:
data = np.array([[1,1,2,3,1,2,3,3,3,3],['A','A','B','B','C','C','C','C','C','D']]).T

In [3]:
df = pd.DataFrame(data, columns = ['user_id','product_name'])
df

Unnamed: 0,user_id,product_name
0,1,A
1,1,A
2,2,B
3,3,B
4,1,C
5,2,C
6,3,C
7,3,C
8,3,C
9,3,D


In [4]:
# create interaction matrix
interaction_matrix_count = df.pivot_table(index='user_id', columns='product_name', aggfunc=len, fill_value=0)
interaction_matrix_count.head()

product_name,A,B,C,D
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,0,1,0
2,0,1,1,0
3,0,1,3,1


In [5]:
#interaction_matrix_count = df.pivot_table(index='user_id', columns='product_name', values=[1])

In [6]:
interaction_matrix_count.applymap(lambda x: int(x>0))

product_name,A,B,C,D
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,1,0
2,0,1,1,0
3,0,1,1,1


## Interaction Matrix as sparse Matrix
----

In [7]:
# df = pd.read_csv('Recommender4Retail.csv')

In [8]:
def interaction(df, recommender='item'):
    user_c = CategoricalDtype(sorted(df.user_id.unique()), ordered=True)
    product_name_c = CategoricalDtype(sorted(df.product_name.unique()), ordered=True)

    row = df.user_id.astype(user_c).cat.codes
    col = df.product_name.astype(product_name_c).cat.codes
    val = [1]*len(col)
    if recommender == 'item':     
        sparse_matrix = csc_matrix((val,(row, col)), shape=(user_c.categories.size, product_name_c.categories.size))
    elif recommender == 'user':     
        sparse_matrix = csr_matrix((val,(row, col)), shape=(user_c.categories.size, product_name_c.categories.size))
    return sparse_matrix

In [9]:
sparse_matrix = interaction(df)
type(sparse_matrix)

scipy.sparse.csc.csc_matrix

In [10]:
# binary sparse matrix:
sparse_binary = sparse_matrix
sparse_binary[sparse_binary.nonzero()] = 1
sparse_binary.todense()

matrix([[1, 0, 1, 0],
        [0, 1, 1, 0],
        [0, 1, 1, 1]], dtype=int32)

In [11]:
# turn to dense matrix if necessary
sparse_matrix.todense()

matrix([[1, 0, 1, 0],
        [0, 1, 1, 0],
        [0, 1, 1, 1]], dtype=int32)

In [12]:
# turn dense matrix to dataframe
# dfs = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=user_c.categories, columns=product_name_c.categories)

In [13]:
# extract of a product and its values for diffrent customers in interaction matrix. 
# dfs.iloc[:, 401].value_counts()

## Similarity Measures
----

### Cosine Similarity
Cosine similarity measures the similarity between two vectors. To achieve this it calculates the angle between the two vectors. With an intermediate angle of $0°$ (the two vectors point in the same direction) the similarity is $1$, ($cos(0) = 1$). For any other angle the value is less than one. In our example we consider products (columns of the interaction matrix) or users (rows of the interaction matrix) as vectors. In our formula are $i$ and $j$ items which each have a vector consisting of all their ratings $r$. Note: This function can be used for UBCF and IBCF.

$$
s(i,j) = \frac{r_{i}\cdot r_{j}}{\|r_{i}\|_{2}\cdot\|r_{j}\|_{2}}
$$

In [14]:
def sim_cosine(v1, v2, norm=False):
    """
    Calculates the cosine similarity between two given vectors
    :param v1, v2: v1 and v2 are vectors in a numpy array format
    :param norm: if norm is set to true the vectors will be normalized to unit vectors before the calculation
    :return s: similarity value between -1 and 1 (1 high correlation, 0 no correlation, -1 high negative correlation)
    """
    if isinstance(v1, np.ndarray) is False or isinstance(v2, np.ndarray) is False:
        raise TypeError(f'Function only accepts v1 and v2 as type numpy.ndarray')
    
    # normalize vectors
    if norm:
        v1 = v1/np.linalg.norm(v1)
        v2 = v2/np.linalg.norm(v2)
        
    # cosine similarity:
    s = np.dot(v1,v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))
    
    # print(f'Cosine Similarity: {round(s,2)}')
    return s

In [15]:
r_i = np.array([1,2,3])
r_j = np.array([5,0,7])

sim_cosine(r_i,r_j, norm=False)

0.8077806958015601

In [16]:
a = np.asarray(sparse_matrix[:,0].todense()).T[0]
b = np.asarray(sparse_matrix[:,1].todense()).T[0]

In [17]:
# get similarity for all two products:
sim_cosine(a,b) # result 0 means the two vectors are orthogonal

0.0

### Pearson Correlation
----
Person Correlation measures the statistical correlation between the ratings of two user's (or items). The formula takes as input two users $u$ and $v$ and calculates a similarity $s$ using the ratings $r$ for every product the two users have in common. It is possible to use pearson correlation to measure the similarity between items as well. But it is said that it yields **worse results** as a cosine similarity for example. The pearson correlation also suffers from problems if there are just a few ratings to compare. A threshold of a number of ratings per user could counteract this problem.

$$
s(u,v) = \frac{\sum_{i\in I_{u}\cap I_{v}}(r_{u,i}-\bar r_{u})(r_{v,i}-\bar r_{v})}{\sqrt{\sum_{i\in I_{u}\cap I_{v}}(r_{u,i}-\bar r_{u})^2} \sqrt{\sum_{i\in I_{u}\cap I_{v}}(r_{v,i}-\bar r_{v})^2}}
$$

In [18]:
sparse_matrix = interaction(df, recommender='user') # 'user' returns a csc-matrix (c=column)
sparse_matrix.todense()

matrix([[2, 0, 1, 0],
        [0, 1, 1, 0],
        [0, 1, 3, 1]], dtype=int32)

In [19]:
def sim_pearson(u, v, threshold=False):
    """
    Calculates the pearson similarity between two given users
    :param u and v: u and v are the id of two users
    :param threshold: activate involvement of a threshold "multiplication" of the function
    :return s: similarity value between -1 and 1 (1 high correlation, 0 no correlation, -1 high negative correlation) or NaN if calculation is not possible
    """
    # get overlapping items for user u and v inclusive ratings.
    intersection = [k for k, i, j in zip(np.arange(len(u)),u,v) if i > 0  and j > 0]
    
    u_mean_r, v_mean_r = np.average(u, weights=(u > 0)), np.average(v, weights=(v > 0)) # average rating
    
    numerator = sum(a*b for a,b in zip([u[i] - u_mean_r for i in intersection],[v[i] - v_mean_r for i in intersection]))
    denominator1 = np.sqrt(sum([(u[i] - u_mean_r)**2 for i in intersection]))
    denominator2 = np.sqrt(sum([(v[i] - v_mean_r)**2 for i in intersection]))

    s = numerator / (denominator1 * denominator2)
    if threshold:
        s = s * min(len(interesection)/50,1)

    return s

In [55]:
a = np.asarray(sparse_matrix[0,:].todense())[0] # user 1
b = np.asarray(sparse_matrix[2,:].todense())[0] # user 2

In [21]:
%%time
sim_pearson(a,b)

Wall time: 998 µs


-1.0

## Similarity Matrix
----
The function `similarity()` creates a similarity matrix using a similarity function (for example cosine similarity (`sim_cosine`). 

In [56]:
s = pickle.load(open('rating_count_item_interaction.pkl', 'rb'))

In [23]:
def similarity(df, sim='cosine', recommender = 'item'):
    
    # check if matrix already exists here
    # load correct df here
    
    # create empty similarity_matrix(nxn)
    if recommender == 'item':
        length = df.shape[1]
        similarity_matrix = np.zeros((length, length))
        count = 0
        # get value pairs:

        for i in range(length):
            a = np.asarray(df[:,i].todense()).T[0]
            for j in range(length):
                if i == j:
                    similarity_matrix[i,i] = 1 # fill diagonal of matrix with similarity "1"
                elif i > j:
                    b = np.asarray(df[:,j].todense()).T[0]
                    similarity_matrix[i,j] = sim_cosine(a,b) # fill upper triangular matrix
                    similarity_matrix[j,i] = similarity_matrix[i,j] # fill lower triangular matrix
                else:
                    pass
                
    elif recommender == 'user':
        length = df.shape[0]
        similarity_matrix = np.zeros((length, length))
    
        # get value pairs:
        for i in range(length):
            a = np.asarray(df[i,:].todense())[0]
            for j in range(length):
                if i == j:
                    simularity_matrix[i,i] = 1 # fill diagonal of matrix with similarity "1"
                elif i > j:
                    b = np.asarray(df[j,:].todense())[0]
                    similarity_matrix[i,j] = sim_cosine(a,b) # fill upper triangular matrix
                    similarity_matrix[j,i] = similarity_matrix[i,j] # fill lower triangular matrix
                else: 
                    pass
    
    # save as pickle file 
    
    return similarity_matrix

In [None]:
%%time
similarity_items = similarity(s, recommender = 'item')
similarity_items

## Recommend
----

In [96]:
def recommend(matrix_name, item_id, nr_of_items):
  
    matrix = pickle.load(open(matrix_name, 'rb'))
    #Sets diagonal to zero (if we dont want to recomend the item the user has just bought)
    np.fill_diagonal(matrix, -2)
    
    #gets two list of item index and item similarity rating
    nr_of_rows = matrix.shape[0]
    index = np.zeros((nr_of_rows, nr_of_items))
    ratings = np.zeros((nr_of_rows, nr_of_items))
    for row in range(nr_of_rows):
        index[row,:] = matrix[row].argsort()[-nr_of_items:][::-1].tolist()
        ratings[row,:] = matrix[row, index[row,:].astype(int)]
        
    # print results
    print("Recommendation for {}:".format(item_id))
    for i in range(index.shape[1]):
        print("{}: {} with a similarity rating of {} ".format((i+1), int(index[item_id, i]), round(ratings[item_id, i], 3)))

    return 

In [103]:
# pd.read_csv('data.csv')  
item_similarity = recommend(matrix_name = 'rating_count_item_similarity.pkl', item_id = 2, nr_of_items = 15)

Recommendation for 2:
1: 4164 with a similarity rating of 0.565 
2: 3982 with a similarity rating of 0.511 
3: 4156 with a similarity rating of 0.501 
4: 124 with a similarity rating of 0.454 
5: 2543 with a similarity rating of 0.413 
6: 2278 with a similarity rating of 0.36 
7: 5924 with a similarity rating of 0.295 
8: 4188 with a similarity rating of 0.269 
9: 3208 with a similarity rating of 0.204 
10: 5920 with a similarity rating of 0.197 
11: 5077 with a similarity rating of 0.192 
12: 5289 with a similarity rating of 0.19 
13: 5405 with a similarity rating of 0.181 
14: 1871 with a similarity rating of 0.179 
15: 800 with a similarity rating of 0.172 


In [112]:
['{} st'.format(s) for s in np.arange(1, 15+1, 1)]

['1 st',
 '2 st',
 '3 st',
 '4 st',
 '5 st',
 '6 st',
 '7 st',
 '8 st',
 '9 st',
 '10 st',
 '11 st',
 '12 st',
 '13 st',
 '14 st',
 '15 st']

## User Product Rating
----

#### Version 1

In order to get a user-item rating for the interaction matrix, we had to develop a rating function.

The following thoughts were made:
- We want to give sufficient weight to the first product purchase
    - We decided that the first purchase has a weight $\displaystyle \omega = \frac{1}{3}$
- We want to even more weight a reorder of a product (this assumes that the user liked the product)
    - To create the gap, we defined the minimum gap of $\displaystyle 2 \cdot \omega$
- In addition, we do not want to weight multiple reorders too much that users with e.g. 6 or 10 reorders differs not too much
    - To achieve this, we can take the square-root of the number of reorders since $\sqrt{x}\,\, |\,\, x \geq 0\,\,$ is concave
- The rating should be a number between 0 and 1

Therefore the following formula has been developed:

$\displaystyle{ rating(o, o_{tot}) =
  \begin{cases}
    0            & \quad \text{if } o \text{ is } 0\\
    \omega       & \quad \text{if } o \text{ is }1 \land o_{tot} > 1\\
    2 \cdot \omega + (1 - 2 \cdot \omega) \cdot \sqrt{\frac{o}{o_{tot}}}  & \quad \text{if } o \text{ is } \geq 1 \lor \left( o \text{ is }1 \land o_{tot} \text{ is } 1 \right)
  \end{cases}}$
    
- where $o$ is the number of orders of the specified product $p$ of the user and $o_{tot}$ is the number of orders of the user.
- $\omega$ must be well defined $0 < \omega < 0.5$, optimally somewhere in the middle

This approach does strongly weight the first reorder and neither takes the number of orders of a user nor the aisles into account.

In [None]:
omega = 0.35
gamma = 10 # number of orders of the user
for o in range(gamma + 1):
    if o == 0:
        x = 0
    elif o == 1 and gamma > 1:
        x = omega
    else:
        x = 2*omega + (1-2*omega) * np.sqrt(o/gamma)
    print(x)

#### Version 2

In order to get a user-item rating for the interaction matrix, we had to develop a rating function.

The following thoughts were made:
- We want to give sufficient weight to the first product purchase
    - We decided that the first purchase has a weight $\displaystyle \omega = \frac{1}{3}$
    
- Our main weight should be the frequency of a product being in an order.
    - To achieve this, we can take the square-root of the number of orders containing a product  $o$ divided by the total number of orders by the customer $o_{tot}$: $\sqrt{\frac{o}{o_{tot}}}$
    
- Because there are a lot of customers with a low number of orders and related to that a specific uncertainty, we want to weaken the ratings for these customers.
    - To achieve this, we can take the square-root of the total amount of orders for each customer $o_{tot}$ divided by a specific treshold value $m$: $\sqrt{\frac{o_{tot}}{m}}\,\, |\,\, o_{tot} < m\,\,$

- The rating should be a number between 0 and 1

Therefore the following formula has been developed:

$\displaystyle{ rating(o, o_{tot}) =
  \begin{cases}
    0            & \quad \text{if } o \text{ is } 0\\
    \omega       & \quad \text{if } o \text{ is }1 \\
    \omega + (1 - \omega) \cdot \sqrt{\frac{o}{o_{tot}}}  & \quad \text{if } o \text{ is } > 1 \land \left(  o_{tot} \geq m \right)\\
    \omega + (1 - \omega) \cdot \sqrt{\frac{o}{o_{tot}}} \cdot \sqrt{\frac{o_{tot}}{m}}  & \quad \text{if } o \text{ is } > 1 \land \left(  o_{tot} < m \right)\\
  \end{cases}}$
    
- where $o$ is the number of orders of the specified product and $o_{tot}$ is the total amount of orders from the corresponding customer $p$.

This approach takes the ratio of each product beeing ordered by a customer and weakens the rating, if to little orders are aviable.

In [None]:
theta = 1/3
o_tot = 10 # products in o
# o total o of customer
m = 10 # lowers ratings ir less than m orders

for o_tot in range(m+20):
    print("o_tot =", o_tot)
    for o in range(o_tot + 1):
        if o == 0:
            x = 0
        elif o == 1:
            x = theta
        else:
            if o_tot < m:
                w_freq = np.sqrt(o_tot / m)
            else:
                w_freq = 1

            w_prod = np.sqrt(o / o_tot)
            
            x = theta + (1-theta) * w_prod * w_freq

        print("x =", x)