In [12]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

### Erstellen der Interaction Matrix
----

In [2]:
# Toy interaction log: one row per purchase, columns = (user id, product name).
# dtype=object stops NumPy from coercing the mixed int/str input to an
# all-string array, so the user ids stay integers.
data = np.array([[1, 1, 2, 3], ['A', 'A', 'B', 'B']], dtype=object).T

In [3]:
# Wrap the toy array in a DataFrame, naming its two columns.
toy_columns = ['user_id', 'product_name']
df = pd.DataFrame(data=data, columns=toy_columns)
df

Unnamed: 0,user_id,product_name
0,1,A
1,1,A
2,2,B
3,3,B


In [4]:
# Build the count interaction matrix: users as rows, products as columns.
# `len` counts the records of each (user_id, product_name) pair; pairs that
# never occur are filled with 0.
interaction_matrix_count = pd.pivot_table(
    df,
    index='user_id',
    columns='product_name',
    aggfunc=len,
    fill_value=0,
)
interaction_matrix_count.head()

product_name,A,B
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2,0
2,0,1
3,0,1


In [5]:
# Start from the count matrix computed above instead of repeating the
# identical pivot; the copy keeps `interaction_matrix_count` untouched.
# Values are still counts at this point.
interaction_matrix_binary = interaction_matrix_count.copy()

In [6]:
def binary(x):
    """Collapse a count to a 0/1 flag.

    Returns 1 when ``x`` is strictly greater than zero, otherwise 0.
    """
    return 1 if x > 0 else 0

In [7]:
# Binarise the matrix: any positive count becomes 1, everything else 0
# (the same rule as applying `binary` element-wise). The result is assigned
# back because the conversion returns a new frame; without the assignment
# `interaction_matrix_binary` would keep holding counts. The vectorised
# comparison also avoids `DataFrame.applymap`, which newer pandas deprecates.
interaction_matrix_binary = interaction_matrix_binary.gt(0).astype(int)
interaction_matrix_binary

product_name,A,B
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0
2,0,1
3,0,1


## Interaction Matrix as sparse Matrix
----

In [43]:
# Load the full retail dataset; from here on `df` refers to this file's
# contents rather than to the toy frame built earlier.
retail_csv = 'Recommender4Retail_big.csv'
df = pd.read_csv(retail_csv)

In [45]:
# Ordered categorical dtypes give every user / product an integer code equal
# to its position in the sorted list of unique values.
user_c = CategoricalDtype(sorted(df.user_id.unique()), ordered=True)
product_name_c = CategoricalDtype(sorted(df.product_name.unique()), ordered=True)

# Matrix coordinates: one (row, col) pair per record of df.
row = df.user_id.astype(user_c).cat.codes
col = df.product_name.astype(product_name_c).cat.codes

# One unit of weight per record. Allocating it directly as a NumPy array
# avoids building a Python list with one element per record first.
val = np.ones(len(col), dtype=int)

# Entries sharing the same (row, col) are summed on construction, so each
# stored cell holds the number of records for that user/product pair.
sparse_matrix = csr_matrix(
    (val, (row, col)),
    shape=(user_c.categories.size, product_name_c.categories.size),
)

In [46]:
# Show the matrix's summary repr (shape, dtype, number of stored elements).
sparse_matrix

<206209x49685 sparse matrix of type '<class 'numpy.int32'>'
	with 13863746 stored elements in Compressed Sparse Row format>

In [48]:
# turn to dense matrix if necessary (caution: 206209 x 49685 is about 10 billion cells and will likely not fit in memory)
# sparse_matrix.todense()

In [50]:
# turn the sparse matrix into a (sparse) DataFrame
# dfs = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=user_c.categories, columns=product_name_c.categories)

In [51]:
# extract one product column and count how often each value occurs across customers in the interaction matrix
# dfs.iloc[:, 401].value_counts()

## User Product Rating
----

In order to get a user-item rating for the interaction matrix, we had to develop a rating function.

The following thoughts were made:
- We want to give sufficient weight to the first product purchase
    - We decided that the first purchase has a weight $\displaystyle \theta = \frac{1}{3}$
- We want to give even more weight to a reorder of a product (this assumes that the user liked the product)
    - To create this gap, the rating of a reordered product starts at $\displaystyle 2 \cdot \theta$
- In addition, we do not want to over-weight multiple reorders, so that users with e.g. 6 or 10 reorders do not differ too much
    - To achieve this, we take the square root of the number of reorders, since $\sqrt{x}$ is concave for $x \geq 0$
- The rating should be a number between 0 and 1

Therefore the following formula has been developed:

$\displaystyle{ rating(o, p) =
  \begin{cases}
    0            & \quad \text{if } o \text{ is } 0\\
    \theta       & \quad \text{if } o \text{ is }1 \land max(p) > 1\\
    2 \cdot \theta + (1 - 2 \cdot \theta) \cdot \sqrt{\frac{o}{max(p)}}  & \quad \text{if } o > 1 \lor \left( o \text{ is }1 \land max(p) = 1 \right)
  \end{cases}}$
    
- where $o$ is the number of orders of the specified product $p$ of the user and $max(p)$ is the max count of all user orders of the specified product $p$.
- $\theta$ must be well defined $0 < \theta < 0.5$, optimally somewhere in the middle

This approach strongly weights the first reorder and does not take a user's total number of orders into account.

In [None]:
def rating(o, max_p, theta):
    """Rate a user's affinity to one product from their order count.

    Parameters
    ----------
    o : int
        Number of times the user ordered the product.
    max_p : int
        The max(p) normaliser of the formula, i.e. the maximum order count
        of the product.
    theta : float
        Rating of a single first purchase; must satisfy 0 < theta < 0.5.

    Returns
    -------
    int or float
        0 if ``o`` is 0, ``theta`` if ``o`` is 1 while ``max_p`` > 1, and
        ``2*theta + (1 - 2*theta) * sqrt(o / max_p)`` otherwise.

    Raises
    ------
    ValueError
        If ``theta`` is not strictly between 0 and 0.5.
    """
    if not 0 < theta < 0.5:
        raise ValueError("theta must satisfy 0 < theta < 0.5")
    if o == 0:
        return 0
    if o == 1 and max_p > 1:
        return theta
    # Reorders (or a single order when nobody ordered more than once) start
    # at 2*theta; the square root dampens the effect of many reorders.
    return 2*theta + (1-2*theta) * np.sqrt(o/max_p)


theta = 0.35
max_ = 10 #max(p)
# Print the rating for every order count from 0 up to max(p).
for o in range(max_ + 1):
    x = rating(o, max_, theta)
    print(x)