### Creating data for our example

In [8]:
import numpy as np
import pandas as pd

In [9]:
n_products = 50  # number of products of opur shop
size = 10000  # size of the muestra

# users with id between 1-1000 can rate products with id between 1-50 with a rating from 1-4
df = pd.DataFrame()
df['userID'] = np.random.randint(0, 1000, size=size)  # 100 people
df['productID'] = np.random.randint(0, n_products, size=size)  # 50 products
df['rating'] = np.random.randint(1, 4, size=size)  # rating from 1-4

# user could rate the same product so better to drop duplicates
df.drop_duplicates(subset=['userID', 'productID'], inplace=True)
print('Everything ok' if not any(df.duplicated(subset=['userID', 'productID'])) else 'Something wrong')

# save into .csv
df.to_csv('product_ratings.csv', index_label=False)

df.head()

Everything ok


Unnamed: 0,userID,productID,rating
0,630,31,1
1,178,44,1
2,853,35,1
3,417,41,3
4,38,24,2


In [15]:
# This table comntains random names for our products
df2 = pd.DataFrame()
df2['productID'] = np.arange(n_products)
df2['productName'] = pd.util.testing.rands_array(10, n_products)
df2.set_index('productID', inplace=True)

df2.to_csv('product_list.csv', index_label=False)
df2.head()

Unnamed: 0_level_0,productName
productID,Unnamed: 1_level_1
0,v9ioqvrenl
1,LYjau9YZsv
2,QQMqz1cTxR
3,4Ai4ND49N2
4,ZzF7UrZmuL


## 1. Non-Machine Learning methods

### 1.1 General Counting

In [12]:
# Based on count

n_products = 5
product_ratings = pd.read_csv('product_ratings.csv')
product_list = pd.read_csv('product_list.csv')
    
# group by and count
counts_df = pd.DataFrame(product_ratings.groupby('productID')['rating'].count())
counts_df.sort_values('rating', ascending=False, inplace=True)
# counts_df.head(10)

# # filter and return most valued product
# counts_df.head(n_products)
most_valued = counts_df.head(n_products).index.tolist()
print(f"Products Id: {most_valued}")

# loc for index (productID) an show names
products_df.loc[most_valued][['productName']]

Products Id: [15, 23, 1, 33, 44]


Unnamed: 0,productName
15,vJLAjNtkNr
23,NLMrD3Bdse
1,N0OLUyY8Im
33,ZUQe4FoLVL
44,up8BYzlGQ7


### 1.2 Rating Average

In [13]:
# Based on average

n_products = 5
product_ratings = pd.read_csv('product_ratings.csv')
product_list = pd.read_csv('product_list.csv')

# group by and count
counts_df = pd.DataFrame(product_ratings.groupby('productID')['rating'].count())
counts_df['rating_avg'] = pd.DataFrame(ratings_df.groupby('productID')['rating'].mean())
counts_df.sort_values('rating_avg', ascending=False, inplace=True)

# filter and return most valued product
most_valued = counts_df.head(n_products).index.tolist()
print(f"Products Id: {most_valued}")

# loc for index (productID) an show names
product_list.loc[most_valued][['productName']]

Products Id: [8, 48, 15, 5, 10]


Unnamed: 0,productName
8,UIMebVAchm
48,Q2pILx17OS
15,u5f5h3NHOO
5,LKwrvrFLEe
10,jiSDMc9p8j


### 1.3 Product Correlation

In [14]:
# Based on corr.

n_products = 5
random_product_id = 7

ratings_df = pd.read_csv('product_ratings.csv')
products_df = pd.read_csv('product_list.csv')

product_name = products_df.loc[random_product_id, 'productName']
print(f"We look for similar products to: '{product_name}' with ID: '{random_product_id}'")

# get id for the product. A bit redundant
index_column = products_df[products_df['productName'] == product_name].index[0]
index_column == random_product_id

# Utility  1000x50  1000 people x 50 product, not all products are rated by users
ratings_pivot = pd.pivot_table(
    data=ratings_df, 
    values='rating', 
    index='userID', 
    columns='productID'
)
# print(ratings_pivot)

# then get vector of correlation for this product respect to the rest of products
# correlation of one column with the rest ??
pearson = ratings_pivot.corrwith(ratings_pivot[index_column])
print('Pearson values')
print(pearson.head(8))

# # make a dataframe with the obtained vector and arrange from the higher to the lowest
similar_products = pd.DataFrame(pearson, columns=['Pearson_Corr'])
similar_products.dropna(inplace=True)
similar_products = similar_products.sort_values('Pearson_Corr', ascending=False)

# make a list of index products with the method tolist()
most_valued = similar_products.head(n_products).index.tolist()

# look for those index in products dataframe and return them
products_df.loc[most_valued][['productName']]

We look for similar products to: 'J6JpB9iWEo' with ID: '7'
Pearson values
productID
0   -0.166667
1    0.038443
2    0.000000
3    0.000000
4   -0.014456
5   -0.154016
6   -0.038580
7    1.000000
dtype: float64


Unnamed: 0,productName
7,J6JpB9iWEo
30,azJ4HVQT3L
22,LIfvpTNxPn
32,JWzawy4AYC
39,YrsqogcT9s
