### Creating data for our example

In [152]:
import numpy as np
import pandas as pd

In [153]:
n_products = 50  # number of products of opur shop
size = 10000  # size of the muestra

# users with id between 1-1000 can rate products with id between 1-50 with a rating from 1-4
df = pd.DataFrame()
df['userID'] = np.random.randint(0, 1000, size=size)  # 100 people
df['productID'] = np.random.randint(0, n_products, size=size)  # 50 products
df['rating'] = np.random.randint(1, 4, size=size)  # rating from 1-4

# user could rate the same product so better to drop duplicates
df.drop_duplicates(subset=['userID', 'productID'], inplace=True)
print('Everything ok' if not any(df.duplicated(subset=['userID', 'productID'])) else 'Something wrong')

# save into .csv
df.to_csv('product_ratings.csv', index_label=False)

df.head()

Everything ok


Unnamed: 0,userID,productID,rating
0,595,45,1
1,447,47,3
2,323,12,3
3,190,18,2
4,263,46,2


In [154]:
# This table comntains random names for our products
df2 = pd.DataFrame()
df2['productID'] = np.arange(n_products)
df2['productName'] = pd.util.testing.rands_array(10, n_products)
df2.set_index('productID', inplace=True)

df2.to_csv('product_list.csv', index_label=False)

In [155]:
df2.head()

Unnamed: 0_level_0,productName
productID,Unnamed: 1_level_1
0,fnAf1n4XHm
1,j1cghyPDVH
2,MzdNHf1BrP
3,vLNIowQPBD
4,nuMpNFXBNz


## 1. Non-Machine Learning methods

### 1.1 General Counting

In [156]:
# Based on count

import pandas as pd

n_products = 5
product_ratings = pd.read_csv('product_ratings.csv')
product_list = pd.read_csv('product_list.csv')
    
# group by and count
counts_df = pd.DataFrame(product_ratings.groupby('productID')['rating'].count())
counts_df.sort_values('rating', ascending=False, inplace=True)
# counts_df.head(10)

# # filter and return most valued product
# counts_df.head(n_products)
most_valued = counts_df.head(n_products).index.tolist()
print(f"Products Id: {most_valued}")

# loc for index (productID) an show names
products_df.loc[most_valued][['productName']]

Products Id: [36, 15, 34, 19, 43]


Unnamed: 0,productName
36,XazvHbzy3w
15,yhdbrLtegK
34,HSuLlR03nT
19,RoHuywtq11
43,rVunlvi4lR


### 1.2 Product Correlation

In [157]:
# Based on avg

import pandas as pd

n_products = 5
product_ratings = pd.read_csv('product_ratings.csv')
product_list = pd.read_csv('product_list.csv')

# group by and count
counts_df = pd.DataFrame(product_ratings.groupby('productID')['rating'].count())
counts_df['rating_avg'] = pd.DataFrame(ratings_df.groupby('productID')['rating'].mean())
counts_df.sort_values('rating_avg', ascending=False, inplace=True)

# filter and return most valued product
most_valued = counts_df.head(n_products).index.tolist()
print(f"Products Id: {most_valued}")

# loc for index (productID) an show names
product_list.loc[most_valued][['productName']]

Products Id: [32, 23, 0, 46, 34]


Unnamed: 0,productName
32,nR6u7GWHMq
23,tehQAmCFkU
0,fnAf1n4XHm
46,x0CP9oZGXB
34,FM3a5RfMal


### 1.3 Rating Average

In [158]:
import pandas as pd

n_products = 5
random_product_id = 7

ratings_df = pd.read_csv('product_ratings.csv')
products_df = pd.read_csv('product_list.csv')

product_name = products_df.loc[random_product_id, 'productName']
print(f"We look for similar products to: '{product_name}' with ID: '{random_product_id}'")

# get id for the product. A bit redundant
index_column = products_df[products_df['productName'] == product_name].index[0]
index_column == random_product_id

# Utility  1000x50  1000 people x 50 product, not all products are rated by users
ratings_pivot = pd.pivot_table(
    data=ratings_df, 
    values='rating', 
    index='userID', 
    columns='productID'
)
# print(ratings_pivot)

# then get vector of correlation for this product respect to the rest of products
# correlation of one column with the rest
pearson = ratings_pivot.corrwith(ratings_pivot[index_column])
print('Pearson values')
print(pearson.head(8))

# # make a dataframe with the obtained vector and arrange from the higher to the lowest
similar_products = pd.DataFrame(pearson, columns=['Pearson_Corr'])
similar_products.dropna(inplace=True)
similar_products = similar_products.sort_values('Pearson_Corr', ascending=False)

# make a list of index products with the method tolist()
most_valued = similar_products.head(n_products).index.tolist()

# look for those index in products dataframe and return them
products_df.loc[most_valued][['productName']]

We look for similar products to: 't2vGpjxGRU' with ID: '7'
Pearson values
productID
0   -0.001387
1   -0.163949
2   -0.133439
3    0.207331
4   -0.130289
5   -0.304819
6    0.262277
7    1.000000
dtype: float64


Unnamed: 0,productName
7,t2vGpjxGRU
11,lmuEiiqjY3
48,snAsGv3ZIr
29,hmVNHvOyfP
6,XZXev2msMB
