In [1]:
import numpy as np
import numpy.random as rand
import pandas as pd
import os, json
import matplotlib.pyplot as plt
from random import randint
import pickle
from sklearn.preprocessing import MinMaxScaler 
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [3]:
# Load the product attribute table produced by the CNN stage.
# The path is relative, so the CSV must sit in the notebook's working directory.
product_prop = pd.read_csv("dataset_from_cnn_all_final.csv")

In [4]:
product_prop

Unnamed: 0,product_id,category_id,bounding_box,occlusion,viewpoint,zoom-in,image,pattern,color
0,P00003,10,"[1, 52, 467, 831]",1,1,2,000003.jpg,plain,Charcoal
1,P00004,10,"[0, 113, 467, 623]",1,2,2,000004.jpg,printed,Charcoal
2,P00005,10,"[1, 98, 467, 814]",1,1,2,000005.jpg,printed,Charcoal
3,P00006,10,"[0, 324, 466, 831]",3,2,2,000006.jpg,plain,Gray
4,P00007,10,"[23, 106, 430, 696]",1,1,1,000007.jpg,plain,Gray
...,...,...,...,...,...,...,...,...,...
11914,P29973,7,"[98, 58, 403, 630]",1,1,2,029973.jpg,plain,Charcoal
11915,P29981,7,"[65, 1, 395, 557]",2,3,2,029981.jpg,plain,Tan
11916,P29997,3,"[0, 17, 467, 623]",1,1,3,029997.jpg,printed,Gray
11917,P29998,3,"[0, 0, 463, 779]",1,2,2,029998.jpg,printed,Gray


In [5]:
# Decode the numeric category flag into a readable garment category.

# Lookup table: category_id -> category name.
# NOTE(review): ids are treated as 0-based (0..12) here; confirm this matches the
# numbering used by the dataset the CSV was derived from.
CATEGORY_NAMES = {
    0: 'short_sleeve_top',
    1: 'long_sleeve_top',
    2: 'short_sleeve_outwear',
    3: 'long_sleeve_outwear',
    4: 'vest',
    5: 'sling',
    6: 'shorts',
    7: 'trousers',
    8: 'skirt',
    9: 'short_sleeve_dress',
    10: 'long_sleeve_dress',
    11: 'vest_dress',
    12: 'sling_dress',
}

# A dict lookup replaces thirteen hand-written boolean masks fed to np.select.
# np.select relied on its integer default (0) being coerced to the string '0' for
# unmatched ids, which newer NumPy versions refuse to do; fillna('0') keeps that
# same placeholder without depending on the coercion.
product_prop['category_name'] = product_prop['category_id'].map(CATEGORY_NAMES).fillna('0')

In [6]:
# Decode the numeric occlusion flag into a readable label.

# Lookup table: occlusion flag -> label.
OCCLUSION_TYPES = {1: 'slight', 2: 'medium', 3: 'heavy'}

# Dict lookup instead of np.select: np.select relied on its integer default (0)
# being coerced to the string '0' for unmatched flags, which newer NumPy versions
# refuse to do. fillna('0') keeps the same placeholder for unmatched flags.
product_prop['occlusion_type'] = product_prop['occlusion'].map(OCCLUSION_TYPES).fillna('0')

In [7]:
# Decode the numeric zoom-in flag into a readable label.

# Lookup table: zoom-in flag -> label.
ZOOM_IN_TYPES = {1: 'no', 2: 'medium', 3: 'heavy'}

# Dict lookup instead of np.select: np.select relied on its integer default (0)
# being coerced to the string '0' for unmatched flags, which newer NumPy versions
# refuse to do. fillna('0') keeps the same placeholder for unmatched flags.
product_prop['zoom_in_type'] = product_prop['zoom-in'].map(ZOOM_IN_TYPES).fillna('0')

In [8]:
# Decode the numeric viewpoint flag into a readable label.

# Lookup table: viewpoint flag -> label.
VIEWPOINT_TYPES = {1: 'no', 2: 'frontal', 3: 'side_back'}

# Dict lookup instead of np.select: np.select relied on its integer default (0)
# being coerced to the string '0' for unmatched flags, which newer NumPy versions
# refuse to do. fillna('0') keeps the same placeholder for unmatched flags.
product_prop['viewpoint_type'] = product_prop['viewpoint'].map(VIEWPOINT_TYPES).fillna('0')

In [9]:
product_prop

Unnamed: 0,product_id,category_id,bounding_box,occlusion,viewpoint,zoom-in,image,pattern,color,category_name,occlusion_type,zoom_in_type,viewpoint_type
0,P00003,10,"[1, 52, 467, 831]",1,1,2,000003.jpg,plain,Charcoal,long_sleeve_dress,slight,medium,no
1,P00004,10,"[0, 113, 467, 623]",1,2,2,000004.jpg,printed,Charcoal,long_sleeve_dress,slight,medium,frontal
2,P00005,10,"[1, 98, 467, 814]",1,1,2,000005.jpg,printed,Charcoal,long_sleeve_dress,slight,medium,no
3,P00006,10,"[0, 324, 466, 831]",3,2,2,000006.jpg,plain,Gray,long_sleeve_dress,heavy,medium,frontal
4,P00007,10,"[23, 106, 430, 696]",1,1,1,000007.jpg,plain,Gray,long_sleeve_dress,slight,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11914,P29973,7,"[98, 58, 403, 630]",1,1,2,029973.jpg,plain,Charcoal,trousers,slight,medium,no
11915,P29981,7,"[65, 1, 395, 557]",2,3,2,029981.jpg,plain,Tan,trousers,medium,medium,side_back
11916,P29997,3,"[0, 17, 467, 623]",1,1,3,029997.jpg,printed,Gray,long_sleeve_outwear,slight,heavy,no
11917,P29998,3,"[0, 0, 463, 779]",1,2,2,029998.jpg,printed,Gray,long_sleeve_outwear,slight,medium,frontal


In [10]:
# The numeric flags now have decoded counterparts, and the bounding box / image
# name are not used as features, so drop them from the attribute table.
product_prop = product_prop.drop(
    columns=['category_id', 'bounding_box', 'image', 'occlusion', 'viewpoint', 'zoom-in']
)

In [11]:
product_prop

Unnamed: 0,product_id,pattern,color,category_name,occlusion_type,zoom_in_type,viewpoint_type
0,P00003,plain,Charcoal,long_sleeve_dress,slight,medium,no
1,P00004,printed,Charcoal,long_sleeve_dress,slight,medium,frontal
2,P00005,printed,Charcoal,long_sleeve_dress,slight,medium,no
3,P00006,plain,Gray,long_sleeve_dress,heavy,medium,frontal
4,P00007,plain,Gray,long_sleeve_dress,slight,no,no
...,...,...,...,...,...,...,...
11914,P29973,plain,Charcoal,trousers,slight,medium,no
11915,P29981,plain,Tan,trousers,medium,medium,side_back
11916,P29997,printed,Gray,long_sleeve_outwear,slight,heavy,no
11917,P29998,printed,Gray,long_sleeve_outwear,slight,medium,frontal


In [5]:
def associateUsersProducts(users, pc, ppu, products=None, max_products=250, seed=None):
    """Build a synthetic user -> products / ratings table.

    Each user is assigned a random number of products (between 1 and ``ppu``),
    drawn uniformly *with replacement* from the product catalogue, together with
    one integer rating in 1..5 per drawn product.

    Parameters
    ----------
    users : int
        Number of users to create. Ids are "U" followed by the 1-based user number,
        zero-padded to the number of digits in ``users``.
    pc : int
        Products count. Not used by the function; kept so existing positional
        callers keep working.
    ppu : int
        Upper bound (inclusive) on the number of products drawn for each user.
    products : sequence of str, optional
        Catalogue of product ids to draw from. Defaults to the unique
        ``product_id`` values of the notebook-level ``product_prop`` frame.
    max_products : int or None, optional
        Only the first ``max_products`` catalogue entries are used (default 250,
        the value that was previously hard-coded). ``None`` uses the whole catalogue.
    seed : int, optional
        When given, all random draws come from a private ``RandomState(seed)`` so
        the output is reproducible. When omitted, the global ``numpy.random``
        state is used, as before.

    Returns
    -------
    dict
        Keys ``'user_id'``, ``'product_ids'``, ``'product_reviews'`` and
        ``'products_count'``, each a list with one entry per user. Because
        products are drawn with replacement, a user's ``product_ids`` list may
        contain repeats.

    Raises
    ------
    ValueError
        If the catalogue is empty (numpy raises the same error type for ``ppu < 1``).
    """
    # A private generator when seeded, otherwise the module-level functions; both
    # expose the same randint / normal API.
    rng = rand.RandomState(seed) if seed is not None else rand

    if products is None:
        products = list(product_prop['product_id'].unique())
    products_list = list(products)[:max_products]
    if not products_list:
        raise ValueError("no products available to assign to users")

    id_width = len(str(users))
    dataset = {
        'user_id': ["U" + str(i).zfill(id_width) for i in range(1, users + 1)],
        'product_ids': [],
        'product_reviews': [],
        'products_count': [],
    }

    for _ in range(users):
        total_products_count = int(rng.randint(1, ppu + 1))

        picks = rng.randint(0, len(products_list), total_products_count).tolist()
        dataset['product_ids'].append([products_list[x] for x in picks])

        # Ratings: a N(0, 2.5) draw is wrapped into [0, 4) by the modulo, shifted to
        # [1, 5) and rounded to the nearest integer, so every rating lands in 1..5.
        # The wrap-around means the result is NOT normally distributed.
        noise = rng.normal(0, 2.5, total_products_count)
        dataset['product_reviews'].append(np.rint(1 + noise % 4).astype(int).tolist())

        dataset['products_count'].append(total_products_count)

    return dataset

In [7]:
# Arguments: number of users, products count, maximum number of products per user.
# NOTE: the function never reads the second argument (pc); it draws from the first
# 250 unique product ids of product_prop. Each user gets between 1 and 80 draws.
dataset = associateUsersProducts(100,10000,80)

In [8]:
# Turn the dict of per-user lists into a table with one row per user.
data = pd.DataFrame(dataset)
data

Unnamed: 0,user_id,product_ids,product_reviews,products_count
0,U001,"[P00715, P00265, P00237, P00524, P00588, P0032...","[4, 4, 1, 5, 3, 5, 2, 3, 3, 2, 3, 1, 3, 4, 4, ...",73
1,U002,"[P00216, P00006, P00229, P00423, P00711, P0033...","[1, 3, 2, 2, 2, 2, 1, 2, 3, 3, 4, 4, 5, 1, 3, ...",22
2,U003,"[P00457, P00165, P00438, P00430, P00464, P0022...","[4, 5, 5, 4, 4, 4, 5, 2, 1, 1, 5, 2, 4, 4, 4, 2]",16
3,U004,"[P00010, P00271, P00711, P00409, P00272, P0027...","[4, 4, 2, 5, 3, 4, 1, 5, 5, 1, 5, 3, 1, 2, 1, ...",78
4,U005,"[P00393, P00511, P00217, P00394, P00566, P0016...","[3, 1, 2, 4, 3, 4, 3, 4, 4, 4, 2, 2, 2, 2, 2, ...",23
...,...,...,...,...
95,U096,"[P00523, P00168, P00459, P00101, P00514, P0054...","[4, 2, 5, 4, 4, 4, 4, 3, 2, 5, 4, 3, 5, 2, 5, ...",77
96,U097,"[P00358, P00163, P00144, P00151, P00393, P0003...","[3, 4, 5, 3, 4, 2, 4, 4]",8
97,U098,"[P00010, P00008, P00365, P00427, P00271, P0041...","[4, 1, 2, 1, 1, 3, 4, 2, 4, 4, 2, 2, 4, 4, 1, ...",49
98,U099,"[P00409, P00451, P00154, P00589, P00450, P0042...","[3, 5, 4, 4, 2, 2, 2, 3, 4, 3, 2, 3]",12


In [9]:
# Save the generated table so it can be reloaded later.
# The generator call above passes no seed, so re-running it gives a different draw;
# this file is what preserves the one generated here.
data.to_pickle('user_preference_all.pkl')

In [12]:
# Reload the saved user/product table and confirm it comes back as a DataFrame.
user_preference = pd.read_pickle('user_preference_all.pkl')
print(type(user_preference))

<class 'pandas.core.frame.DataFrame'>


In [13]:
# Long form: one row per (user, product) pair instead of one row per user.
# Every column is exploded; the list columns expand, the scalar ones are repeated.
# The old row number is kept as an 'index' column by reset_index().
exploded = user_preference.apply(lambda column: column.explode())
user_preference = exploded.reset_index()

In [14]:
user_preference

Unnamed: 0,index,user_id,product_ids,product_reviews,products_count
0,0,U001,P00715,4,73
1,0,U001,P00265,4,73
2,0,U001,P00237,1,73
3,0,U001,P00524,5,73
4,0,U001,P00588,3,73
...,...,...,...,...,...
3948,99,U100,P00433,3,66
3949,99,U100,P00003,3,66
3950,99,U100,P00624,4,66
3951,99,U100,P00440,3,66


In [15]:
# Products are drawn with replacement, so a user can hold the same product more
# than once; keep only the first occurrence (and its rating) per user/product pair.
user_preference = user_preference.drop_duplicates(subset=['user_id', 'product_ids'], keep='first')

In [16]:
# Each row now holds a single product, so use the singular name; it also matches
# the 'product_id' key of product_prop used for the merge.
user_preference = user_preference.rename(columns={"product_ids": "product_id"})

In [17]:
# Attach the product attributes (pattern, color, category, ...) to every rating row.
# A left join keeps every rating even if a product had no attribute row.
cbf_df = pd.merge(user_preference, product_prop, how="left", on="product_id")

In [18]:
cbf_df

Unnamed: 0,index,user_id,product_id,product_reviews,products_count,pattern,color,category_name,occlusion_type,zoom_in_type,viewpoint_type
0,0,U001,P00715,4,73,plain,Silver,long_sleeve_dress,slight,no,frontal
1,0,U001,P00265,4,73,plain,Silver,long_sleeve_dress,medium,no,frontal
2,0,U001,P00237,1,73,plain,Gray,long_sleeve_top,medium,heavy,frontal
3,0,U001,P00524,5,73,printed,Silver,vest_dress,medium,medium,frontal
4,0,U001,P00588,3,73,plain,Gray,long_sleeve_top,slight,no,no
...,...,...,...,...,...,...,...,...,...,...,...
3542,99,U100,P00534,2,66,plain,Charcoal,trousers,medium,no,frontal
3543,99,U100,P00433,3,66,plain,Silver,short_sleeve_top,slight,medium,frontal
3544,99,U100,P00003,3,66,plain,Charcoal,long_sleeve_dress,slight,medium,no
3545,99,U100,P00624,4,66,printed,Gray,shorts,slight,medium,frontal


In [19]:
# 'products_count' and the leftover 'index' column are bookkeeping from the
# generation / explode steps and are not needed as features.
cbf_df = cbf_df.drop(columns=['products_count', 'index'])

In [20]:
cbf_df

Unnamed: 0,user_id,product_id,product_reviews,pattern,color,category_name,occlusion_type,zoom_in_type,viewpoint_type
0,U001,P00715,4,plain,Silver,long_sleeve_dress,slight,no,frontal
1,U001,P00265,4,plain,Silver,long_sleeve_dress,medium,no,frontal
2,U001,P00237,1,plain,Gray,long_sleeve_top,medium,heavy,frontal
3,U001,P00524,5,printed,Silver,vest_dress,medium,medium,frontal
4,U001,P00588,3,plain,Gray,long_sleeve_top,slight,no,no
...,...,...,...,...,...,...,...,...,...
3542,U100,P00534,2,plain,Charcoal,trousers,medium,no,frontal
3543,U100,P00433,3,plain,Silver,short_sleeve_top,slight,medium,frontal
3544,U100,P00003,3,plain,Charcoal,long_sleeve_dress,slight,medium,no
3545,U100,P00624,4,printed,Gray,shorts,slight,medium,frontal


In [21]:
# Rescale the raw ratings linearly into [0.1, 1] so they can later be compared
# with the predicted ratings. The scaler is fitted on the ratings actually present.
rating_column = cbf_df['product_reviews'].values.reshape(-1, 1)
min_max = MinMaxScaler(feature_range=(0.1, 1))
cbf_df['standardized_rating'] = min_max.fit_transform(rating_column)

In [22]:
# Sanity check: list every distinct value found across the raw and rescaled rating columns.
print(np.unique(cbf_df[['product_reviews', 'standardized_rating']].values))

[0.1 0.325 0.55 0.775 1.0 2 3 4 5]


**Build CBF on pattern,color and category**

In [23]:
# Step 1: product x pattern table, ordered by product id.
pattern_product = cbf_df[['product_id', 'pattern']].sort_values(by='product_id')

In [26]:
# cbf_df has one row per rating, so a product appears once per user who rated it;
# collapse to one row per (product, pattern) pair.
pattern_product = pattern_product.drop_duplicates(subset=['product_id','pattern'], keep='first')

In [27]:
pattern_product

Unnamed: 0,product_id,pattern
1169,P00003,plain
2539,P00004,printed
1972,P00005,printed
244,P00006,plain
2004,P00007,plain
...,...,...
502,P00712,plain
840,P00713,printed
2581,P00714,printed
347,P00715,plain


In [28]:
# Patterns are strings, so vectorise them with TF-IDF to get numeric features.
pattern_texts = pattern_product['pattern'].tolist()
tfidf = TfidfVectorizer()
tfidf_fit = tfidf.fit_transform(pattern_texts)

In [29]:
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides and keep a
# plain list so downstream use is unchanged.
cols = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)

In [30]:
# Product x pattern feature matrix. The TF-IDF weights are rounded to 3 decimals
# and shifted by 0.01 so that no entry is exactly zero.
pattern_prod_matrix = pd.DataFrame(
    tfidf_fit.toarray().round(3),
    index=pattern_product['product_id'].tolist(),
    columns=cols,
) + 0.01


In [31]:
pattern_prod_matrix

Unnamed: 0,plain,printed,stripes
P00003,1.01,0.01,0.01
P00004,0.01,1.01,0.01
P00005,0.01,1.01,0.01
P00006,1.01,0.01,0.01
P00007,1.01,0.01,0.01
...,...,...,...
P00712,1.01,0.01,0.01
P00713,0.01,1.01,0.01
P00714,0.01,1.01,0.01
P00715,1.01,0.01,0.01


In [32]:
# Step 2: product x color table, ordered by product id.
color_product = cbf_df[['product_id', 'color']].sort_values(by='product_id')

In [33]:
color_product

Unnamed: 0,product_id,color
1169,P00003,Charcoal
1592,P00003,Charcoal
1135,P00003,Charcoal
1979,P00003,Charcoal
3544,P00003,Charcoal
...,...,...
1888,P00716,Charcoal
1922,P00716,Charcoal
2527,P00716,Charcoal
2323,P00716,Charcoal


In [34]:
# One row per (product, color) pair; the repeats come from multiple users rating the same product.
color_product = color_product.drop_duplicates(subset=['product_id','color'], keep='first')

In [35]:
# Colors are strings, so vectorise them with TF-IDF to get numeric features.
color_texts = color_product['color'].tolist()
tfidf = TfidfVectorizer()
tfidf_fit_1 = tfidf.fit_transform(color_texts)

In [36]:
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides and keep a
# plain list so the displayed value and downstream use are unchanged.
cols_1 = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
cols_1

['bronze',
 'brown',
 'charcoal',
 'coral',
 'gray',
 'lavender',
 'pink',
 'rust',
 'silver',
 'tan']

In [37]:
# Product x color feature matrix. The TF-IDF weights are rounded to 3 decimals
# and shifted by 0.01 so that no entry is exactly zero.
color_prod_matrix = pd.DataFrame(
    tfidf_fit_1.toarray().round(3),
    index=color_product['product_id'].tolist(),
    columns=cols_1,
) + 0.01

In [38]:
color_prod_matrix

Unnamed: 0,bronze,brown,charcoal,coral,gray,lavender,pink,rust,silver,tan
P00003,0.01,0.01,1.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
P00004,0.01,0.01,1.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
P00005,0.01,0.01,1.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
P00006,0.01,0.01,0.01,0.01,1.01,0.01,0.01,0.01,0.01,0.01
P00007,0.01,0.01,0.01,0.01,1.01,0.01,0.01,0.01,0.01,0.01
...,...,...,...,...,...,...,...,...,...,...
P00712,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,1.01,0.01
P00713,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,1.01,0.01
P00714,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,1.01,0.01
P00715,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,1.01,0.01


In [39]:
# Step 3: product x category table, ordered by product id.
category_product = cbf_df[['product_id', 'category_name']].sort_values(by='product_id')

In [40]:
category_product

Unnamed: 0,product_id,category_name
1169,P00003,long_sleeve_dress
1592,P00003,long_sleeve_dress
1135,P00003,long_sleeve_dress
1979,P00003,long_sleeve_dress
3544,P00003,long_sleeve_dress
...,...,...
1888,P00716,long_sleeve_dress
1922,P00716,long_sleeve_dress
2527,P00716,long_sleeve_dress
2323,P00716,long_sleeve_dress


In [41]:
# One row per (product, category) pair; the repeats come from multiple users rating the same product.
category_product = category_product.drop_duplicates(subset=['product_id','category_name'], keep='first')

In [42]:
# Categories are strings, so vectorise them with TF-IDF to get numeric features.
category_texts = category_product['category_name'].tolist()
tfidf = TfidfVectorizer()
tfidf_fit_2 = tfidf.fit_transform(category_texts)

In [43]:
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use whichever this installation provides and keep a
# plain list so the displayed value and downstream use are unchanged.
cols_2 = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
cols_2

['long_sleeve_dress',
 'long_sleeve_outwear',
 'long_sleeve_top',
 'short_sleeve_dress',
 'short_sleeve_top',
 'shorts',
 'trousers',
 'vest',
 'vest_dress']

In [44]:
# Product x category feature matrix. The TF-IDF weights are rounded to 3 decimals
# and shifted by 0.01 so that no entry is exactly zero.
category_prod_matrix = pd.DataFrame(
    tfidf_fit_2.toarray().round(3),
    index=category_product['product_id'].tolist(),
    columns=cols_2,
) + 0.01

In [45]:
category_prod_matrix

Unnamed: 0,long_sleeve_dress,long_sleeve_outwear,long_sleeve_top,short_sleeve_dress,short_sleeve_top,shorts,trousers,vest,vest_dress
P00003,1.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
P00004,1.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
P00005,1.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
P00006,1.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
P00007,1.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
...,...,...,...,...,...,...,...,...,...
P00712,1.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
P00713,1.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
P00714,1.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
P00715,1.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01


Finding cosine similarity between each attribute for all products

In [46]:
# Product x product similarity on pattern, labelled by product id on both axes.
pattern_ids = pattern_product['product_id']
cosine_sim_matrix_pattern = pd.DataFrame(
    cosine_similarity(pattern_prod_matrix.values),
    index=pattern_ids,
    columns=pattern_ids,
)

In [47]:
cosine_sim_matrix_pattern

product_id,P00003,P00004,P00005,P00006,P00007,P00008,P00009,P00010,P00015,P00020,...,P00686,P00688,P00690,P00693,P00711,P00712,P00713,P00714,P00715,P00716
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P00003,1.000000,0.019896,0.019896,1.000000,1.000000,0.019896,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000
P00004,0.019896,1.000000,1.000000,0.019896,0.019896,1.000000,0.019896,0.019896,0.019896,0.019896,...,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896,1.000000,1.000000,0.019896,0.019896
P00005,0.019896,1.000000,1.000000,0.019896,0.019896,1.000000,0.019896,0.019896,0.019896,0.019896,...,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896,1.000000,1.000000,0.019896,0.019896
P00006,1.000000,0.019896,0.019896,1.000000,1.000000,0.019896,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000
P00007,1.000000,0.019896,0.019896,1.000000,1.000000,0.019896,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P00712,1.000000,0.019896,0.019896,1.000000,1.000000,0.019896,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000
P00713,0.019896,1.000000,1.000000,0.019896,0.019896,1.000000,0.019896,0.019896,0.019896,0.019896,...,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896,1.000000,1.000000,0.019896,0.019896
P00714,0.019896,1.000000,1.000000,0.019896,0.019896,1.000000,0.019896,0.019896,0.019896,0.019896,...,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896,1.000000,1.000000,0.019896,0.019896
P00715,1.000000,0.019896,0.019896,1.000000,1.000000,0.019896,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000


In [48]:
# Product x product similarity on color, labelled by product id on both axes.
color_ids = color_product['product_id']
cosine_sim_matrix_color = pd.DataFrame(
    cosine_similarity(color_prod_matrix.values),
    index=color_ids,
    columns=color_ids,
)

In [49]:
cosine_sim_matrix_color

product_id,P00003,P00004,P00005,P00006,P00007,P00008,P00009,P00010,P00015,P00020,...,P00686,P00688,P00690,P00693,P00711,P00712,P00713,P00714,P00715,P00716
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P00003,1.000000,1.000000,1.000000,0.020568,0.020568,0.020568,1.000000,1.000000,0.020568,0.020568,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.020568,0.020568,0.020568,0.020568,1.000000
P00004,1.000000,1.000000,1.000000,0.020568,0.020568,0.020568,1.000000,1.000000,0.020568,0.020568,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.020568,0.020568,0.020568,0.020568,1.000000
P00005,1.000000,1.000000,1.000000,0.020568,0.020568,0.020568,1.000000,1.000000,0.020568,0.020568,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.020568,0.020568,0.020568,0.020568,1.000000
P00006,0.020568,0.020568,0.020568,1.000000,1.000000,1.000000,0.020568,0.020568,1.000000,1.000000,...,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568
P00007,0.020568,0.020568,0.020568,1.000000,1.000000,1.000000,0.020568,0.020568,1.000000,1.000000,...,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P00712,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,...,0.020568,0.020568,0.020568,0.020568,0.020568,1.000000,1.000000,1.000000,1.000000,0.020568
P00713,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,...,0.020568,0.020568,0.020568,0.020568,0.020568,1.000000,1.000000,1.000000,1.000000,0.020568
P00714,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,...,0.020568,0.020568,0.020568,0.020568,0.020568,1.000000,1.000000,1.000000,1.000000,0.020568
P00715,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,0.020568,...,0.020568,0.020568,0.020568,0.020568,0.020568,1.000000,1.000000,1.000000,1.000000,0.020568


In [50]:
# Product x product similarity on category, labelled by product id on both axes.
category_ids = category_product['product_id']
cosine_sim_matrix_category = pd.DataFrame(
    cosine_similarity(category_prod_matrix.values),
    index=category_ids,
    columns=category_ids,
)

In [51]:
cosine_sim_matrix_category

product_id,P00003,P00004,P00005,P00006,P00007,P00008,P00009,P00010,P00015,P00020,...,P00686,P00688,P00690,P00693,P00711,P00712,P00713,P00714,P00715,P00716
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P00003,1.0,1.0,1.0,1.0,1.0,1.0,0.020472,0.020472,0.020472,0.020472,...,0.020472,0.020472,0.020472,0.020472,1.0,1.0,1.0,1.0,1.0,1.0
P00004,1.0,1.0,1.0,1.0,1.0,1.0,0.020472,0.020472,0.020472,0.020472,...,0.020472,0.020472,0.020472,0.020472,1.0,1.0,1.0,1.0,1.0,1.0
P00005,1.0,1.0,1.0,1.0,1.0,1.0,0.020472,0.020472,0.020472,0.020472,...,0.020472,0.020472,0.020472,0.020472,1.0,1.0,1.0,1.0,1.0,1.0
P00006,1.0,1.0,1.0,1.0,1.0,1.0,0.020472,0.020472,0.020472,0.020472,...,0.020472,0.020472,0.020472,0.020472,1.0,1.0,1.0,1.0,1.0,1.0
P00007,1.0,1.0,1.0,1.0,1.0,1.0,0.020472,0.020472,0.020472,0.020472,...,0.020472,0.020472,0.020472,0.020472,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P00712,1.0,1.0,1.0,1.0,1.0,1.0,0.020472,0.020472,0.020472,0.020472,...,0.020472,0.020472,0.020472,0.020472,1.0,1.0,1.0,1.0,1.0,1.0
P00713,1.0,1.0,1.0,1.0,1.0,1.0,0.020472,0.020472,0.020472,0.020472,...,0.020472,0.020472,0.020472,0.020472,1.0,1.0,1.0,1.0,1.0,1.0
P00714,1.0,1.0,1.0,1.0,1.0,1.0,0.020472,0.020472,0.020472,0.020472,...,0.020472,0.020472,0.020472,0.020472,1.0,1.0,1.0,1.0,1.0,1.0
P00715,1.0,1.0,1.0,1.0,1.0,1.0,0.020472,0.020472,0.020472,0.020472,...,0.020472,0.020472,0.020472,0.020472,1.0,1.0,1.0,1.0,1.0,1.0


In [52]:
# Overall similarity = element-wise product of the three per-attribute similarities
# (aligned on product id), so two products score high only if they agree on
# pattern, color and category.
final_similarity_matrix = (
    cosine_sim_matrix_pattern
    * cosine_sim_matrix_color
    * cosine_sim_matrix_category
)



In [53]:
final_similarity_matrix

product_id,P00003,P00004,P00005,P00006,P00007,P00008,P00009,P00010,P00015,P00020,...,P00686,P00688,P00690,P00693,P00711,P00712,P00713,P00714,P00715,P00716
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P00003,1.000000,0.019896,0.019896,0.020568,0.020568,0.000409,0.020472,0.020472,0.000421,0.000421,...,0.020472,0.020472,0.020472,0.020472,1.000000,0.020568,0.000409,0.000409,0.020568,1.000000
P00004,0.019896,1.000000,1.000000,0.000409,0.000409,0.020568,0.000407,0.000407,0.000008,0.000008,...,0.000407,0.000407,0.000407,0.000407,0.019896,0.000409,0.020568,0.020568,0.000409,0.019896
P00005,0.019896,1.000000,1.000000,0.000409,0.000409,0.020568,0.000407,0.000407,0.000008,0.000008,...,0.000407,0.000407,0.000407,0.000407,0.019896,0.000409,0.020568,0.020568,0.000409,0.019896
P00006,0.020568,0.000409,0.000409,1.000000,1.000000,0.019896,0.000421,0.000421,0.020472,0.020472,...,0.000421,0.000421,0.000421,0.000421,0.020568,0.020568,0.000409,0.000409,0.020568,0.020568
P00007,0.020568,0.000409,0.000409,1.000000,1.000000,0.019896,0.000421,0.000421,0.020472,0.020472,...,0.000421,0.000421,0.000421,0.000421,0.020568,0.020568,0.000409,0.000409,0.020568,0.020568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P00712,0.020568,0.000409,0.000409,0.020568,0.020568,0.000409,0.000421,0.000421,0.000421,0.000421,...,0.000421,0.000421,0.000421,0.000421,0.020568,1.000000,0.019896,0.019896,1.000000,0.020568
P00713,0.000409,0.020568,0.020568,0.000409,0.000409,0.020568,0.000008,0.000008,0.000008,0.000008,...,0.000008,0.000008,0.000008,0.000008,0.000409,0.019896,1.000000,1.000000,0.019896,0.000409
P00714,0.000409,0.020568,0.020568,0.000409,0.000409,0.020568,0.000008,0.000008,0.000008,0.000008,...,0.000008,0.000008,0.000008,0.000008,0.000409,0.019896,1.000000,1.000000,0.019896,0.000409
P00715,0.020568,0.000409,0.000409,0.020568,0.020568,0.000409,0.000421,0.000421,0.000421,0.000421,...,0.000421,0.000421,0.000421,0.000421,0.020568,1.000000,0.019896,0.019896,1.000000,0.020568


In [54]:
# User x product matrix of raw ratings: one row per user, one column per product.
user_prod_matrix = cbf_df.pivot(index='user_id', columns='product_id', values='product_reviews')

In [55]:
# Products a user has not rated are NaN after the pivot; replace them with 0 so
# they contribute nothing when the matrix is multiplied by the similarity matrix.
user_prod_matrix = user_prod_matrix.fillna(0)

In [56]:
user_prod_matrix

product_id,P00003,P00004,P00005,P00006,P00007,P00008,P00009,P00010,P00015,P00020,...,P00686,P00688,P00690,P00693,P00711,P00712,P00713,P00714,P00715,P00716
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U001,0,0,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,3,3,4,3
U002,0,0,0,3,0,0,3,4,0,0,...,4,0,0,0,2,0,0,0,0,0
U003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U004,0,4,0,0,0,4,0,4,0,0,...,4,0,0,0,2,2,0,0,2,2
U005,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U096,0,4,0,3,0,0,0,0,0,2,...,0,0,0,3,0,0,4,2,0,0
U097,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U098,0,2,0,0,0,1,0,4,3,0,...,0,0,0,4,0,0,0,4,0,0
U099,0,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# Predicted score for (user, product) = sum over the user's rated products of
# rating x similarity to the target product.
# NOTE(review): this is a plain weighted sum and is not divided by the sum of the
# similarities, so users with more ratings accumulate larger scores — confirm this
# is intended.
weighted_ratings = user_prod_matrix @ final_similarity_matrix

In [58]:
weighted_ratings

product_id,P00003,P00004,P00005,P00006,P00007,P00008,P00009,P00010,P00015,P00020,...,P00686,P00688,P00690,P00693,P00711,P00712,P00713,P00714,P00715,P00716
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U001,9.150738,0.605544,0.605544,9.633959,9.633959,0.715373,0.705696,0.705696,7.923670,7.923670,...,7.602285,7.602285,7.602285,7.602285,9.150738,30.113060,12.771143,12.771143,30.113060,9.150738
U002,2.622891,0.118399,0.118399,7.162323,7.162323,0.369061,7.378699,7.378699,0.405561,0.405561,...,4.440116,4.440116,4.440116,4.440116,2.622891,0.421887,2.012653,2.012653,0.421887,2.622891
U003,0.444072,0.133399,0.133399,6.378822,6.378822,0.151262,0.302642,0.302642,5.278012,5.278012,...,0.282495,0.282495,0.282495,0.282495,0.444072,6.338156,1.129497,1.129497,6.338156,0.444072
U004,5.230228,4.444330,4.444330,6.817575,6.817575,4.796602,4.831123,4.831123,15.301041,15.301041,...,7.704456,7.704456,7.704456,7.704456,5.230228,26.855211,6.752499,6.752499,26.855211,5.230228
U005,0.160136,0.237151,0.237151,2.422957,2.422957,0.442517,0.158933,0.158933,4.455044,4.455044,...,0.075940,0.075940,0.075940,0.075940,0.160136,2.393839,9.052904,9.052904,2.393839,0.160136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U096,1.521820,4.864002,4.864002,9.558584,9.558584,5.063988,3.612453,3.612453,5.124430,5.124430,...,11.565549,11.565549,11.565549,11.565549,1.521820,34.970792,18.016563,18.016563,34.970792,1.521820
U097,0.170515,0.066889,0.066889,0.271568,0.271568,0.108986,0.170515,0.170515,0.271568,0.271568,...,0.251103,0.251103,0.251103,0.251103,0.170515,0.008910,0.003545,0.003545,0.008910,0.170515
U098,3.052458,2.455513,2.455513,2.851714,2.851714,1.432388,8.725864,8.725864,5.666065,5.666065,...,4.667526,4.667526,4.667526,4.667526,3.052458,18.535535,8.437400,8.437400,18.535535,3.052458
U099,0.091632,0.004769,0.004769,0.335037,0.335037,0.149914,0.153676,0.153676,3.351576,3.351576,...,0.131926,0.131926,0.131926,0.131926,0.091632,0.151785,0.005966,0.005966,0.151785,0.091632


In [59]:
# Rescale the weighted scores into [0.1, 1]. The result is a bare NumPy array, so
# the user / product labels are lost here.
# NOTE(review): MinMaxScaler rescales each column independently, i.e. every
# product's scores are normalised across users rather than on one global scale, so
# these values are not directly comparable with standardized_rating — confirm
# this is intended.
min_max = MinMaxScaler(feature_range=(0.1, 1))
weighted_ratings = min_max.fit_transform(weighted_ratings.values)

In [60]:
weighted_ratings

array([[0.67015194, 0.19575889, 0.19575889, ..., 0.73796948, 0.74981278,
        0.67015194],
       [0.26333301, 0.11871797, 0.11871797, ..., 0.20053828, 0.10910305,
        0.26333301],
       [0.12754789, 0.12109016, 0.12109016, ..., 0.15642097, 0.236771  ,
        0.12754789],
       ...,
       [0.29010389, 0.48832784, 0.48832784, ..., 0.52148103, 0.49997985,
        0.29010389],
       [0.10558362, 0.10074766, 0.10074766, ..., 0.10029595, 0.10327448,
        0.10558362],
       [0.53194015, 0.82089228, 0.82089228, ..., 0.38536609, 0.57003382,
        0.53194015]])

In [61]:
# Re-attach the user / product labels to the scaled array.
# The labels are taken from the matrices that produced the array: rows follow
# user_prod_matrix and columns follow final_similarity_matrix (the right-hand
# operand of the dot product). Re-sorting the unique ids of cbf_df instead would
# silently mislabel the scores if the two orderings ever differed.
weighted_ratings_df = pd.DataFrame(weighted_ratings,
                                   index=list(user_prod_matrix.index),
                                   columns=list(final_similarity_matrix.columns))

In [62]:
# Reshape the wide user x product table into long form: one row per (user, product)
# pair, with the former row / column labels turned into ordinary columns.
weighted_ratings_df = weighted_ratings_df.stack().reset_index()

In [63]:
# Give the auto-generated column names produced by stack().reset_index() meaningful names.
weighted_ratings_df = weighted_ratings_df.rename(columns={'level_0':'user_id','level_1':'product_id',
                                                  0:'predicted_rating'})

In [64]:
weighted_ratings_df

Unnamed: 0,user_id,product_id,predicted_rating
0,U001,P00003,0.670152
1,U001,P00004,0.195759
2,U001,P00005,0.195759
3,U001,P00006,0.671890
4,U001,P00007,0.671890
...,...,...,...
24995,U100,P00712,0.570034
24996,U100,P00713,0.385366
24997,U100,P00714,0.385366
24998,U100,P00715,0.570034


In [83]:
# Attach the product attributes to every predicted (user, product) score.
prod_attr = pd.merge(weighted_ratings_df, product_prop, how='left', on=['product_id'])

In [84]:
prod_attr

Unnamed: 0,user_id,product_id,predicted_rating,pattern,color,category_name,occlusion_type,zoom_in_type,viewpoint_type
0,U001,P00003,0.670152,plain,Charcoal,long_sleeve_dress,slight,medium,no
1,U001,P00004,0.195759,printed,Charcoal,long_sleeve_dress,slight,medium,frontal
2,U001,P00005,0.195759,printed,Charcoal,long_sleeve_dress,slight,medium,no
3,U001,P00006,0.671890,plain,Gray,long_sleeve_dress,heavy,medium,frontal
4,U001,P00007,0.671890,plain,Gray,long_sleeve_dress,slight,no,no
...,...,...,...,...,...,...,...,...,...
24995,U100,P00712,0.570034,plain,Silver,long_sleeve_dress,slight,medium,side_back
24996,U100,P00713,0.385366,printed,Silver,long_sleeve_dress,slight,no,frontal
24997,U100,P00714,0.385366,printed,Silver,long_sleeve_dress,slight,no,frontal
24998,U100,P00715,0.570034,plain,Silver,long_sleeve_dress,slight,no,frontal


In [85]:
# Highest predicted ratings first.
prod_attr = prod_attr.sort_values(by=['predicted_rating'], ascending=False)

In [163]:
#Checking how ratings are being calculated for a user

In [86]:
#Predictions for user U037 only
user_037_pred = prod_attr.loc[prod_attr['user_id'].eq("U037")]

In [87]:
#Rows of cbf_df that belong to user U037
user_037_act = cbf_df.loc[cbf_df['user_id'].eq("U037")]

In [88]:
#Left join keeps every prediction; the cbf_df columns are NaN where no matching row exists
u037_merged = pd.merge(user_037_pred, user_037_act, how='left', on=['user_id', 'product_id'])

In [None]:
#This u037_merged table contains predicted ratings for all products of the user U037

In [89]:
#The predicted_rating is standardized (between 0 and 1). standardized_rating is the standardized version
#of the product_reviews column. If product_reviews is NaN, the left merge found no row in cbf_df for that
#user/product pair, i.e. the user did not rate that product. In that case predicted_rating is the rating
#the user would be expected to give.
u037_merged

Unnamed: 0,user_id,product_id,predicted_rating,pattern_x,color_x,category_name_x,occlusion_type_x,zoom_in_type_x,viewpoint_type_x,product_reviews,pattern_y,color_y,category_name_y,occlusion_type_y,zoom_in_type_y,viewpoint_type_y,standardized_rating
0,U037,P00524,1.000000,printed,Silver,vest_dress,medium,medium,frontal,5,printed,Silver,vest_dress,medium,medium,frontal,1.000
1,U037,P00522,1.000000,printed,Silver,vest_dress,medium,medium,side_back,5,printed,Silver,vest_dress,medium,medium,side_back,1.000
2,U037,P00160,1.000000,printed,Brown,vest_dress,medium,no,frontal,4,printed,Brown,vest_dress,medium,no,frontal,0.775
3,U037,P00171,1.000000,printed,Silver,vest_dress,heavy,medium,frontal,1,printed,Silver,vest_dress,heavy,medium,frontal,0.100
4,U037,P00173,1.000000,printed,Silver,vest_dress,heavy,no,frontal,3,printed,Silver,vest_dress,heavy,no,frontal,0.550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,U037,P00157,0.117687,plain,Charcoal,vest_dress,slight,heavy,no,,,,,,,,
246,U037,P00147,0.117687,plain,Charcoal,vest_dress,slight,no,side_back,,,,,,,,
247,U037,P00165,0.117687,plain,Charcoal,vest_dress,medium,no,side_back,,,,,,,,
248,U037,P00167,0.117687,plain,Charcoal,vest_dress,slight,no,frontal,,,,,,,,


Insight:
1. For user U037, the original rating for P00524 and P00522 is 5 (standardized: 1). As a result, every printed silver vest dress receives the maximum predicted rating (1.0, i.e. 5) — even P00171, which the user actually rated 1. This shows that category, pattern and color alone do not explain the ratings. To add more variance, we will add the occlusion, viewpoint and zoom-in variables.

Adding occlusion, viewpoint and zoom-in to the CBF model

In [91]:
#Step 4: Build product matrix on occlusion
#Keep only the product id and its occlusion label, ordered by product id
occlusion_product = cbf_df.loc[:, ['product_id', 'occlusion_type']].sort_values(by='product_id')

In [92]:
#Collapse repeated rows to one per product/occlusion pair.
#The frame holds only these two columns, so the defaults (all columns, keep='first') apply.
occlusion_product = occlusion_product.drop_duplicates()

In [93]:
#Occlusion labels are strings, so TF-IDF is used to turn them into numeric features
tfidf = TfidfVectorizer()
occlusion_labels = occlusion_product['occlusion_type'].tolist()
tfidf_fit = tfidf.fit_transform(occlusion_labels)

In [95]:
#Vocabulary learned by the vectorizer; used below as the column labels of the product matrix.
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
#get_feature_names_out() and fall back to the old method only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    cols = tfidf.get_feature_names_out().tolist()
else:
    cols = tfidf.get_feature_names()
cols

['heavy', 'medium', 'slight']

In [96]:
#Dense TF-IDF scores rounded to 3 decimals, shifted by 0.01 so that no entry is exactly zero
occlusion_scores = tfidf_fit.toarray().round(3) + 0.01
occlusion_prod_matrix = pd.DataFrame(occlusion_scores,
                                     index=list(occlusion_product['product_id']),
                                     columns=cols)


In [97]:
occlusion_prod_matrix

Unnamed: 0,heavy,medium,slight
P00003,0.01,0.01,1.01
P00004,0.01,0.01,1.01
P00005,0.01,0.01,1.01
P00006,1.01,0.01,0.01
P00007,0.01,0.01,1.01
...,...,...,...
P00712,0.01,0.01,1.01
P00713,0.01,0.01,1.01
P00714,0.01,0.01,1.01
P00715,0.01,0.01,1.01


In [101]:
#Step 4: Build product matrix on viewpoint
#Keep only the product id and its viewpoint label, ordered by product id
viewpoint_product = cbf_df.loc[:, ['product_id', 'viewpoint_type']].sort_values(by='product_id')

In [102]:
#Collapse repeated rows to one per product/viewpoint pair.
#The frame holds only these two columns, so the defaults (all columns, keep='first') apply.
viewpoint_product = viewpoint_product.drop_duplicates()

In [103]:
#Viewpoint labels are strings, so TF-IDF is used to turn them into numeric features
tfidf = TfidfVectorizer()
viewpoint_labels = viewpoint_product['viewpoint_type'].tolist()
tfidf_fit = tfidf.fit_transform(viewpoint_labels)

In [104]:
#Vocabulary learned by the vectorizer; used below as the column labels of the product matrix.
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
#get_feature_names_out() and fall back to the old method only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    cols = tfidf.get_feature_names_out().tolist()
else:
    cols = tfidf.get_feature_names()
cols

['frontal', 'no', 'side_back']

In [105]:
#Dense TF-IDF scores rounded to 3 decimals, shifted by 0.01 so that no entry is exactly zero
viewpoint_scores = tfidf_fit.toarray().round(3) + 0.01
viewpoint_prod_matrix = pd.DataFrame(viewpoint_scores,
                                     index=list(viewpoint_product['product_id']),
                                     columns=cols)


In [106]:
viewpoint_prod_matrix

Unnamed: 0,frontal,no,side_back
P00003,0.01,1.01,0.01
P00004,1.01,0.01,0.01
P00005,0.01,1.01,0.01
P00006,1.01,0.01,0.01
P00007,0.01,1.01,0.01
...,...,...,...
P00712,0.01,0.01,1.01
P00713,1.01,0.01,0.01
P00714,1.01,0.01,0.01
P00715,1.01,0.01,0.01


In [107]:
#Step 5: Build product matrix on zoom-in
#Keep only the product id and its zoom-in label, ordered by product id
zoom_product = cbf_df.loc[:, ['product_id', 'zoom_in_type']].sort_values(by='product_id')

In [108]:
#Collapse repeated rows to one per product/zoom-in pair.
#The frame holds only these two columns, so the defaults (all columns, keep='first') apply.
zoom_product = zoom_product.drop_duplicates()

In [109]:
#Zoom-in labels are strings, so TF-IDF is used to turn them into numeric features
tfidf = TfidfVectorizer()
zoom_labels = zoom_product['zoom_in_type'].tolist()
tfidf_fit = tfidf.fit_transform(zoom_labels)

In [110]:
#Vocabulary learned by the vectorizer; used below as the column labels of the product matrix.
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
#get_feature_names_out() and fall back to the old method only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    cols = tfidf.get_feature_names_out().tolist()
else:
    cols = tfidf.get_feature_names()
cols

['heavy', 'medium', 'no']

In [111]:
#Dense TF-IDF scores rounded to 3 decimals, shifted by 0.01 so that no entry is exactly zero
zoom_scores = tfidf_fit.toarray().round(3) + 0.01
zoom_prod_matrix = pd.DataFrame(zoom_scores,
                                index=list(zoom_product['product_id']),
                                columns=cols)


In [112]:
zoom_prod_matrix

Unnamed: 0,heavy,medium,no
P00003,0.01,1.01,0.01
P00004,0.01,1.01,0.01
P00005,0.01,1.01,0.01
P00006,0.01,1.01,0.01
P00007,0.01,0.01,1.01
...,...,...,...
P00712,0.01,1.01,0.01
P00713,0.01,0.01,1.01
P00714,0.01,0.01,1.01
P00715,0.01,0.01,1.01


In [113]:
#Pairwise cosine similarity between products, computed from their occlusion vectors
occlusion_ids = occlusion_product['product_id']
occlusion_similarity = cosine_similarity(occlusion_prod_matrix.values)
cosine_sim_matrix_occlusion = pd.DataFrame(occlusion_similarity, index=occlusion_ids, columns=occlusion_ids)

In [114]:
cosine_sim_matrix_occlusion

product_id,P00003,P00004,P00005,P00006,P00007,P00008,P00009,P00010,P00015,P00020,...,P00686,P00688,P00690,P00693,P00711,P00712,P00713,P00714,P00715,P00716
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P00003,1.000000,1.000000,1.000000,0.019896,1.000000,1.000000,0.019896,1.000000,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
P00004,1.000000,1.000000,1.000000,0.019896,1.000000,1.000000,0.019896,1.000000,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
P00005,1.000000,1.000000,1.000000,0.019896,1.000000,1.000000,0.019896,1.000000,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
P00006,0.019896,0.019896,0.019896,1.000000,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896,...,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896
P00007,1.000000,1.000000,1.000000,0.019896,1.000000,1.000000,0.019896,1.000000,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P00712,1.000000,1.000000,1.000000,0.019896,1.000000,1.000000,0.019896,1.000000,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
P00713,1.000000,1.000000,1.000000,0.019896,1.000000,1.000000,0.019896,1.000000,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
P00714,1.000000,1.000000,1.000000,0.019896,1.000000,1.000000,0.019896,1.000000,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
P00715,1.000000,1.000000,1.000000,0.019896,1.000000,1.000000,0.019896,1.000000,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000


In [115]:
#Pairwise cosine similarity between products, computed from their viewpoint vectors
viewpoint_ids = viewpoint_product['product_id']
viewpoint_similarity = cosine_similarity(viewpoint_prod_matrix.values)
cosine_sim_matrix_viewpoint = pd.DataFrame(viewpoint_similarity, index=viewpoint_ids, columns=viewpoint_ids)

In [120]:
cosine_sim_matrix_viewpoint

product_id,P00003,P00004,P00005,P00006,P00007,P00008,P00009,P00010,P00015,P00020,...,P00686,P00688,P00690,P00693,P00711,P00712,P00713,P00714,P00715,P00716
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P00003,1.000000,0.019896,1.000000,0.019896,1.000000,1.000000,0.019896,1.000000,0.019896,0.019896,...,0.019896,0.019896,0.019896,0.019896,1.000000,0.019896,0.019896,0.019896,0.019896,0.019896
P00004,0.019896,1.000000,0.019896,1.000000,0.019896,0.019896,1.000000,0.019896,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000
P00005,1.000000,0.019896,1.000000,0.019896,1.000000,1.000000,0.019896,1.000000,0.019896,0.019896,...,0.019896,0.019896,0.019896,0.019896,1.000000,0.019896,0.019896,0.019896,0.019896,0.019896
P00006,0.019896,1.000000,0.019896,1.000000,0.019896,0.019896,1.000000,0.019896,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000
P00007,1.000000,0.019896,1.000000,0.019896,1.000000,1.000000,0.019896,1.000000,0.019896,0.019896,...,0.019896,0.019896,0.019896,0.019896,1.000000,0.019896,0.019896,0.019896,0.019896,0.019896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P00712,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896,0.019896,1.000000,0.019896,...,0.019896,0.019896,0.019896,0.019896,0.019896,1.000000,0.019896,0.019896,0.019896,0.019896
P00713,0.019896,1.000000,0.019896,1.000000,0.019896,0.019896,1.000000,0.019896,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000
P00714,0.019896,1.000000,0.019896,1.000000,0.019896,0.019896,1.000000,0.019896,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000
P00715,0.019896,1.000000,0.019896,1.000000,0.019896,0.019896,1.000000,0.019896,0.019896,1.000000,...,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000


In [118]:
#Pairwise cosine similarity between products, computed from their zoom-in vectors
zoom_ids = zoom_product['product_id']
zoom_similarity = cosine_similarity(zoom_prod_matrix.values)
cosine_sim_matrix_zoom = pd.DataFrame(zoom_similarity, index=zoom_ids, columns=zoom_ids)

In [119]:
cosine_sim_matrix_zoom

product_id,P00003,P00004,P00005,P00006,P00007,P00008,P00009,P00010,P00015,P00020,...,P00686,P00688,P00690,P00693,P00711,P00712,P00713,P00714,P00715,P00716
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P00003,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000,...,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,0.019896,0.019896,0.019896,0.019896
P00004,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000,...,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,0.019896,0.019896,0.019896,0.019896
P00005,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000,...,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,0.019896,0.019896,0.019896,0.019896
P00006,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000,...,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,0.019896,0.019896,0.019896,0.019896
P00007,0.019896,0.019896,0.019896,0.019896,1.000000,1.000000,0.019896,0.019896,0.019896,0.019896,...,0.019896,1.000000,1.000000,0.019896,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P00712,1.000000,1.000000,1.000000,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000,...,1.000000,0.019896,0.019896,1.000000,1.000000,1.000000,0.019896,0.019896,0.019896,0.019896
P00713,0.019896,0.019896,0.019896,0.019896,1.000000,1.000000,0.019896,0.019896,0.019896,0.019896,...,0.019896,1.000000,1.000000,0.019896,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000
P00714,0.019896,0.019896,0.019896,0.019896,1.000000,1.000000,0.019896,0.019896,0.019896,0.019896,...,0.019896,1.000000,1.000000,0.019896,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000
P00715,0.019896,0.019896,0.019896,0.019896,1.000000,1.000000,0.019896,0.019896,0.019896,0.019896,...,0.019896,1.000000,1.000000,0.019896,0.019896,0.019896,1.000000,1.000000,1.000000,1.000000


In [121]:
#Combine the existing similarity with the occlusion, viewpoint and zoom-in similarities
#by element-wise multiplication
final_similarity_matrix_extraattr = (
    final_similarity_matrix
    .multiply(cosine_sim_matrix_occlusion)
    .multiply(cosine_sim_matrix_viewpoint)
    .multiply(cosine_sim_matrix_zoom)
)



In [123]:
#Predicting ratings based on all attributes:
#matrix product of the user x product matrix with the combined product similarity matrix
weighted_ratings_extraattr = user_prod_matrix @ final_similarity_matrix_extraattr

In [124]:
#Standardizing the weighted ratings into the 0.1-1 range.
#MinMaxScaler rescales each column independently, so every product column is scaled across users.
#np.asarray (instead of .values) keeps this cell re-runnable: after the first run
#weighted_ratings_extraattr is already a plain ndarray, which has no .values attribute.
min_max = MinMaxScaler(feature_range=(0.1, 1))
weighted_ratings_extraattr = min_max.fit_transform(np.asarray(weighted_ratings_extraattr))

In [125]:
weighted_ratings_extraattr

array([[0.10598532, 0.11278491, 0.11768249, ..., 0.60153473, 0.54611522,
        0.63227042],
       [0.32773952, 0.10067535, 0.10700397, ..., 0.1002317 , 0.10606993,
        0.10162541],
       [0.10101426, 0.10090828, 0.100376  , ..., 0.10802717, 0.33383121,
        0.12070768],
       ...,
       [0.32842861, 0.47615425, 0.13121766, ..., 0.43031547, 0.48075194,
        0.1438119 ],
       [0.10036156, 0.10029068, 0.10001667, ..., 0.10017151, 0.10606442,
        0.10089156],
       [0.76023603, 0.81481572, 0.15824091, ..., 0.51190365, 0.41523173,
        0.12052946]])

In [126]:
#Convert this array back to matrix form.
#The array comes from user_prod_matrix.dot(final_similarity_matrix_extraattr), so its rows follow
#user_prod_matrix.index and its columns follow final_similarity_matrix_extraattr.columns.
#Taking the labels from those operands avoids assuming that the sorted ids in cbf_df are in the same order.
#list() drops any index names so that stack()/reset_index() still yields 'level_0'/'level_1'.
weighted_ratings_extraattrdf = pd.DataFrame(weighted_ratings_extraattr,
                                            index=list(user_prod_matrix.index),
                                            columns=list(final_similarity_matrix_extraattr.columns))

In [127]:
#Reshape the wide user x product table to long format (one row per user/product pair);
#reset_index() turns the two stacked index levels into ordinary columns
weighted_ratings_extraattrdf = (
    weighted_ratings_extraattrdf
    .stack()
    .reset_index()
)

In [128]:
#Give the columns produced by stack()/reset_index() meaningful names
long_format_names = {'level_0': 'user_id', 'level_1': 'product_id', 0: 'predicted_rating'}
weighted_ratings_extraattrdf = weighted_ratings_extraattrdf.rename(columns=long_format_names)

In [129]:
weighted_ratings_extraattrdf

Unnamed: 0,user_id,product_id,predicted_rating
0,U001,P00003,0.105985
1,U001,P00004,0.112785
2,U001,P00005,0.117682
3,U001,P00006,0.115225
4,U001,P00007,0.817410
...,...,...,...
24995,U100,P00712,0.115742
24996,U100,P00713,0.511904
24997,U100,P00714,0.511904
24998,U100,P00715,0.415232


In [130]:
#Attach the product attributes to every (user, product) prediction; the left join keeps all predictions
prod_attr_1 = pd.merge(weighted_ratings_extraattrdf, product_prop, how='left', on='product_id')

In [131]:
#Order the predictions from highest to lowest predicted rating
prod_attr_1 = prod_attr_1.sort_values(by='predicted_rating', ascending=False)

In [132]:
prod_attr_1

Unnamed: 0,user_id,product_id,predicted_rating,pattern,color,category_name,occlusion_type,zoom_in_type,viewpoint_type
11006,U045,P00009,1.0,plain,Charcoal,long_sleeve_top,medium,medium,frontal
2824,U012,P00227,1.0,plain,Gray,long_sleeve_dress,medium,medium,frontal
9788,U040,P00152,1.0,plain,Gray,vest_dress,slight,medium,no
15956,U064,P00512,1.0,printed,Gray,vest_dress,slight,medium,frontal
18837,U076,P00250,1.0,printed,Gray,long_sleeve_top,slight,medium,side_back
...,...,...,...,...,...,...,...,...,...
16483,U066,P00607,0.1,plain,Charcoal,shorts,slight,medium,no
18692,U075,P00475,0.1,plain,Gray,trousers,slight,medium,frontal
18691,U075,P00473,0.1,plain,Tan,trousers,medium,no,frontal
18690,U075,P00467,0.1,plain,Charcoal,trousers,slight,no,frontal


In [133]:
#Predictions (all attributes) for user U037 only
user_037_pred_1 = prod_attr_1.loc[prod_attr_1['user_id'].eq("U037")]

In [134]:
#Rows of cbf_df that belong to user U037
user_037_act_1 = cbf_df.loc[cbf_df['user_id'].eq("U037")]

In [135]:
#Left join keeps every prediction; the cbf_df columns are NaN where no matching row exists
u037_merged_1 = pd.merge(user_037_pred_1, user_037_act_1, how='left', on=['user_id', 'product_id'])

In [139]:
#The predicted_rating is standardized (between 0.1 and 1, the MinMaxScaler feature_range used above).
#standardized_rating is the standardized version of the product_reviews column.
#Note that if the product_reviews column is NaN, the user did not provide a rating for that product;
#predicted_rating then gives the rating the user would be expected to provide.
u037_merged_1

Unnamed: 0,user_id,product_id,predicted_rating,pattern_x,color_x,category_name_x,occlusion_type_x,zoom_in_type_x,viewpoint_type_x,product_reviews,pattern_y,color_y,category_name_y,occlusion_type_y,zoom_in_type_y,viewpoint_type_y,standardized_rating
0,U037,P00524,1.000000,printed,Silver,vest_dress,medium,medium,frontal,5,printed,Silver,vest_dress,medium,medium,frontal,1.000
1,U037,P00522,1.000000,printed,Silver,vest_dress,medium,medium,side_back,5,printed,Silver,vest_dress,medium,medium,side_back,1.000
2,U037,P00160,0.996370,printed,Brown,vest_dress,medium,no,frontal,4,printed,Brown,vest_dress,medium,no,frontal,0.775
3,U037,P00097,0.987964,plain,Gray,long_sleeve_outwear,medium,medium,frontal,5,plain,Gray,long_sleeve_outwear,medium,medium,frontal,1.000
4,U037,P00099,0.987964,plain,Gray,long_sleeve_outwear,medium,medium,frontal,4,plain,Gray,long_sleeve_outwear,medium,medium,frontal,0.775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,U037,P00307,0.100221,plain,Silver,short_sleeve_top,slight,heavy,no,,,,,,,,
246,U037,P00301,0.100221,plain,Silver,short_sleeve_top,slight,heavy,no,,,,,,,,
247,U037,P00302,0.100221,plain,Silver,short_sleeve_top,slight,heavy,no,,,,,,,,
248,U037,P00598,0.100152,printed,Gray,long_sleeve_top,heavy,no,side_back,,,,,,,,


In [152]:
#Top 5 products that U037 has not rated yet, i.e. the recommendation candidates.
#u037_merged_1 is already ordered by predicted_rating (descending), so head(5) picks the best ones.
not_yet_rated = u037_merged_1['product_reviews'].isna()
u037_merged_1[not_yet_rated].head(5)

Unnamed: 0,user_id,product_id,predicted_rating,pattern_x,color_x,category_name_x,occlusion_type_x,zoom_in_type_x,viewpoint_type_x,product_reviews,pattern_y,color_y,category_name_y,occlusion_type_y,zoom_in_type_y,viewpoint_type_y,standardized_rating
12,U037,P00197,0.739081,printed,Gray,short_sleeve_top,slight,heavy,no,,,,,,,,
19,U037,P00425,0.571904,plain,Silver,short_sleeve_dress,medium,no,frontal,,,,,,,,
22,U037,P00257,0.501385,plain,Silver,long_sleeve_dress,medium,no,frontal,,,,,,,,
23,U037,P00276,0.501385,plain,Silver,long_sleeve_dress,medium,no,frontal,,,,,,,,
24,U037,P00273,0.501385,plain,Silver,long_sleeve_dress,medium,no,frontal,,,,,,,,
