# Load Product Data

In [1]:
import pandas as pd
import numpy as np

product_info = pd.read_csv("product_info.csv")
product_info.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8494 entries, 0 to 8493
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          8494 non-null   object 
 1   product_name        8494 non-null   object 
 2   brand_id            8494 non-null   int64  
 3   brand_name          8494 non-null   object 
 4   loves_count         8494 non-null   int64  
 5   rating              8216 non-null   float64
 6   reviews             8216 non-null   float64
 7   size                6863 non-null   object 
 8   variation_type      7050 non-null   object 
 9   variation_value     6896 non-null   object 
 10  variation_desc      1250 non-null   object 
 11  ingredients         7549 non-null   object 
 12  price_usd           8494 non-null   float64
 13  value_price_usd     451 non-null    float64
 14  sale_price_usd      270 non-null    float64
 15  limited_edition     8494 non-null   int64  
 16  new   

In [2]:
num_missing = product_info.isna().sum()
num_missing

Unnamed: 0,0
product_id,0
product_name,0
brand_id,0
brand_name,0
loves_count,0
rating,278
reviews,278
size,1631
variation_type,1444
variation_value,1598


# Content-based (skincare + makeup only, remove products with no ingredients)

In [3]:
products = pd.DataFrame(product_info, columns=['product_id','product_name','ingredients','highlights', 'primary_category'])
products.head(5)
products.shape

(8494, 5)

In [4]:
product_info.shape

(8494, 27)

In [5]:
products = products[products['primary_category'].isin(['Skincare', 'Makeup'])]

In [6]:
products = products.dropna(subset='ingredients')
products.shape

(4319, 5)

In [7]:
products = products.reset_index(drop=True)

indices = pd.Series(products.index, index=products['product_name'])

products

Unnamed: 0,product_id,product_name,ingredients,highlights,primary_category
0,P398965,Rose Lip Conditioner,['Polybutene; Hydrogenated Polyisobutene; Dext...,"['Hydrating', 'Good for: Dryness']",Makeup
1,P439055,GENIUS Sleeping Collagen Moisturizer,"['Collagen (Vegan)*, Water (Aqua, Eau), Ethylh...","['Vegan', 'Good for: Loss of firmness', 'Colla...",Skincare
2,P421277,GENIUS Liquid Collagen Serum,"['Collagen (Vegan)*, Water (Aqua, Eau), Propan...","['Vegan', 'Good for: Loss of firmness', 'Colla...",Skincare
3,P467602,Triple Algae Eye Renewal Balm Eye Cream,"['Aqua (Water/Eau), Stearic Acid, Isopropyl Is...",,Skincare
4,P432045,GENIUS Liquid Collagen Lip Treatment,"['Collagen (Vegan)*, Water (Aqua, Eau), Glycer...","['Vegan', 'Good for: Loss of firmness', 'Plump...",Skincare
...,...,...,...,...,...
4314,P474604,NU TONE CORRECTOR Color Corrector with Vitamin E,"['Aqua/Water/Eau, Glycerin, Alcohol Denat., Bu...","['Natural Finish', 'Without Mineral Oil', 'Wit...",Makeup
4315,P483501,Mini Rouge Pur Couture Exclusive Set,['Pentaerythrityl Isostearate/Caprate/Caprylat...,"['Hydrating', 'Satin Finish']",Makeup
4316,P467660,Mascara Volume Effet Faux Cils Radical,"['Aqua/Water, Paraffin, Stearic Acid, Copernic...",,Makeup
4317,P467659,Couture Clutch Eyeshadow Palette,"['Talc, Synthetic Fluorphlogopite, Triethylhex...",,Makeup


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

texts = products.ingredients.values

tfidf_matrix = vectorizer.fit_transform(texts)

tfidf_matrix.shape

(4319, 7102)

In [9]:
from sklearn.metrics.pairwise import linear_kernel

cosine_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
def get_recommendations(product_name, cosine_sim=cosine_similarity):
    idx = indices[product_name]

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    top_5_sim_scores = sim_scores[1:6]

    top_5_products = [(products['product_name'].iloc[i[0]], i[1]) for i in top_5_sim_scores]

    top_10_sim_scores = sim_scores[:10]

    return top_5_products, top_10_sim_scores


In [12]:
get_recommendations('GENIUS Sleeping Collagen Moisturizer', cosine_similarity)

([('GENIUS Liquid Collagen Serum', 0.46843551368353464),
  ('Mini GENIUS Liquid Collagen', 0.46843551368353464),
  ('10 Day Results Kit', 0.4677626217122434),
  ('GENIUS Liquid Collagen Lip Treatment', 0.43205279733916885),
  ('Plump Pout Duo', 0.43205279733916885)],
 [(1, 0.9999999999999992),
  (2, 0.46843551368353464),
  (26, 0.46843551368353464),
  (9, 0.4677626217122434),
  (4, 0.43205279733916885),
  (29, 0.43205279733916885),
  (17, 0.4044541147983461),
  (16, 0.39100973929731325),
  (6, 0.3561567942588289),
  (23, 0.342073799969327)])

# Collaborative Filtering (SVD) | Experiment 3 (SVD+IR) & Experiment 4 (CS+SVD+IR)

# Load Data (cuma yang data review karena di atas udah load product info)

In [13]:
reviews = pd.read_csv("reviews_0-250.csv")
reviews.info()

  reviews = pd.read_csv("reviews_0-250.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602130 entries, 0 to 602129
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                602130 non-null  int64  
 1   author_id                 602130 non-null  object 
 2   rating                    602130 non-null  int64  
 3   is_recommended            484644 non-null  float64
 4   helpfulness               270298 non-null  float64
 5   total_feedback_count      602130 non-null  int64  
 6   total_neg_feedback_count  602130 non-null  int64  
 7   total_pos_feedback_count  602130 non-null  int64  
 8   submission_time           602130 non-null  object 
 9   review_text               601131 non-null  object 
 10  review_title              435119 non-null  object 
 11  skin_tone                 496074 non-null  object 
 12  eye_color                 463642 non-null  object 
 13  skin_type                 527447 non-null  o

# Data Preprocessing

In [14]:
columns_to_drop = ['Unnamed: 0', 'total_feedback_count', 'total_neg_feedback_count', 'total_pos_feedback_count', 'submission_time', 'review_title', 'is_recommended', 'product_name', 'brand_name', 'price_usd']
reviews.drop(columns=columns_to_drop, inplace=True)

In [15]:
new_data = {
    # 'author_id': ['Sukma', 'Cath', 'Keren', 'Adel', 'Adel', 'Adel', 'Caroline', 'Caroline', 'Caroline', 'Caroline', 'Grisella', 'Grisella', 'Tasia', 'Tasia', 'Tasia', 'Neisya', 'Neisya', 'Neisya', 'Matthew', 'Ariya', 'Ariya'],
    'author_id': ['1', '2', '3', '4', '4', '4', '5', '5', '5', '5', '6', '6', '7', '7', '7', '8', '8', '8', '9', '10', '10'],
    'rating': [5, 4, 4, 4, 4, 4, 5, 5, 5, 5, 4, 4, 5, 5, 2, 5, 3, 4, 2, 4, 3],
    'helpfulness': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0.20],
    'review_text': ['I really love it because it makes my lips very moist when I wake up, I use it before going to bed', 'Good for oily skin like me', 'Good, makes my skin smoother',
                    'The product is good for repairing dry and split hair. However, there are many products with more affordable prices that can give similar effects to your hair',
                    'The product is good, especially for dry lips', 'Good for dry skin. The product give quite good brightening effect',
                    'I absolutely love the Lip Sleeping Mask Intense Hydration with Vitamin C! A game-changer for my dry lips',
                    'The Lip Glowy Balm has become my go-to for daily lip care. Lightweight, non-sticky, and provides a beautiful natural shine',
                    'The Glowy Makeup Serum is amazing! It gives my skin a radiant, dewy finish that lasts all day. It works perfectly as a base under my makeup',
                    'The Cream Skin Toner & Moisturizer is fantastic for simplifying my skincare routine. It provides the hydration of a cream with the lightness of a toner',
                    'The lipstick doesnt dry out the lips, the color is stain-resistant, and it lasts a long time',
                    'This product helps me to keep my lips moisturized and not dry',
                    'Use it at night and see the results in the morning, your lips feel soft and plumpy',
                    'My lips become moist, not dry, and not chapped. Gives a glowy impression to my lips',
                    '2 days of use, my face became red and blotchy, my face felt itchy',
                    'I have used this products for like a year. i did see a lot of effects  to my skin. my acnes got so much better like literally improved a lot',
                    'I have used this products only for 1 week. however i didnt see a lot of effect but its did  moisturize my skin',
                    'I have used this product for 6 moths. i see a lot of improvement of my skin like as prevented my skin-aging, smoothed my skin texture, even a bit brighten my skin',
                    'I purchased this product in hope this will clean the sebaceous filaments in my nose. However, the product doesnt give effect as I expected. Rather than cleanse my nose, it only make my face pinkish and soften my face a little',
                    'It is an affordable, potent serum containing niacinamide and zinc (if you’re specifically looking for those ingredients). Some reviews on Sephora have mentioned this has worked wonders for their acne',
                    'The glycolic acid is too harsh for my skin'],
    'skin_tone': ['medium', 'fair', 'medium', 'fair', 'fair', 'fair', 'fair', 'fair', 'fair', 'fair', 'mediumLight', 'mediumLight', 'light', 'light', 'light', 'light', 'light', 'light', 'mediumLight', 'light', 'light'],
    'eye_color': ['black', 'black', 'black', 'brown', 'brown', 'brown', 'black', 'black', 'black', 'black', 'black', 'black', 'brown', 'brown', 'brown', 'black', 'black', 'black', 'black', 'black', 'black'],
    'skin_type': ['dry', 'oily', 'dry', 'dry', 'dry', 'dry', 'normal', 'normal', 'normal', 'normal', 'normal', 'normal', 'oily', 'oily', 'oily', 'combination', 'combination', 'combination', 'normal', 'oily', 'oily'],
    'hair_color': ['black', 'black', 'black', 'brown', 'brown', 'brown', 'black', 'black', 'black', 'black', 'black', 'black', 'black', 'black', 'black', 'brown', 'brown', 'brown', 'black', 'black', 'black'],
    'product_id': ['P420652', 'P468658', 'P468658', 'P461483', 'P467449', 'P427417', 'P420652', 'P443563', 'P457508', 'P446930', 'P476549', 'P420652', 'P503868', 'P417242', 'P458755', 'P427417', 'P468410', 'P447773', 'P469502', 'P442563', 'P427406'],
}

new_df = pd.DataFrame(new_data)

In [16]:
# from google.colab import files
# files.download('reviews_new.csv')

In [17]:
product_info = product_info[product_info['primary_category'].isin(['Skincare', 'Makeup'])]

product_info = product_info.dropna(subset=['reviews','rating'])

# product_stats = reviews.groupby(['product_id'])['product_id'].count().reset_index(name='counts')

# product_stats.sort_values('counts', ascending=False)

# cutoff = product_stats['counts'].quantile(0.1)

# filtered_products = product_stats.loc[product_stats['counts'] > cutoff]

# products_set = filtered_products['product_id'].squeeze()

# products_subset = product_info.loc[product_info['product_id'].isin(products_set)]

In [18]:
product_info.shape
product_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4679 entries, 65 to 8489
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          4679 non-null   object 
 1   product_name        4679 non-null   object 
 2   brand_id            4679 non-null   int64  
 3   brand_name          4679 non-null   object 
 4   loves_count         4679 non-null   int64  
 5   rating              4679 non-null   float64
 6   reviews             4679 non-null   float64
 7   size                3669 non-null   object 
 8   variation_type      3844 non-null   object 
 9   variation_value     3749 non-null   object 
 10  variation_desc      1189 non-null   object 
 11  ingredients         4223 non-null   object 
 12  price_usd           4679 non-null   float64
 13  value_price_usd     229 non-null    float64
 14  sale_price_usd      184 non-null    float64
 15  limited_edition     4679 non-null   int64  
 16  new       

In [19]:
columns_to_drop = ['variation_type', 'variation_value', 'variation_desc', 'price_usd', 'value_price_usd', 'sale_price_usd', 'limited_edition',
                   'new', 'online_only', 'out_of_stock', 'sephora_exclusive', 'child_count', 'child_max_price', 'child_min_price', 'secondary_category', 'tertiary_category']
product_info.drop(columns=columns_to_drop, inplace=True)

In [20]:
author_stats = reviews.groupby(['author_id'])['author_id'].count().reset_index(name='counts')

author_stats.sort_values('counts', ascending=False)

cutoff= author_stats['counts'].quantile(0.99)

filtered_authors = author_stats.loc[author_stats['counts'] > cutoff]

authors = filtered_authors['author_id'].squeeze()

reviews_subset = reviews.loc[reviews['author_id'].isin(authors)]

In [21]:
reviews_new2 = pd.concat([reviews_subset, new_df], axis=0)
# reviews_new2.to_csv('reviews_new2.csv', index=False)

In [22]:
reviews_new2.shape

(27935, 9)

In [23]:
reviews_new2.shape

(27935, 9)

In [24]:
reviews_new2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27935 entries, 194 to 20
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   author_id    27935 non-null  object 
 1   rating       27935 non-null  int64  
 2   helpfulness  12964 non-null  float64
 3   review_text  27908 non-null  object 
 4   skin_tone    25068 non-null  object 
 5   eye_color    24759 non-null  object 
 6   skin_type    26215 non-null  object 
 7   hair_color   24338 non-null  object 
 8   product_id   27935 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 2.1+ MB


In [25]:
reviews_new2 = reviews_new2.dropna(subset=['skin_tone', 'eye_color', 'skin_type', 'hair_color'])
reviews_new2['user_characteristics'] = reviews_new2.apply(
    lambda row: [
        f'{row["skin_tone"].capitalize()} Skintone',
        f'{row["eye_color"].capitalize()} Eyes',
        f'{row["skin_type"].capitalize()} Skintype',
        f'{row["hair_color"].capitalize()} Hair'
    ], axis=1
)

In [26]:
reviews_new2.shape
reviews_new2

Unnamed: 0,author_id,rating,helpfulness,review_text,skin_tone,eye_color,skin_type,hair_color,product_id,user_characteristics
194,35439265952,4,1.0,Works for me. I use a thick layer of this last...,medium,brown,combination,black,P420652,"[Medium Skintone, Brown Eyes, Combination Skin..."
597,5069912549,3,0.6,Not sure how I feel about the gummy-bear scent...,fair,blue,dry,blonde,P420652,"[Fair Skintone, Blue Eyes, Dry Skintype, Blond..."
598,5069912549,5,0.8,I still can’t get over how well they knock the...,fair,blue,dry,blonde,P420652,"[Fair Skintone, Blue Eyes, Dry Skintype, Blond..."
599,5069912549,5,1.0,"Speaking strictly on scent here, the Mango is ...",fair,blue,dry,blonde,P420652,"[Fair Skintone, Blue Eyes, Dry Skintype, Blond..."
620,5069912549,5,1.0,"The Laneige lip sleeping mask, put simply, can...",fair,blue,dry,blonde,P420652,"[Fair Skintone, Blue Eyes, Dry Skintype, Blond..."
...,...,...,...,...,...,...,...,...,...,...
16,8,3,1.0,I have used this products only for 1 week. how...,light,black,combination,brown,P468410,"[Light Skintone, Black Eyes, Combination Skint..."
17,8,4,1.0,I have used this product for 6 moths. i see a ...,light,black,combination,brown,P447773,"[Light Skintone, Black Eyes, Combination Skint..."
18,9,2,0.0,I purchased this product in hope this will cle...,mediumLight,black,normal,black,P469502,"[Mediumlight Skintone, Black Eyes, Normal Skin..."
19,10,4,1.0,"It is an affordable, potent serum containing n...",light,black,oily,black,P442563,"[Light Skintone, Black Eyes, Oily Skintype, Bl..."


# Combining 2 data

In [27]:
result = product_info.merge(reviews_new2,  how='inner', on='product_id')

In [28]:
result.shape

(23616, 20)

In [29]:
result = result.reset_index(drop=True)

# Calculate Cosine Similarity between users

In [30]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

result['user_characteristics_str'] = result['user_characteristics'].apply(lambda x: ', '.join(x))

texts = result.user_characteristics_str.values

vectorizer = TfidfVectorizer(max_features=100)

tfidf_matrix = vectorizer.fit_transform(texts)

user_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

user_sim_matrix_df = pd.DataFrame(user_sim_matrix, index=result['author_id'], columns=result['author_id'])

# Calculate Implicit Rating

In [31]:
from textblob import TextBlob

def get_sentiment_score(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity  # Returns a score between -1 and 1

result['review_text'] = result['review_text'].astype(str).fillna('')

result['sentiment_score'] = result['review_text'].apply(get_sentiment_score)

In [32]:
import numpy as np

def sentiment_to_rating(sentiment_score):
    return np.interp(sentiment_score, [-1, 1], [1, 5])

result['implicit_rating'] = result['sentiment_score'].apply(sentiment_to_rating)

In [33]:
result.info()
result

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23616 entries, 0 to 23615
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   product_id                23616 non-null  object 
 1   product_name              23616 non-null  object 
 2   brand_id                  23616 non-null  int64  
 3   brand_name                23616 non-null  object 
 4   loves_count               23616 non-null  int64  
 5   rating_x                  23616 non-null  float64
 6   reviews                   23616 non-null  float64
 7   size                      23247 non-null  object 
 8   ingredients               23114 non-null  object 
 9   highlights                22431 non-null  object 
 10  primary_category          23616 non-null  object 
 11  author_id                 23616 non-null  object 
 12  rating_y                  23616 non-null  int64  
 13  helpfulness               11102 non-null  float64
 14  review

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating_x,reviews,size,ingredients,highlights,...,helpfulness,review_text,skin_tone,eye_color,skin_type,hair_color,user_characteristics,user_characteristics_str,sentiment_score,implicit_rating
0,P439055,GENIUS Sleeping Collagen Moisturizer,6018,Algenist,33910,4.5413,1321.0,2 oz/ 60 mL,"['Collagen (Vegan)*, Water (Aqua, Eau), Ethylh...","['Vegan', 'Good for: Loss of firmness', 'Colla...",...,1.000000,My skin was glowy and plump after using this! ...,medium,blue,combination,brown,"[Medium Skintone, Blue Eyes, Combination Skint...","Medium Skintone, Blue Eyes, Combination Skinty...",0.625000,4.250000
1,P439055,GENIUS Sleeping Collagen Moisturizer,6018,Algenist,33910,4.5413,1321.0,2 oz/ 60 mL,"['Collagen (Vegan)*, Water (Aqua, Eau), Ethylh...","['Vegan', 'Good for: Loss of firmness', 'Colla...",...,0.875000,I absolutely loved this product. It is a very ...,mediumTan,brown,combination,brown,"[Mediumtan Skintone, Brown Eyes, Combination S...","Mediumtan Skintone, Brown Eyes, Combination Sk...",0.210185,3.420370
2,P439055,GENIUS Sleeping Collagen Moisturizer,6018,Algenist,33910,4.5413,1321.0,2 oz/ 60 mL,"['Collagen (Vegan)*, Water (Aqua, Eau), Ethylh...","['Vegan', 'Good for: Loss of firmness', 'Colla...",...,0.555556,The texture of this product is awful! It never...,lightMedium,brown,combination,brown,"[Lightmedium Skintone, Brown Eyes, Combination...","Lightmedium Skintone, Brown Eyes, Combination ...",-0.166667,2.666667
3,P439055,GENIUS Sleeping Collagen Moisturizer,6018,Algenist,33910,4.5413,1321.0,2 oz/ 60 mL,"['Collagen (Vegan)*, Water (Aqua, Eau), Ethylh...","['Vegan', 'Good for: Loss of firmness', 'Colla...",...,1.000000,I started using this in hopes of getting rid o...,medium,brown,combination,black,"[Medium Skintone, Brown Eyes, Combination Skin...","Medium Skintone, Brown Eyes, Combination Skint...",0.267500,3.535000
4,P439055,GENIUS Sleeping Collagen Moisturizer,6018,Algenist,33910,4.5413,1321.0,2 oz/ 60 mL,"['Collagen (Vegan)*, Water (Aqua, Eau), Ethylh...","['Vegan', 'Good for: Loss of firmness', 'Colla...",...,0.818182,First off I’m a product junkie and have a 12+ ...,medium,brown,combination,blonde,"[Medium Skintone, Brown Eyes, Combination Skin...","Medium Skintone, Brown Eyes, Combination Skint...",0.081500,3.163000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23611,P411388,Superfood Air-Whip Lightweight Moisturizer wit...,6169,Youth To The People,133769,4.0491,1670.0,2 oz / 59 mL,"['Water/Aqua/Eau, Helianthus Annuus (Sunflower...","['Hyaluronic Acid', 'Good for: Pores', 'Plumpi...",...,,I no longer use this but do love it. When I us...,fair,green,combination,brown,"[Fair Skintone, Green Eyes, Combination Skinty...","Fair Skintone, Green Eyes, Combination Skintyp...",0.208194,3.416389
23612,P411388,Superfood Air-Whip Lightweight Moisturizer wit...,6169,Youth To The People,133769,4.0491,1670.0,2 oz / 59 mL,"['Water/Aqua/Eau, Helianthus Annuus (Sunflower...","['Hyaluronic Acid', 'Good for: Pores', 'Plumpi...",...,,"I wanted to love this, and thought this was go...",deep,brown,dry,black,"[Deep Skintone, Brown Eyes, Dry Skintype, Blac...","Deep Skintone, Brown Eyes, Dry Skintype, Black...",0.145076,3.290152
23613,P411388,Superfood Air-Whip Lightweight Moisturizer wit...,6169,Youth To The People,133769,4.0491,1670.0,2 oz / 59 mL,"['Water/Aqua/Eau, Helianthus Annuus (Sunflower...","['Hyaluronic Acid', 'Good for: Pores', 'Plumpi...",...,,I absolutely love this product. I got it in a ...,fair,blue,oily,brown,"[Fair Skintone, Blue Eyes, Oily Skintype, Brow...","Fair Skintone, Blue Eyes, Oily Skintype, Brown...",0.328571,3.657143
23614,P411388,Superfood Air-Whip Lightweight Moisturizer wit...,6169,Youth To The People,133769,4.0491,1670.0,2 oz / 59 mL,"['Water/Aqua/Eau, Helianthus Annuus (Sunflower...","['Hyaluronic Acid', 'Good for: Pores', 'Plumpi...",...,,I’ve been using this moisturizer for a few wee...,fair,brown,dry,brown,"[Fair Skintone, Brown Eyes, Dry Skintype, Brow...","Fair Skintone, Brown Eyes, Dry Skintype, Brown...",0.223810,3.447619


# SVD (with IR)

In [34]:
!pip install surprise



In [35]:
# from surprise import Reader, Dataset, SVD, accuracy
# from surprise.model_selection import train_test_split

# reader = Reader(rating_scale=(1, 5))
# data = Dataset.load_from_df(result[['author_id', 'product_id', 'implicit_rating']], reader)

# trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# algo = SVD()

# algo.fit(trainset)

In [36]:
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(result[['author_id', 'product_id', 'implicit_rating']], reader)

trainset = data.build_full_trainset()

algo = SVD()

algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7e2cf51716f0>

In [37]:
testset = trainset.build_anti_testset()

predictions = algo.test(testset)

In [38]:
# Accuracy normal SVD + IR
mse = accuracy.mse(predictions)
rmse = accuracy.rmse(predictions)

MSE: 0.0229
RMSE: 0.1513


In [39]:
pred = pd.DataFrame(predictions)

In [40]:
print(f"Total Users in Similarity Matrix: {user_sim_matrix.shape[0]}")
print(f"Total Users in Predictions: {len(pred['uid'])}")

print(result.isnull().sum())

Total Users in Similarity Matrix: 23616
Total Users in Predictions: 601195
product_id                      0
product_name                    0
brand_id                        0
brand_name                      0
loves_count                     0
rating_x                        0
reviews                         0
size                          369
ingredients                   502
highlights                   1185
primary_category                0
author_id                       0
rating_y                        0
helpfulness                 12514
review_text                     0
skin_tone                       0
eye_color                       0
skin_type                       0
hair_color                      0
user_characteristics            0
user_characteristics_str        0
sentiment_score                 0
implicit_rating                 0
dtype: int64


In [41]:
def get_recommendations(user_id, pred):
    pred_subset = pred.loc[pred['uid'] == user_id]

    pred_subset.sort_values(by=['est'],ascending = False)

    product_list = pred_subset.head(5)['iid'].to_list()

    recommendations = product_info.loc[product_info['product_id'].isin(product_list)]

    return recommendations['product_name']


In [42]:
get_recommendations('2660223134', pred)

Unnamed: 0,product_name
90,GENIUS Liquid Collagen Serum
91,Triple Algae Eye Renewal Balm Eye Cream
434,Skinlongevity Long Life Herb Anti-Aging Face S...
511,The True Cream Aqua Bomb
512,The True Cream Moisturizing Bomb


In [43]:
get_recommendations('7', pred)

Unnamed: 0,product_name
89,GENIUS Sleeping Collagen Moisturizer
90,GENIUS Liquid Collagen Serum
91,Triple Algae Eye Renewal Balm Eye Cream
434,Skinlongevity Long Life Herb Anti-Aging Face S...
511,The True Cream Aqua Bomb


# SVD with IR (optimizes with Cosine Similarity)

In [45]:
import pandas as pd

if isinstance(user_sim_matrix_df, np.ndarray):
    user_sim_matrix_df = pd.DataFrame(user_sim_matrix_df, index=result['author_id'], columns=result['author_id'])


In [46]:
def get_recommendations_SVD(user_id, pred, user_sim_matrix_df):
    if user_id not in user_sim_matrix_df.index:
        return "User not found"

    user_similarities = user_sim_matrix_df.loc[user_id]

    if isinstance(user_similarities, pd.DataFrame):
        user_similarities = user_similarities.iloc[0]

    user_similarities_df = user_similarities.to_frame().reset_index()

    user_similarities_df.columns = ['Column', 'Similarity']

    user_similarities_sorted_df = user_similarities_df.sort_values(by='Similarity', ascending=False)

    top_ten_similarities = user_similarities_sorted_df.head(10)

    similar_users = top_ten_similarities['Column'].tolist()

    # return similar_users

    all_predictions = []

    for similar_user_id in similar_users:
        similar_user_predictions = pred[pred['uid'] == similar_user_id]
        all_predictions.extend(similar_user_predictions.itertuples(index=False))

    # print(all_predictions)

    all_predictions_df = pd.DataFrame(all_predictions, columns=['uid', 'iid', 'r_ui', 'est', 'details'])

    product_list = all_predictions_df['iid'].value_counts().index[:5].tolist()

    recommendations = product_info[product_info['product_id'].isin(product_list)]

    return recommendations[['product_id', 'product_name']]


In [47]:
tes = get_recommendations_SVD(user_id="2660223134", pred=pred, user_sim_matrix_df=user_sim_matrix_df)
print(tes)

     product_id                                       product_name
4214    P456412               Greek Yoghurt Foaming Cream Cleanser
4215    P480612                Santorini Grape Poreless Skin Cream
4216    P456207       Greek Yoghurt Nourishing Probiotic Gel-Cream
4355     P54509  Tonique Confort Re-Hydrating Comforting Toner ...
4356    P453825  Hydra Zen Glow Liquid Lightweight Moisturizer ...


In [48]:
tes = get_recommendations_SVD(user_id="10", pred=pred, user_sim_matrix_df=user_sim_matrix_df)
print(tes)

     product_id                            product_name
89      P439055    GENIUS Sleeping Collagen Moisturizer
5586    P416816       Balancing Force Oil Control Toner
5588    P473322       Dewtopia 20% Acid Night Treatment
5589    P416815  Find Your Balance Oil Control Cleanser
5590    P466142                Lemonade Smoothing Scrub


# Evalution Metrics (RMSE n MSE)

In [49]:
from surprise import accuracy

def get_recommendations_SVD(user_id, pred, user_sim_matrix_df):
    if user_id not in user_sim_matrix_df.index:
        return "User not found"

    user_similarities = user_sim_matrix_df.loc[user_id]

    if isinstance(user_similarities, pd.DataFrame):
        user_similarities = user_similarities.iloc[0]

    user_similarities_df = user_similarities.to_frame().reset_index()

    user_similarities_df.columns = ['Column', 'Similarity']

    user_similarities_sorted_df = user_similarities_df.sort_values(by='Similarity', ascending=False)

    top_ten_similarities = user_similarities_sorted_df.head(10)

    similar_users = top_ten_similarities['Column'].tolist()

    # return similar_users

    all_predictions = []

    for similar_user_id in similar_users:
        similar_user_predictions = pred[pred['uid'] == similar_user_id]
        all_predictions.extend(similar_user_predictions.itertuples(index=False))

    # print(all_predictions)

    all_predictions_df = pd.DataFrame(all_predictions, columns=['uid', 'iid', 'r_ui', 'est', 'details'])

    product_list = all_predictions_df['iid'].value_counts().index[:5].tolist()

    recommendations = product_info[product_info['product_id'].isin(product_list)]
    recommendations = recommendations[['product_id', 'product_name']]

    # Convert all_predictions to a Surprise Dataset object
    all_predictions_dataset = Dataset.load_from_df(all_predictions_df[['uid', 'iid', 'r_ui']], reader)

    # Convert pred to a Surprise Dataset object
    pred_dataset = Dataset.load_from_df(pred[['uid', 'iid', 'r_ui']], reader)

    # Build a full training set for both datasets
    all_predictions_trainset = all_predictions_dataset.build_full_trainset()
    pred_trainset = pred_dataset.build_full_trainset()

    # Compute predictions for both datasets
    all_predictions_estimates = algo.test(all_predictions_trainset.build_testset())
    pred_estimates = algo.test(pred_trainset.build_testset())

    # Compute RMSE and MSE for all_predictions
    all_predictions_rmse = accuracy.rmse(all_predictions_estimates)
    all_predictions_mse = accuracy.mse(all_predictions_estimates)

    # Compute RMSE and MSE for pred
    pred_rmse = accuracy.rmse(pred_estimates)
    pred_mse = accuracy.mse(pred_estimates)

    return recommendations, all_predictions_rmse, all_predictions_mse, pred_rmse, pred_mse

In [51]:
recommendations, all_predictions_rmse, all_predictions_mse, pred_rmse, pred_mse = get_recommendations_SVD('4', pred, user_sim_matrix_df)
print("Recommendations:", recommendations)
print("RMSE for all_predictions:", all_predictions_rmse)
print("MSE for all_predictions:", all_predictions_mse)
print("RMSE for pred:", pred_rmse)
print("MSE for pred:", pred_mse)

RMSE: 0.1574
MSE: 0.0248
RMSE: 0.1513
MSE: 0.0229
Recommendations:      product_id                                       product_name
89      P439055               GENIUS Sleeping Collagen Moisturizer
5589    P416815             Find Your Balance Oil Control Cleanser
5591    P434545                   Glow Cycle Retin-ALT Power Serum
5745    P297524  Clear Improvement Active Charcoal Mask to Clea...
5746    P297516               Checks and Balances Frothy Face Wash
RMSE for all_predictions: 0.15741012189902623
MSE for all_predictions: 0.024777946476266296
RMSE for pred: 0.15127438507997823
MSE for pred: 0.022883939581325542
