# Load Product Data

In [None]:
import pandas as pd
import numpy as np

# Read the product catalogue from disk and print a full per-column summary
# (non-null counts and dtypes) as a first look at data quality.
product_info = pd.read_csv("product_info.csv")
product_info.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8494 entries, 0 to 8493
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          8494 non-null   object 
 1   product_name        8494 non-null   object 
 2   brand_id            8494 non-null   int64  
 3   brand_name          8494 non-null   object 
 4   loves_count         8494 non-null   int64  
 5   rating              8216 non-null   float64
 6   reviews             8216 non-null   float64
 7   size                6863 non-null   object 
 8   variation_type      7050 non-null   object 
 9   variation_value     6896 non-null   object 
 10  variation_desc      1250 non-null   object 
 11  ingredients         7549 non-null   object 
 12  price_usd           8494 non-null   float64
 13  value_price_usd     451 non-null    float64
 14  sale_price_usd      270 non-null    float64
 15  limited_edition     8494 non-null   int64  
 16  new   

In [None]:
# Count the missing values in every catalogue column.
num_missing = product_info.isna().sum()
num_missing

Unnamed: 0,0
product_id,0
product_name,0
brand_id,0
brand_name,0
loves_count,0
rating,278
reviews,278
size,1631
variation_type,1444
variation_value,1598


# Content-based (skincare + makeup only, remove products with no ingredients) | Experiment 1

In [None]:
# Keep only the columns the content-based recommender needs.
# Selecting with [[...]] raises KeyError on a misspelt column name, whereas
# pd.DataFrame(frame, columns=[...]) would silently add an all-NaN column.
# .copy() makes `products` independent of `product_info`.
content_columns = ['product_id', 'product_name', 'ingredients', 'highlights', 'primary_category']
products = product_info[content_columns].copy()

# Only the last expression of a cell is rendered, so show just the shape here.
products.shape

(8494, 5)

In [None]:
# Shape of the full catalogue, for comparison with the column subset above.
product_info.shape

(8494, 27)

In [None]:
# Restrict the candidate pool to the two categories this experiment covers.
products = products.loc[
    lambda frame: frame['primary_category'].isin(['Skincare', 'Makeup'])
]

In [None]:
# A product without an ingredient list has nothing to vectorise, so discard it.
has_no_ingredients = products['ingredients'].isna()
products = products.loc[~has_no_ingredients]
products.shape

(4319, 5)

In [None]:
# Renumber rows 0..n-1 so that index labels coincide with row positions.
products = products.reset_index(drop=True)

# Lookup table: product_name -> row position in `products`.
# NOTE(review): if two products share a name, indexing this Series by that name
# returns several positions instead of one — confirm names are unique.
indices = pd.Series(products.index, index=products['product_name'])

products

Unnamed: 0,product_id,product_name,ingredients,highlights,primary_category
0,P398965,Rose Lip Conditioner,['Polybutene; Hydrogenated Polyisobutene; Dext...,"['Hydrating', 'Good for: Dryness']",Makeup
1,P439055,GENIUS Sleeping Collagen Moisturizer,"['Collagen (Vegan)*, Water (Aqua, Eau), Ethylh...","['Vegan', 'Good for: Loss of firmness', 'Colla...",Skincare
2,P421277,GENIUS Liquid Collagen Serum,"['Collagen (Vegan)*, Water (Aqua, Eau), Propan...","['Vegan', 'Good for: Loss of firmness', 'Colla...",Skincare
3,P467602,Triple Algae Eye Renewal Balm Eye Cream,"['Aqua (Water/Eau), Stearic Acid, Isopropyl Is...",,Skincare
4,P432045,GENIUS Liquid Collagen Lip Treatment,"['Collagen (Vegan)*, Water (Aqua, Eau), Glycer...","['Vegan', 'Good for: Loss of firmness', 'Plump...",Skincare
...,...,...,...,...,...
4314,P474604,NU TONE CORRECTOR Color Corrector with Vitamin E,"['Aqua/Water/Eau, Glycerin, Alcohol Denat., Bu...","['Natural Finish', 'Without Mineral Oil', 'Wit...",Makeup
4315,P483501,Mini Rouge Pur Couture Exclusive Set,['Pentaerythrityl Isostearate/Caprate/Caprylat...,"['Hydrating', 'Satin Finish']",Makeup
4316,P467660,Mascara Volume Effet Faux Cils Radical,"['Aqua/Water, Paraffin, Stearic Acid, Copernic...",,Makeup
4317,P467659,Couture Clutch Eyeshadow Palette,"['Talc, Synthetic Fluorphlogopite, Triethylhex...",,Makeup


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Every product's raw ingredient string is treated as one document.
texts = products['ingredients'].to_numpy()

# Default TF-IDF settings: learn the vocabulary and weight the documents in one pass.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)

# (number of products, vocabulary size)
tfidf_matrix.shape

(4319, 7102)

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise product-to-product similarity. TfidfVectorizer L2-normalises rows by
# default, so the plain dot product from linear_kernel equals cosine similarity.
# NOTE(review): this matrix is bound to the name `cosine_similarity`, which the later
# "Calculate Cosine Similarity between users" cell rebinds to sklearn's function via
# `from sklearn.metrics.pairwise import cosine_similarity`.
cosine_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
def get_recommendations(product_name, cosine_sim=cosine_similarity, top_n=5):
    """Return the products whose ingredient lists are most similar to one product.

    Parameters
    ----------
    product_name : str
        Name to look up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Square product-by-product similarity matrix aligned with the rows of
        ``products``. Defaults to the matrix that was bound to
        ``cosine_similarity`` when this function was defined.
    top_n : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    pandas.Series
        ``product_name`` values of the ``top_n`` most similar products, best first.

    Raises
    ------
    KeyError
        If ``product_name`` is not present in ``indices``.
    """
    idx = indices[product_name]
    # A duplicated product name yields a Series of positions; use the first one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Use the matrix that was passed in. Reading the global `cosine_similarity`
    # here would break once that name is rebound to something else.
    scores = np.asarray(cosine_sim[idx])

    # Stable descending sort keeps the original row order among ties.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query product explicitly rather than assuming it sorts first
    # (another product with an identical ingredient list can tie with it).
    ranked = ranked[ranked != idx][:top_n]

    return products['product_name'].iloc[ranked]

In [None]:
# Example query: products most similar, by ingredients, to one moisturizer.
get_recommendations('GENIUS Sleeping Collagen Moisturizer', cosine_similarity)

Unnamed: 0,product_name
2,GENIUS Liquid Collagen Serum
26,Mini GENIUS Liquid Collagen
9,10 Day Results Kit
4,GENIUS Liquid Collagen Lip Treatment
29,Plump Pout Duo


# Collaborative Filtering (SVD) | Experiment 2

# Load Data

In [None]:
# Read author_id as text so every id has one consistent type for grouping and
# for string lookups later in the notebook.
# NOTE(review): the plain read emitted a warning on this line (see the captured
# output); presumably pandas' mixed-type DtypeWarning for author_id — confirm.
# low_memory=False makes pandas infer each column's dtype from the whole file.
reviews = pd.read_csv(
    "reviews_0-250.csv",
    dtype={"author_id": str},
    low_memory=False,
)
reviews.info()

  reviews = pd.read_csv("reviews_0-250.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253832 entries, 0 to 253831
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                253832 non-null  int64  
 1   author_id                 253832 non-null  object 
 2   rating                    253832 non-null  int64  
 3   is_recommended            195179 non-null  float64
 4   helpfulness               112626 non-null  float64
 5   total_feedback_count      253832 non-null  int64  
 6   total_neg_feedback_count  253832 non-null  int64  
 7   total_pos_feedback_count  253832 non-null  int64  
 8   submission_time           253832 non-null  object 
 9   review_text               253311 non-null  object 
 10  review_title              184221 non-null  object 
 11  skin_tone                 202661 non-null  object 
 12  eye_color                 185889 non-null  object 
 13  skin_type                 219592 non-null  o

# Data Preprocessing

In [None]:
# Review columns that the collaborative-filtering experiment does not use.
columns_to_drop = [
    'Unnamed: 0',
    'total_feedback_count',
    'total_neg_feedback_count',
    'total_pos_feedback_count',
    'submission_time',
    'review_title',
    'is_recommended',
    'product_name',
    'brand_name',
    'price_usd',
]
reviews = reviews.drop(columns=columns_to_drop)

In [None]:
# Keep skincare and makeup products that have both a review count and a rating.
# .copy() detaches the result from the original frame so later column edits act
# on an independent object.
product_info = product_info[product_info['primary_category'].isin(['Skincare', 'Makeup'])]
product_info = product_info.dropna(subset=['reviews', 'rating']).copy()

# Number of reviews per product, most-reviewed first. sort_values returns a new
# frame, so its result has to be kept for the ordering to take effect.
product_stats = (
    reviews.groupby('product_id')
    .size()
    .reset_index(name='counts')
    .sort_values('counts', ascending=False)
)

# Discard products at or below the 10th percentile of review counts.
cutoff = product_stats['counts'].quantile(0.1)
filtered_products = product_stats.loc[product_stats['counts'] > cutoff]

# Keep this as a Series: .squeeze() would turn a one-row selection into a bare
# scalar, which isin() rejects.
products_set = filtered_products['product_id']

# NOTE(review): products_subset is not referenced by the later cells visible here
# (the merge uses product_info) — confirm whether this filter was meant to apply.
products_subset = product_info.loc[product_info['product_id'].isin(products_set)]

In [None]:
# A bare `.shape` expression that is not the cell's last line is never rendered,
# so print it explicitly before the column summary.
print(product_info.shape)
product_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4679 entries, 65 to 8489
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          4679 non-null   object 
 1   product_name        4679 non-null   object 
 2   brand_id            4679 non-null   int64  
 3   brand_name          4679 non-null   object 
 4   loves_count         4679 non-null   int64  
 5   rating              4679 non-null   float64
 6   reviews             4679 non-null   float64
 7   size                3669 non-null   object 
 8   variation_type      3844 non-null   object 
 9   variation_value     3749 non-null   object 
 10  variation_desc      1189 non-null   object 
 11  ingredients         4223 non-null   object 
 12  price_usd           4679 non-null   float64
 13  value_price_usd     229 non-null    float64
 14  sale_price_usd      184 non-null    float64
 15  limited_edition     4679 non-null   int64  
 16  new       

In [None]:
# Catalogue columns that play no part in the recommender.
columns_to_drop = [
    'variation_type',
    'variation_value',
    'variation_desc',
    'price_usd',
    'value_price_usd',
    'sale_price_usd',
    'limited_edition',
    'new',
    'online_only',
    'out_of_stock',
    'sephora_exclusive',
    'child_count',
    'child_max_price',
    'child_min_price',
    'secondary_category',
    'tertiary_category',
]
product_info = product_info.drop(columns=columns_to_drop)

In [None]:
# Number of reviews written by each author, most active first. sort_values
# returns a new frame, so its result has to be kept for the ordering to apply.
author_stats = (
    reviews.groupby('author_id')
    .size()
    .reset_index(name='counts')
    .sort_values('counts', ascending=False)
)

# Keep only authors above the 99th percentile of review counts.
cutoff = author_stats['counts'].quantile(0.99)
filtered_authors = author_stats.loc[author_stats['counts'] > cutoff]

# Keep this as a Series: .squeeze() would turn a one-row selection into a bare
# scalar, which isin() rejects.
authors = filtered_authors['author_id']

# .copy() makes the subset an independent frame rather than a slice of `reviews`.
reviews_subset = reviews.loc[reviews['author_id'].isin(authors)].copy()

In [None]:
# Inspect the reviews kept for the most active authors.
reviews_subset

Unnamed: 0,author_id,rating,helpfulness,review_text,skin_tone,eye_color,skin_type,hair_color,product_id
747,5086845963,4,0.960784,Pumpkin spice smells okay. It’s a true pumpkin...,medium,brown,combination,black,P420652
748,5086845963,5,1.000000,Peppermint is a re-release from last year and ...,lightMedium,brown,combination,black,P420652
877,22934451719,5,1.000000,MY FAVORITE LIP SLEEPING MASK. It feels so goo...,medium,brown,oily,black,P420652
878,22934451719,5,0.666667,MY FAVORITE LIP SLEEPING MASK. It feels so goo...,medium,brown,oily,black,P420652
996,5086845963,4,1.000000,Focusing this review on the scent since the fo...,lightMedium,brown,combination,black,P420652
...,...,...,...,...,...,...,...,...,...
251947,1288462295,5,,I got this as a sample & it is phenomenal...I ...,lightMedium,blue,combination,red,P422905
252062,1930716686,5,,Makes eye area very smooth. Has perfect consi...,light,brown,dry,brown,P422905
252367,5826279851,5,,This is the perfect consistency: not too heavy...,light,brown,normal,brown,P422905
252395,1378522783,4,,Excited to receive this in my Play box! I love...,fair,green,oily,blonde,P422905


In [None]:
# Size of the full review table, for comparison with the author-filtered subset.
reviews.shape

(253832, 9)

In [None]:
# Size of the review table after keeping only the most active authors.
reviews_subset.shape

(9350, 9)

In [None]:
# Per-column non-null counts; shows which reviewer attributes are missing.
reviews_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9350 entries, 747 to 253517
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   author_id    9350 non-null   object 
 1   rating       9350 non-null   int64  
 2   helpfulness  4773 non-null   float64
 3   review_text  9343 non-null   object 
 4   skin_tone    8296 non-null   object 
 5   eye_color    8081 non-null   object 
 6   skin_type    8722 non-null   object 
 7   hair_color   8055 non-null   object 
 8   product_id   9350 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 730.5+ KB


In [None]:
# Reviewer attributes that make up the textual user profile.
trait_columns = ['skin_tone', 'eye_color', 'skin_type', 'hair_color']

# Drop reviews missing any attribute. .copy() turns the filtered result into an
# independent frame, so adding a column below does not raise
# SettingWithCopyWarning.
reviews_subset = reviews_subset.dropna(subset=trait_columns).copy()

# One list of four labelled traits per review, e.g. ['Fair Skintone', 'Green Eyes', ...].
# Built column-wise with zip (no per-row Series objects), and wrapped in a Series
# on the frame's own index so each list is stored as a single cell. This also
# works when the frame is empty.
reviews_subset['user_characteristics'] = pd.Series(
    [
        [
            f'{skin_tone.capitalize()} Skintone',
            f'{eye_color.capitalize()} Eyes',
            f'{skin_type.capitalize()} Skintype',
            f'{hair_color.capitalize()} Hair',
        ]
        for skin_tone, eye_color, skin_type, hair_color in zip(
            *(reviews_subset[column] for column in trait_columns)
        )
    ],
    index=reviews_subset.index,
    dtype=object,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_subset['user_characteristics'] = reviews_subset.apply(


In [None]:
# A bare `.shape` expression that is not the cell's last line is never rendered,
# so print it explicitly before displaying the frame.
print(reviews_subset.shape)
reviews_subset

Unnamed: 0,author_id,rating,helpfulness,review_text,skin_tone,eye_color,skin_type,hair_color,product_id,user_characteristics
747,5086845963,4,0.960784,Pumpkin spice smells okay. It’s a true pumpkin...,medium,brown,combination,black,P420652,"[Medium Skintone, Brown Eyes, Combination Skin..."
748,5086845963,5,1.000000,Peppermint is a re-release from last year and ...,lightMedium,brown,combination,black,P420652,"[Lightmedium Skintone, Brown Eyes, Combination..."
877,22934451719,5,1.000000,MY FAVORITE LIP SLEEPING MASK. It feels so goo...,medium,brown,oily,black,P420652,"[Medium Skintone, Brown Eyes, Oily Skintype, B..."
878,22934451719,5,0.666667,MY FAVORITE LIP SLEEPING MASK. It feels so goo...,medium,brown,oily,black,P420652,"[Medium Skintone, Brown Eyes, Oily Skintype, B..."
996,5086845963,4,1.000000,Focusing this review on the scent since the fo...,lightMedium,brown,combination,black,P420652,"[Lightmedium Skintone, Brown Eyes, Combination..."
...,...,...,...,...,...,...,...,...,...,...
251947,1288462295,5,,I got this as a sample & it is phenomenal...I ...,lightMedium,blue,combination,red,P422905,"[Lightmedium Skintone, Blue Eyes, Combination ..."
252062,1930716686,5,,Makes eye area very smooth. Has perfect consi...,light,brown,dry,brown,P422905,"[Light Skintone, Brown Eyes, Dry Skintype, Bro..."
252367,5826279851,5,,This is the perfect consistency: not too heavy...,light,brown,normal,brown,P422905,"[Light Skintone, Brown Eyes, Normal Skintype, ..."
252395,1378522783,4,,Excited to receive this in my Play box! I love...,fair,green,oily,blonde,P422905,"[Fair Skintone, Green Eyes, Oily Skintype, Blo..."


# Combining the two datasets

In [None]:
# Attach product metadata to every retained review. Only product_ids present in
# both tables survive the inner join.
result = pd.merge(
    product_info,
    reviews_subset,
    on='product_id',
    how='inner',
)

In [None]:
# Rows and columns of the joined product/review table.
result.shape

(7915, 20)

In [None]:
# Renumber rows 0..n-1, discarding the previous index.
result = result.reset_index(drop=True)

# Calculate Cosine Similarity between users

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Flatten each list of traits into a single comma-separated string.
result['user_characteristics_str'] = result['user_characteristics'].map(', '.join)

# One document per row of `result`.
texts = result['user_characteristics_str'].to_numpy()

# Cap the vocabulary at 100 terms, then learn it and weight the documents.
vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = vectorizer.fit_transform(texts)

# Pairwise similarity between the trait profiles. There is one row and one
# column per row of `result`.
user_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Calculate Implicit Rating

In [None]:
from textblob import TextBlob

def get_sentiment_score(text):
    """Return TextBlob's sentiment polarity for ``text`` (a score between -1 and 1)."""
    return TextBlob(text).sentiment.polarity

# Fill missing reviews BEFORE casting to str. astype(str) turns NaN into the
# literal text 'nan', after which fillna('') has nothing left to replace.
result['review_text'] = result['review_text'].fillna('').astype(str)

# Polarity of every review text.
result['sentiment_score'] = result['review_text'].apply(get_sentiment_score)

In [None]:
import numpy as np

def sentiment_to_rating(sentiment_score):
    """Linearly rescale a polarity from the range [-1, 1] onto the 1-5 rating scale.

    Scores outside [-1, 1] are clamped to the nearest end of the rating scale,
    because ``np.interp`` does not extrapolate.
    """
    polarity_bounds = [-1, 1]
    rating_bounds = [1, 5]
    return np.interp(sentiment_score, polarity_bounds, rating_bounds)

# Convert every polarity score into its 1-5 "implicit" rating.
result['implicit_rating'] = result['sentiment_score'].map(sentiment_to_rating)

In [None]:
# Column summary followed by a preview of the enriched table, which now includes
# the sentiment_score and implicit_rating columns.
result.info()
result

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7915 entries, 0 to 7914
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   product_id                7915 non-null   object 
 1   product_name              7915 non-null   object 
 2   brand_id                  7915 non-null   int64  
 3   brand_name                7915 non-null   object 
 4   loves_count               7915 non-null   int64  
 5   rating_x                  7915 non-null   float64
 6   reviews                   7915 non-null   float64
 7   size                      7780 non-null   object 
 8   ingredients               7914 non-null   object 
 9   highlights                7641 non-null   object 
 10  primary_category          7915 non-null   object 
 11  author_id                 7915 non-null   object 
 12  rating_y                  7915 non-null   int64  
 13  helpfulness               4063 non-null   float64
 14  review_t

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating_x,reviews,size,ingredients,highlights,...,helpfulness,review_text,skin_tone,eye_color,skin_type,hair_color,user_characteristics,user_characteristics_str,sentiment_score,implicit_rating
0,P460779,Skinlongevity Long Life Herb Anti-Aging Face S...,5737,bareMinerals,7614,4.4339,3727.0,1.7 oz/ 50 mL,"['Water (Aqua/Eau), Propanediol, Niacinamide, ...","['Vegan', 'Good for: Dullness/Uneven Texture',...",...,1.0,Love this serum-it’s lightweight and really wo...,fair,green,combination,blonde,"[Fair Skintone, Green Eyes, Combination Skinty...","Fair Skintone, Green Eyes, Combination Skintyp...",0.350000,3.700000
1,P460779,Skinlongevity Long Life Herb Anti-Aging Face S...,5737,bareMinerals,7614,4.4339,3727.0,1.7 oz/ 50 mL,"['Water (Aqua/Eau), Propanediol, Niacinamide, ...","['Vegan', 'Good for: Dullness/Uneven Texture',...",...,1.0,Received a sample packet and I am currently or...,light,green,combination,blonde,"[Light Skintone, Green Eyes, Combination Skint...","Light Skintone, Green Eyes, Combination Skinty...",0.216667,3.433333
2,P460779,Skinlongevity Long Life Herb Anti-Aging Face S...,5737,bareMinerals,7614,4.4339,3727.0,1.7 oz/ 50 mL,"['Water (Aqua/Eau), Propanediol, Niacinamide, ...","['Vegan', 'Good for: Dullness/Uneven Texture',...",...,,I’m really surprised at this! I have very dry ...,fairLight,brown,dry,blonde,"[Fairlight Skintone, Brown Eyes, Dry Skintype,...","Fairlight Skintone, Brown Eyes, Dry Skintype, ...",-0.043905,2.912189
3,P460779,Skinlongevity Long Life Herb Anti-Aging Face S...,5737,bareMinerals,7614,4.4339,3727.0,1.7 oz/ 50 mL,"['Water (Aqua/Eau), Propanediol, Niacinamide, ...","['Vegan', 'Good for: Dullness/Uneven Texture',...",...,,I love this serum. My skin has been a little s...,tan,brown,combination,black,"[Tan Skintone, Brown Eyes, Combination Skintyp...","Tan Skintone, Brown Eyes, Combination Skintype...",0.022500,3.045000
4,P460779,Skinlongevity Long Life Herb Anti-Aging Face S...,5737,bareMinerals,7614,4.4339,3727.0,1.7 oz/ 50 mL,"['Water (Aqua/Eau), Propanediol, Niacinamide, ...","['Vegan', 'Good for: Dullness/Uneven Texture',...",...,,I recently received this to try it out and it ...,light,brown,combination,brown,"[Light Skintone, Brown Eyes, Combination Skint...","Light Skintone, Brown Eyes, Combination Skinty...",0.306548,3.613095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7910,P441644,Mini Superfood Antioxidant Cleanser,6169,Youth To The People,121678,4.2121,5841.0,2 oz/ 59 mL,"['Water/Aqua/Eau, Cocamidopropyl Hydroxysultai...","['Refill Available', 'Community Favorite', 'Go...",...,,"Honestly, it took a few weeks of using this cl...",fair,brown,combination,brown,"[Fair Skintone, Brown Eyes, Combination Skinty...","Fair Skintone, Brown Eyes, Combination Skintyp...",0.227629,3.455258
7911,P441644,Mini Superfood Antioxidant Cleanser,6169,Youth To The People,121678,4.2121,5841.0,2 oz/ 59 mL,"['Water/Aqua/Eau, Cocamidopropyl Hydroxysultai...","['Refill Available', 'Community Favorite', 'Go...",...,,I initially got this brand in a play box and I...,fair,blue,oily,brown,"[Fair Skintone, Blue Eyes, Oily Skintype, Brow...","Fair Skintone, Blue Eyes, Oily Skintype, Brown...",0.275000,3.550000
7912,P441644,Mini Superfood Antioxidant Cleanser,6169,Youth To The People,121678,4.2121,5841.0,2 oz/ 59 mL,"['Water/Aqua/Eau, Cocamidopropyl Hydroxysultai...","['Refill Available', 'Community Favorite', 'Go...",...,,Smells nice. Looks cute. I don’t find that it ...,light,brown,combination,auburn,"[Light Skintone, Brown Eyes, Combination Skint...","Light Skintone, Brown Eyes, Combination Skinty...",0.413333,3.826667
7913,P441644,Mini Superfood Antioxidant Cleanser,6169,Youth To The People,121678,4.2121,5841.0,2 oz/ 59 mL,"['Water/Aqua/Eau, Cocamidopropyl Hydroxysultai...","['Refill Available', 'Community Favorite', 'Go...",...,,I got a decent sample size of this. It made my...,mediumTan,hazel,dry,brown,"[Mediumtan Skintone, Hazel Eyes, Dry Skintype,...","Mediumtan Skintone, Hazel Eyes, Dry Skintype, ...",0.266970,3.533939


# SVD

In [None]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357282 sha256=ac94d94db2fedf79caa6459652980f2477143420ba796e0760c1888a52be1d11
  Stored in directory: /root/.cach

In [None]:
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import train_test_split

# Surprise expects exactly three columns in (user, item, rating) order.
# rating_y is the star rating from the review table; rating_x is the catalogue's
# average rating (both suffixes come from the merge).
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(result[['author_id', 'product_id', 'rating_y']], reader)

# Train on every known rating; this model is used to score unseen pairs.
trainset = data.build_full_trainset()

# SVD starts from random factors and shuffles during SGD, so fix the seed to make
# the fitted model (and therefore the recommendations) reproducible.
algo = SVD(random_state=42)

algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7deba5569930>

In [None]:
# Every (user, item) pair where both are known but no rating exists in the
# trainset. These pairs have no true rating, so Surprise fills r_ui with the
# trainset's global mean by default.
testset = trainset.build_anti_testset()

# Predicted rating (`est`) for each unseen pair; these feed the recommendations.
predictions = algo.test(testset)

In [None]:
# The anti-testset predictions have no ground truth (their r_ui is only the
# global-mean fill), so scoring them does not measure prediction error.
# Evaluate on a held-out split of the real ratings instead, using a separate
# model so that `algo` and `predictions` stay untouched.
eval_trainset, eval_testset = train_test_split(data, test_size=0.2, random_state=42)

eval_algo = SVD(random_state=42)
eval_algo.fit(eval_trainset)
eval_predictions = eval_algo.test(eval_testset)

# Both helpers print the metric and return it.
mse = accuracy.mse(eval_predictions)
rmse = accuracy.rmse(eval_predictions)

MSE: 0.1360
RMSE: 0.3688


In [None]:
# Tabulate the predictions; later code reads the 'uid', 'iid' and 'est' columns.
pred = pd.DataFrame(predictions)

In [None]:
def get_recommendations(user_id, pred, top_n=5):
    """Return the names of the products with the highest predicted rating for a user.

    NOTE(review): this shares its name with the content-based helper defined
    earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    user_id : str
        Value to match against ``pred['uid']``.
    pred : pandas.DataFrame
        Predictions with at least the columns ``uid``, ``iid`` and ``est``.
    top_n : int, optional
        Number of products to return (default 5).

    Returns
    -------
    pandas.Series
        ``product_name`` values from the module-level ``product_info``, ordered
        from highest to lowest predicted rating. Empty when the user has no
        predictions or none of the products are in ``product_info``.
    """
    pred_subset = pred.loc[pred['uid'] == user_id]

    # sort_values returns a new frame; keep it so head() really picks the
    # best-scored rows rather than the first rows in arbitrary order.
    top_predictions = pred_subset.sort_values(by='est', ascending=False).head(top_n)
    product_list = top_predictions['iid'].to_list()

    recommendations = product_info.loc[product_info['product_id'].isin(product_list)]

    # isin() yields catalogue order; restore the ranking by predicted rating.
    rank = {product_id: position for position, product_id in enumerate(product_list)}
    order = recommendations['product_id'].map(rank).to_numpy().argsort(kind='stable')

    return recommendations['product_name'].iloc[order]

In [None]:
# Example: recommended products for one reviewer (the id is passed as a string).
get_recommendations('2660223134', pred)