In [47]:
import pandas as pd
import ast
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 

In [48]:
# Load preprocessed data
preprocessed_profiles_rating_path = "../data/preprocessed_profiles_rating.csv"
df_profile_ratings = pd.read_csv(preprocessed_profiles_rating_path)

In [49]:
df_profile_ratings.head()

Unnamed: 0,id,user_id,name,product_id,rating,gender,age,skin_type_face,hair_issue,skin_type_body,allergy_history,preferred_products,avoided_products,specific_needs
0,6,7,Dini Sipahutar,215.0,5.0,Perempuan,18-25,['normal'],['minyak'],['normal'],['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
1,6,7,Dini Sipahutar,22.0,5.0,Perempuan,18-25,['normal'],['minyak'],['normal'],['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
2,6,7,Dini Sipahutar,311.0,5.0,Perempuan,18-25,['normal'],['minyak'],['normal'],['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
3,6,7,Dini Sipahutar,286.0,5.0,Perempuan,18-25,['normal'],['minyak'],['normal'],['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
4,6,7,Dini Sipahutar,82.0,4.0,Perempuan,18-25,['normal'],['minyak'],['normal'],['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']


In [50]:
df_profile_ratings['product_id'] = df_profile_ratings['product_id'].fillna(0).astype(int)

In [51]:
df_profile_ratings.head()

Unnamed: 0,id,user_id,name,product_id,rating,gender,age,skin_type_face,hair_issue,skin_type_body,allergy_history,preferred_products,avoided_products,specific_needs
0,6,7,Dini Sipahutar,215,5.0,Perempuan,18-25,['normal'],['minyak'],['normal'],['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
1,6,7,Dini Sipahutar,22,5.0,Perempuan,18-25,['normal'],['minyak'],['normal'],['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
2,6,7,Dini Sipahutar,311,5.0,Perempuan,18-25,['normal'],['minyak'],['normal'],['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
3,6,7,Dini Sipahutar,286,5.0,Perempuan,18-25,['normal'],['minyak'],['normal'],['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
4,6,7,Dini Sipahutar,82,4.0,Perempuan,18-25,['normal'],['minyak'],['normal'],['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']


In [52]:
# convert skin type, hair issue, skin type body to numeric value (int)
def convert_skin_type_face(skin_type): 
    skin_type_dict = {'normal': 0, 'kering': 1, 'minyak': 2, 'sensitif': 3, 'kombinasi': 4}
    skin_type_list = ast.literal_eval(skin_type)
    return skin_type_dict.get(skin_type_list[0], 0)

def convert_hair_issue(hair_issue): 
    hair_issue_dict = {'normal': 1, 'ketombe': 1, 'kering': 2, 'minyak': 3, 'rontok': 4, 'cabang': 5}
    hair_issue_list = ast.literal_eval(hair_issue)
    return hair_issue_dict.get(hair_issue_list[0], 0)

def convert_skin_type_body(skin_type): 
    skin_type_dict = {'normal': 0, 'kering': 1, 'minyak': 2, 'kombinasi': 3} 
    skin_type_list = ast.literal_eval(skin_type)
    return skin_type_dict.get(skin_type_list[0], 0)

In [53]:
# functions to the respective columns in the data frame to converting the string values to numeric values based on the provided dictionaries.
df_profile_ratings["skin_type_face"] = df_profile_ratings["skin_type_face"].apply(convert_skin_type_face) 
df_profile_ratings["hair_issue"] = df_profile_ratings["hair_issue"].apply(convert_hair_issue) 
df_profile_ratings["skin_type_body"] = df_profile_ratings["skin_type_body"].apply(convert_skin_type_body)

In [54]:
# Get the minimum and maximum values for each column
column_min = df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]].min()
column_max = df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]].max()

# Perform min-max scaling
df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]] = (
    df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]] - column_min) / (column_max - column_min)

# Round the values to 3 decimal places
df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]] = df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]].round(3)

In [55]:
print(df_profile_ratings)

     id  user_id            name  product_id  rating     gender    age  \
0     6        7  Dini Sipahutar         215     5.0  Perempuan  18-25   
1     6        7  Dini Sipahutar          22     5.0  Perempuan  18-25   
2     6        7  Dini Sipahutar         311     5.0  Perempuan  18-25   
3     6        7  Dini Sipahutar         286     5.0  Perempuan  18-25   
4     6        7  Dini Sipahutar          82     4.0  Perempuan  18-25   
..   ..      ...             ...         ...     ...        ...    ...   
427  64       67          Wilona         294     5.0  Perempuan  18-38   
428  64       67          Wilona         315     4.0  Perempuan  18-39   
429  64       67          Wilona         166     4.0  Perempuan  18-40   
430  64       67          Wilona         168     5.0  Perempuan  18-41   
431  64       67          Wilona         262     4.0  Perempuan  18-42   

     skin_type_face  hair_issue  skin_type_body             allergy_history  \
0              0.00        0.50 

In [56]:
# Drop rows with NaN values
# df_profile_ratings = df_profile_ratings.dropna(subset=['skin_type_face', 'hair_issue', 'skin_type_body'])

In [57]:
# Convert user_id to numeric before grouping
df_profile_ratings['user_id'] = df_profile_ratings['user_id'].astype(int)

# Calculate the mean vector for each user
user_vectors = df_profile_ratings.groupby('user_id')[['skin_type_face', 'hair_issue', 'skin_type_body']].mean().round(3)

# Reset the index
user_vectors.reset_index(inplace=True)

In [58]:
print(user_vectors)

    user_id  skin_type_face  hair_issue  skin_type_body
0         7            0.00        0.50           0.000
1         8            0.75        0.25           0.333
2        11            0.00        0.00           0.667
3        12            1.00        0.75           0.000
4        13            0.50        0.00           0.000
5        14            0.50        0.00           0.000
6        15            0.50        0.25           0.000
7        16            0.00        0.25           0.000
8        17            0.00        0.00           0.000
9        18            0.50        0.25           0.000
10       19            0.50        0.00           0.000
11       20            0.50        0.00           0.333
12       21            1.00        0.75           0.000
13       22            0.00        0.75           0.000
14       23            0.75        0.25           0.000
15       24            0.50        0.00           0.333
16       25            0.50        0.25         

In [59]:
# Ubah tipe data user_id ke integer (jika belum integer)
user_vectors['user_id'] = user_vectors['user_id'].astype(int)

# Filter hanya data user_id yang ada dalam file CSV
user_vectors = user_vectors[user_vectors['user_id'].isin(df_profile_ratings['user_id'].unique())]

# Simpan user_id sebelum dihapus dari user_vectors
user_ids = user_vectors['user_id']

# Hapus kolom user_id sebelum menghitung similaritas
user_vectors = user_vectors.drop('user_id', axis=1)

# Hitung similaritas kosinus antara semua pasangan pengguna
user_similarities = cosine_similarity(user_vectors)

# Konversi matriks similaritas ke dataframe pandas
user_similarities = pd.DataFrame(user_similarities, index=user_ids, columns=user_ids)

In [60]:
print(user_similarities)

user_id        7         8         11        12        13        14        15  \
user_id                                                                         
7        1.000000  0.291430  0.000000  0.600000  0.000000  0.000000  0.447214   
8        0.291430  1.000000  0.388184  0.874289  0.874289  0.874289  0.912319   
11       0.000000  0.388184  1.000000  0.000000  0.000000  0.000000  0.000000   
12       0.600000  0.874289  0.000000  1.000000  0.800000  0.800000  0.983870   
13       0.000000  0.874289  0.000000  0.800000  1.000000  1.000000  0.894427   
14       0.000000  0.874289  0.000000  0.800000  1.000000  1.000000  0.894427   
15       0.447214  0.912319  0.000000  0.983870  0.894427  0.894427  1.000000   
16       1.000000  0.291430  0.000000  0.600000  0.000000  0.000000  0.447214   
17       0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
18       0.447214  0.912319  0.000000  0.983870  0.894427  0.894427  1.000000   
19       0.000000  0.874289 

In [61]:
# Get all unique items
items = df_profile_ratings['product_id'].unique()

# Get all unique user IDs from the preprocessed data
unique_user_ids = df_profile_ratings['user_id'].unique()

In [62]:
# Initialize a dictionary to store the recommendation values for each user
recommendations = {}

# Iterate over each user ID from the preprocessed data
for user_id in unique_user_ids:
    # Check if the user ID exists in the user_vectors dataframe and has at least one rating
    if user_id in user_vectors.index and user_id in df_profile_ratings['user_id'].values:
        # Calculate the sum of similarity values for the current user
        similarity_sum = user_similarities.loc[user_id].sum()
        
        # Initialize a dictionary to store the prediction values for each item
        predictions = {}
        
        # Check if the user has some similarity with other users
        if similarity_sum > 0:
            # Iterate over each item
            for item in items:
                # Filter the ratings for the current user and item
                ratings = df_profile_ratings[(df_profile_ratings['user_id'] == user_id) & (df_profile_ratings['product_id'] == item)]
                
                # If the user has not rated the item
                if ratings.empty:
                    # Calculate the sum of the ratings weighted by the similarity values
                    rating_sum = (ratings['rating'] * user_similarities.loc[user_id]).sum()
                    # Add the prediction value to the dictionary
                    predictions[item] = rating_sum / similarity_sum
        
        # Add the prediction values for the current user to the recommendations dictionary
        recommendations[user_id] = predictions
    else:
        # If user has no similarity scores or no ratings, assign an empty dictionary
        recommendations[user_id] = {}

# Convert the recommendations dictionary to a pandas dataframe
recommendations_df = pd.DataFrame(recommendations)

# Transpose the dataframe so that user IDs are rows and product IDs are columns
recommendations_df = recommendations_df.T

# Round the values to 3 decimal places
recommendations_df = recommendations_df.round(3)

In [63]:
# Iterate over each user ID
for user_id in unique_user_ids:
    # Check if the user ID exists in the recommendations dataframe
    if user_id in recommendations_df.index:
        # Look up the user's name in the df_profile_ratings dataframe
        user_name = df_profile_ratings.loc[df_profile_ratings['user_id'] == user_id, 'name'].values[0]

        # Select the row for the specified user
        user_recs = recommendations_df.loc[user_id]

        # Sort the series in descending order and select the top 16 values
        top_recs = user_recs.sort_values(ascending=False).iloc[:10].index.tolist()

        # Print the recommended product IDs and names
        print("Top 10 recommended products for user_id = {} : ({})".format(user_id, user_name))
        print("Product IDs:")
        print(top_recs)
        print("\n")
    else:
        print(f"No recommendations found for user_id = {user_id}")

Top 10 recommended products for user_id = 7 : (Dini Sipahutar)
Product IDs:
[58, 33, 57, 211, 13, 305, 50, 116, 149, 12]


Top 10 recommended products for user_id = 8 : (Gladys)
Product IDs:
[7, 65, 30, 48, 313, 259, 137, 73, 232, 45]


Top 10 recommended products for user_id = 11 : (Suandika Napitupulu)
Product IDs:
[58, 65, 30, 48, 313, 259, 137, 73, 232, 45]


Top 10 recommended products for user_id = 12 : (Elisa Tambunan)
Product IDs:
[58, 83, 48, 313, 259, 137, 73, 232, 45, 65]


Top 10 recommended products for user_id = 13 : (Emy Sonia Sinambela)
Product IDs:
[24, 75, 48, 313, 259, 137, 73, 232, 45, 65]


Top 10 recommended products for user_id = 14 : (Stefhani Kezia)
Product IDs:
[58, 273, 30, 48, 313, 259, 137, 73, 232, 45]


Top 10 recommended products for user_id = 15 : (Josep Phyto Napitupulu)
Product IDs:
[58, 34, 259, 137, 73, 232, 45, 65, 75, 187]


Top 10 recommended products for user_id = 16 : (Samuel Simanjuntak)
Product IDs:
[58, 65, 30, 48, 313, 259, 137, 73, 232, 45