In [30]:
import pandas as pd
import ast
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
# Load preprocessed data
preprocessed_profiles_rating_path = "../data/preprosessing_data/preprocessed_profiles_rating.csv"
df_profile_ratings = pd.read_csv(preprocessed_profiles_rating_path)

In [32]:
# convert product_id data type to int
df_profile_ratings['product_id'] = df_profile_ratings['product_id'].fillna(0).astype(int)

In [33]:
# convert skin type, hair issue, skin type body to numeric value (int)
def convert_skin_type_face(skin_type): 
    skin_type_dict = {'normal': 0, 'kering': 1, 'minyak': 2, 'sensitif': 3, 'kombinasi': 4}
    skin_type_list = ast.literal_eval(skin_type)
    return skin_type_dict.get(skin_type_list[0], 0)

def convert_hair_issue(hair_issue): 
    hair_issue_dict = {'normal': 1, 'ketombe': 1, 'kering': 2, 'minyak': 3, 'rontok': 4, 'cabang': 5}
    hair_issue_list = ast.literal_eval(hair_issue)
    return hair_issue_dict.get(hair_issue_list[0], 0)

def convert_skin_type_body(skin_type): 
    skin_type_dict = {'normal': 0, 'kering': 1, 'minyak': 2, 'kombinasi': 3} 
    skin_type_list = ast.literal_eval(skin_type)
    return skin_type_dict.get(skin_type_list[0], 0)

In [34]:
# functions to the respective columns in the data frame to converting the string values to numeric values based on the provided dictionaries.
df_profile_ratings["skin_type_face"] = df_profile_ratings["skin_type_face"].apply(convert_skin_type_face) 
df_profile_ratings["hair_issue"] = df_profile_ratings["hair_issue"].apply(convert_hair_issue) 
df_profile_ratings["skin_type_body"] = df_profile_ratings["skin_type_body"].apply(convert_skin_type_body)
display(df_profile_ratings)

Unnamed: 0,id,user_id,name,product_id,rating,gender,age,skin_type_face,hair_issue,skin_type_body,allergy_history,preferred_products,avoided_products,specific_needs
0,6,7,Dini Sipahutar,215,5.0,Perempuan,18-25,0,3,0,['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
1,6,7,Dini Sipahutar,22,5.0,Perempuan,18-25,0,3,0,['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
2,6,7,Dini Sipahutar,311,5.0,Perempuan,18-25,0,3,0,['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
3,6,7,Dini Sipahutar,286,5.0,Perempuan,18-25,0,3,0,['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
4,6,7,Dini Sipahutar,82,4.0,Perempuan,18-25,0,3,0,['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,49,50,Maryono,315,5.0,Laki-laki,18-25,2,3,0,['tidak ada alergi'],['bahan alami'],['paraben'],['hilang jerawat']
210,50,51,Putri Napitupulu,302,4.0,Perempuan,18-25,4,1,1,"['wangi', 'kandung', 'kimia']","['bahan alami', 'minyak mineral']",['wangi'],"['rata', 'warna', 'kulit', 'hilang', 'jerawat'..."
211,50,51,Putri Napitupulu,35,5.0,Perempuan,18-25,4,1,1,"['wangi', 'kandung', 'kimia']","['bahan alami', 'minyak mineral']",['wangi'],"['rata', 'warna', 'kulit', 'hilang', 'jerawat'..."
212,50,51,Putri Napitupulu,82,5.0,Perempuan,18-25,4,1,1,"['wangi', 'kandung', 'kimia']","['bahan alami', 'minyak mineral']",['wangi'],"['rata', 'warna', 'kulit', 'hilang', 'jerawat'..."


In [35]:
# Get the minimum and maximum values for each column
column_min = df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]].min()
column_max = df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]].max()

# Perform min-max scaling
df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]] = (
    df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]] - column_min) / (column_max - column_min)

# Round the values to 3 decimal places
df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]] = df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]].round(2)

display(df_profile_ratings)

Unnamed: 0,id,user_id,name,product_id,rating,gender,age,skin_type_face,hair_issue,skin_type_body,allergy_history,preferred_products,avoided_products,specific_needs
0,6,7,Dini Sipahutar,215,5.0,Perempuan,18-25,0.0,0.50,0.00,['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
1,6,7,Dini Sipahutar,22,5.0,Perempuan,18-25,0.0,0.50,0.00,['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
2,6,7,Dini Sipahutar,311,5.0,Perempuan,18-25,0.0,0.50,0.00,['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
3,6,7,Dini Sipahutar,286,5.0,Perempuan,18-25,0.0,0.50,0.00,['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
4,6,7,Dini Sipahutar,82,4.0,Perempuan,18-25,0.0,0.50,0.00,['tidak ada alergi'],['cruelty freepewangiminyak mineral'],['parabensls'],['rata warna kulitperlindungan matahari']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,49,50,Maryono,315,5.0,Laki-laki,18-25,0.5,0.50,0.00,['tidak ada alergi'],['bahan alami'],['paraben'],['hilang jerawat']
210,50,51,Putri Napitupulu,302,4.0,Perempuan,18-25,1.0,0.00,0.33,"['wangi', 'kandung', 'kimia']","['bahan alami', 'minyak mineral']",['wangi'],"['rata', 'warna', 'kulit', 'hilang', 'jerawat'..."
211,50,51,Putri Napitupulu,35,5.0,Perempuan,18-25,1.0,0.00,0.33,"['wangi', 'kandung', 'kimia']","['bahan alami', 'minyak mineral']",['wangi'],"['rata', 'warna', 'kulit', 'hilang', 'jerawat'..."
212,50,51,Putri Napitupulu,82,5.0,Perempuan,18-25,1.0,0.00,0.33,"['wangi', 'kandung', 'kimia']","['bahan alami', 'minyak mineral']",['wangi'],"['rata', 'warna', 'kulit', 'hilang', 'jerawat'..."


In [36]:
# Convert user_id to numeric before grouping
df_profile_ratings['user_id'] = df_profile_ratings['user_id'].astype(int)

# Calculate the mean vector for each user
user_vectors = df_profile_ratings.groupby('user_id')[['skin_type_face', 'hair_issue', 'skin_type_body']].mean().round(2)

# Reset the index
user_vectors.reset_index(inplace=True)

display(user_vectors)

Unnamed: 0,user_id,skin_type_face,hair_issue,skin_type_body
0,7,0.0,0.5,0.0
1,8,0.75,0.25,0.33
2,11,0.0,0.0,0.67
3,12,1.0,0.75,0.0
4,13,0.5,0.0,0.0
5,14,0.5,0.0,0.0
6,15,0.5,0.25,0.0
7,16,0.0,0.25,0.0
8,17,0.0,0.0,0.0
9,18,0.5,0.25,0.0


In [37]:
# Ubah tipe data user_id ke integer (jika belum integer)
user_vectors['user_id'] = user_vectors['user_id'].astype(int)

# Filter hanya data user_id yang ada dalam file CSV
user_vectors = user_vectors[user_vectors['user_id'].isin(df_profile_ratings['user_id'].unique())]

# Simpan user_id sebelum dihapus dari user_vectors
user_ids = user_vectors['user_id']

# Hapus kolom user_id sebelum menghitung similaritas
user_vectors = user_vectors.drop('user_id', axis=1)

# Hitung similaritas kosinus antara semua pasangan pengguna
user_similarities = cosine_similarity(user_vectors)

# Konversi matriks similaritas ke dataframe pandas
user_similarities = pd.DataFrame(user_similarities, index=user_ids, columns=user_ids).round(2)

print(user_similarities)

user_id    7     8     11    12    13    14    15    16   17    18  ...    43  \
user_id                                                             ...         
7        1.00  0.29  0.00  0.60  0.00  0.00  0.45  1.00  0.0  0.45  ...  0.23   
8        0.29  1.00  0.39  0.88  0.88  0.88  0.91  0.29  0.0  0.91  ...  0.99   
11       0.00  0.39  1.00  0.00  0.00  0.00  0.00  0.00  0.0  0.00  ...  0.30   
12       0.60  0.88  0.00  1.00  0.80  0.80  0.98  0.60  0.0  0.98  ...  0.88   
13       0.00  0.88  0.00  0.80  1.00  1.00  0.89  0.00  0.0  0.89  ...  0.92   
14       0.00  0.88  0.00  0.80  1.00  1.00  0.89  0.00  0.0  0.89  ...  0.92   
15       0.45  0.91  0.00  0.98  0.89  0.89  1.00  0.45  0.0  1.00  ...  0.93   
16       1.00  0.29  0.00  0.60  0.00  0.00  0.45  1.00  0.0  0.45  ...  0.23   
17       0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.0  0.00  ...  0.00   
18       0.45  0.91  0.00  0.98  0.89  0.89  1.00  0.45  0.0  1.00  ...  0.93   
19       0.00  0.88  0.00  0

In [38]:
# Get all unique items
items = df_profile_ratings['product_id'].unique()

# Get all unique user IDs from the preprocessed data
unique_user_ids = df_profile_ratings['user_id'].unique()
print(items)

[215  22 311 286  82 140 112  63  51  21  18   5   8  68  92  85 216 307
 280 276 236  58  24 300   7 315 294 134  42 123 167 168 188 199  84 283
  16 309  47  20 263  35 135 191   6 295 122 314  46 312 306  59  36 138
 298 316 166 310  56  23 161 207 252  88  32 142 120 208  94 302 163 228
 193 147 141   4 160  78   9 114 304 234 218 285 299  14  15  37  25  81
  83 121 156 159  66  60 164 165  57 211  13 305  50 116 149  33  12 175
  44 130 273  48]


In [39]:
# Initialize a dictionary to store the recommendation values for each user
recommendations = {}

# Iterate over each user ID from the preprocessed data
for user_id in unique_user_ids:
    # Calculate the sum of similarities for the current user
    similarity_sum = user_similarities.loc[user_id].sum()
    predictions = {}

    if similarity_sum > 0:  # Check if similarity_sum is greater than 0 to avoid division by zero
        for item in items:
            # Filter ratings for the item from other users
            other_user_ratings = df_profile_ratings[df_profile_ratings['product_id'] == item]

            # Calculate the sum of ratings weighted by similarity values
            rating_sum = 0
            for other_user_id in other_user_ratings['user_id']:
                if other_user_id != user_id:
                    rating = other_user_ratings[other_user_ratings['user_id'] == other_user_id]['rating'].values[0]
                    similarity = user_similarities.loc[user_id, other_user_id]
                    rating_sum += rating * similarity

            # Calculate the predicted rating for the item
            predictions[item] = rating_sum / similarity_sum

    # Store the predictions for the current user
    recommendations[user_id] = predictions

# Convert the recommendations dictionary to a pandas dataframe
recommendations_df = pd.DataFrame(recommendations)
recommendations_df = recommendations_df.T
recommendations_df = recommendations_df.round(2)

# Fill NaN values and 0 values with 0
recommendations_df = recommendations_df.fillna(0).replace(0, 0)

print(recommendations_df)

     215   22    311   286   82    140   112   63    51    21   ...   50   \
7   0.60  0.00  0.25  0.07  1.78  0.00  0.00  0.29  0.00  0.25  ...  0.00   
8   0.57  0.05  0.19  0.15  1.42  0.48  0.05  0.29  0.03  0.19  ...  0.14   
11  0.22  0.00  0.00  0.00  0.65  0.17  0.00  0.25  0.00  0.00  ...  0.45   
12  0.67  0.11  0.28  0.21  1.47  0.48  0.11  0.33  0.06  0.28  ...  0.09   
13  0.55  0.00  0.15  0.12  1.41  0.57  0.00  0.25  0.00  0.15  ...  0.12   
14  0.55  0.00  0.15  0.12  1.21  0.40  0.00  0.25  0.00  0.15  ...  0.12   
15  0.47  0.08  0.25  0.19  1.56  0.50  0.08  0.31  0.05  0.25  ...  0.09   
16  0.95  0.35  0.60  0.42  1.78  0.28  0.35  0.51  0.21  0.60  ...  0.00   
17  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  ...  0.00   
18  0.47  0.08  0.25  0.19  1.56  0.50  0.08  0.31  0.05  0.25  ...  0.09   
19  0.55  0.00  0.15  0.12  1.21  0.57  0.00  0.25  0.00  0.15  ...  0.12   
20  0.49  0.00  0.12  0.09  1.26  0.49  0.00  0.25  0.00  0.12  ...  0.19   

In [40]:
# Hitung MAE
absolute_errors = []
for user_id in unique_user_ids:
    user_actual_ratings = df_profile_ratings.loc[df_profile_ratings['user_id'] == user_id, ['product_id', 'rating']]
    user_actual_ratings = user_actual_ratings.dropna(subset=['product_id', 'rating'])  # Hapus baris dengan data kosong
    user_actual_ratings = user_actual_ratings.set_index('product_id')['rating'].to_dict()

    user_predicted_ratings = recommendations_df.loc[user_id].to_dict()

    for product_id in user_actual_ratings.keys():
        if product_id in user_predicted_ratings:
            actual_rating = user_actual_ratings[product_id]
            predicted_rating = user_predicted_ratings[product_id]
            absolute_error = abs(actual_rating - predicted_rating)
            absolute_errors.append(absolute_error)

# Hitung MAE hanya jika ada nilai dalam absolute_errors
if absolute_errors:
    mae = np.mean(absolute_errors)
    print(f"Mean Absolute Error (MAE): {mae:.3f}")
else:
    print("Tidak ada data untuk menghitung MAE.")

Mean Absolute Error (MAE): 4.113


In [41]:
# Iterate over each user ID
for user_id in unique_user_ids:
    # Look up the user's name in the df_profile_ratings dataframe
    user_name = df_profile_ratings.loc[df_profile_ratings['user_id'] == user_id, 'name'].values[0]

    # Select the row for the specified user
    user_recs = recommendations_df.loc[user_id]

    # Sort the series in descending order and select the top 10 values
    top_recs = user_recs.sort_values(ascending=False).iloc[:10].index.tolist()

    # Print the recommended product IDs and names
    print("Top 10 recommended products for user_id = {} : ({})".format(user_id, user_name))
    print("Product IDs:")
    print(top_recs)
    print("\n")

Top 10 recommended products for user_id = 7 : (Dini Sipahutar)
Product IDs:
[82, 58, 135, 215, 20, 134, 315, 316, 168, 314]


Top 10 recommended products for user_id = 8 : (Gladys)
Product IDs:
[82, 58, 42, 309, 316, 215, 135, 140, 199, 84]


Top 10 recommended products for user_id = 11 : (Suandika Napitupulu)
Product IDs:
[316, 58, 82, 24, 46, 32, 120, 4, 13, 50]


Top 10 recommended products for user_id = 12 : (Elisa Tambunan)
Product IDs:
[82, 58, 215, 309, 135, 316, 84, 140, 42, 199]


Top 10 recommended products for user_id = 13 : (Emy Sonia Sinambela)
Product IDs:
[82, 58, 309, 42, 316, 140, 215, 135, 300, 24]


Top 10 recommended products for user_id = 14 : (Stefhani Kezia)
Product IDs:
[58, 82, 42, 199, 316, 309, 215, 135, 300, 84]


Top 10 recommended products for user_id = 15 : (Josep Phyto Napitupulu)
Product IDs:
[82, 58, 309, 42, 316, 140, 84, 199, 215, 300]


Top 10 recommended products for user_id = 16 : (Samuel Simanjuntak)
Product IDs:
[82, 58, 215, 68, 135, 311, 21, 2