In [1]:
import pandas as pd
import ast
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load preprocessed data
preprocessed_profiles_rating_path = "../data/preprocessed_profiles_rating.csv"
df_profile_ratings = pd.read_csv(preprocessed_profiles_rating_path)

In [3]:
# convert product_id data type to int
df_profile_ratings['product_id'] = df_profile_ratings['product_id'].fillna(0).astype(int)

In [4]:
# convert skin type, hair issue, skin type body to numeric value (int)
def convert_skin_type_face(skin_type): 
    skin_type_dict = {'normal': 0, 'kering': 1, 'minyak': 2, 'sensitif': 3, 'kombinasi': 4}
    skin_type_list = ast.literal_eval(skin_type)
    return skin_type_dict.get(skin_type_list[0], 0)

def convert_hair_issue(hair_issue): 
    hair_issue_dict = {'normal': 1, 'ketombe': 1, 'kering': 2, 'minyak': 3, 'rontok': 4, 'cabang': 5}
    hair_issue_list = ast.literal_eval(hair_issue)
    return hair_issue_dict.get(hair_issue_list[0], 0)

def convert_skin_type_body(skin_type): 
    skin_type_dict = {'normal': 0, 'kering': 1, 'minyak': 2, 'kombinasi': 3} 
    skin_type_list = ast.literal_eval(skin_type)
    return skin_type_dict.get(skin_type_list[0], 0)

In [5]:
# functions to the respective columns in the data frame to converting the string values to numeric values based on the provided dictionaries.
df_profile_ratings["skin_type_face"] = df_profile_ratings["skin_type_face"].apply(convert_skin_type_face) 
df_profile_ratings["hair_issue"] = df_profile_ratings["hair_issue"].apply(convert_hair_issue) 
df_profile_ratings["skin_type_body"] = df_profile_ratings["skin_type_body"].apply(convert_skin_type_body)

In [6]:
# Get the minimum and maximum values for each column
column_min = df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]].min()
column_max = df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]].max()

# Perform min-max scaling
df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]] = (
    df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]] - column_min) / (column_max - column_min)

# Round the values to 3 decimal places
df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]] = df_profile_ratings[["skin_type_face", "hair_issue", "skin_type_body"]].round(2)

In [7]:
print(df_profile_ratings)

     id  user_id            name  product_id  rating     gender    age  \
0     6        7  Dini Sipahutar         215     5.0  Perempuan  18-25   
1     6        7  Dini Sipahutar          22     5.0  Perempuan  18-25   
2     6        7  Dini Sipahutar         311     5.0  Perempuan  18-25   
3     6        7  Dini Sipahutar         286     5.0  Perempuan  18-25   
4     6        7  Dini Sipahutar          82     4.0  Perempuan  18-25   
..   ..      ...             ...         ...     ...        ...    ...   
427  64       67          Wilona         294     5.0  Perempuan  18-38   
428  64       67          Wilona         315     4.0  Perempuan  18-39   
429  64       67          Wilona         166     4.0  Perempuan  18-40   
430  64       67          Wilona         168     5.0  Perempuan  18-41   
431  64       67          Wilona         262     4.0  Perempuan  18-42   

     skin_type_face  hair_issue  skin_type_body             allergy_history  \
0              0.00        0.50 

In [8]:
# Convert user_id to numeric before grouping
df_profile_ratings['user_id'] = df_profile_ratings['user_id'].astype(int)

# Calculate the mean vector for each user
user_vectors = df_profile_ratings.groupby('user_id')[['skin_type_face', 'hair_issue', 'skin_type_body']].mean().round(2)

# Reset the index
user_vectors.reset_index(inplace=True)

print(user_vectors)

    user_id  skin_type_face  hair_issue  skin_type_body
0         7            0.00        0.50            0.00
1         8            0.75        0.25            0.33
2        11            0.00        0.00            0.67
3        12            1.00        0.75            0.00
4        13            0.50        0.00            0.00
5        14            0.50        0.00            0.00
6        15            0.50        0.25            0.00
7        16            0.00        0.25            0.00
8        17            0.00        0.00            0.00
9        18            0.50        0.25            0.00
10       19            0.50        0.00            0.00
11       20            0.50        0.00            0.33
12       21            1.00        0.75            0.00
13       22            0.00        0.75            0.00
14       23            0.75        0.25            0.00
15       24            0.50        0.00            0.33
16       25            0.50        0.25         

In [9]:
# Ubah tipe data user_id ke integer (jika belum integer)
user_vectors['user_id'] = user_vectors['user_id'].astype(int)

# Filter hanya data user_id yang ada dalam file CSV
user_vectors = user_vectors[user_vectors['user_id'].isin(df_profile_ratings['user_id'].unique())]

# Simpan user_id sebelum dihapus dari user_vectors
user_ids = user_vectors['user_id']

# Hapus kolom user_id sebelum menghitung similaritas
user_vectors = user_vectors.drop('user_id', axis=1)

# Hitung similaritas kosinus antara semua pasangan pengguna
user_similarities = cosine_similarity(user_vectors)

# Konversi matriks similaritas ke dataframe pandas
user_similarities = pd.DataFrame(user_similarities, index=user_ids, columns=user_ids).round(2)

print(user_similarities)

user_id    7     8     11    12    13    14    15    16   17    18  ...    58  \
user_id                                                             ...         
7        1.00  0.29  0.00  0.60  0.00  0.00  0.45  1.00  0.0  0.45  ...  0.83   
8        0.29  1.00  0.39  0.88  0.88  0.88  0.91  0.29  0.0  0.91  ...  0.73   
11       0.00  0.39  1.00  0.00  0.00  0.00  0.00  0.00  0.0  0.00  ...  0.00   
12       0.60  0.88  0.00  1.00  0.80  0.80  0.98  0.60  0.0  0.98  ...  0.94   
13       0.00  0.88  0.00  0.80  1.00  1.00  0.89  0.00  0.0  0.89  ...  0.55   
14       0.00  0.88  0.00  0.80  1.00  1.00  0.89  0.00  0.0  0.89  ...  0.55   
15       0.45  0.91  0.00  0.98  0.89  0.89  1.00  0.45  0.0  1.00  ...  0.87   
16       1.00  0.29  0.00  0.60  0.00  0.00  0.45  1.00  0.0  0.45  ...  0.83   
17       0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.0  0.00  ...  0.00   
18       0.45  0.91  0.00  0.98  0.89  0.89  1.00  0.45  0.0  1.00  ...  0.87   
19       0.00  0.88  0.00  0

In [10]:
# Get all unique items
items = df_profile_ratings['product_id'].unique()

# Get all unique user IDs from the preprocessed data
unique_user_ids = df_profile_ratings['user_id'].unique()
print(items)

[215  22 311 286  82 140 112  63  51  21  18   5   8  68  92  85 216 307
 280 276 236  58  24 300   7 315 294 134  42 123 167 168 188 199  84 283
  16 309  47  20 263 219  61  78  93  62  39  43 282 316 165 169 261  35
 135 191   6 295 122 314  46 312 306  59  36 138 298 166 310  56  23 161
 207 252  88  32 142  40 179  95 141 163 170 120 208  94 302 228 193 147
 171  69  79  87 114 274 260   4 160   9   0 304 234 218 285 299  14  15
  37  25  81  83 121 156 159  66  60 164  57 211  13 305  50 116 149  33
  12 175  44 130 273 217  30  48 313 259 137  73 232  45  65  75  34 187
  67 265 255  28 233 214  64  74  96 278  11 267 258 195 229 292 254 251
 227 178 139  99  98  97  71  55  26 146 262]


In [11]:
# Initialize a dictionary to store the recommendation values for each user
recommendations = {}

# Iterate over each user ID from the preprocessed data
for user_id in unique_user_ids:
    # Calculate the sum of similarities for the current user
    similarity_sum = user_similarities.loc[user_id].sum()
    predictions = {}

    if similarity_sum > 0:  # Check if similarity_sum is greater than 0 to avoid division by zero
        for item in items:
            # Filter ratings for the item from other users
            other_user_ratings = df_profile_ratings[df_profile_ratings['product_id'] == item]

            # Calculate the sum of ratings weighted by similarity values
            rating_sum = 0
            for other_user_id in other_user_ratings['user_id']:
                if other_user_id != user_id:
                    rating = other_user_ratings[other_user_ratings['user_id'] == other_user_id]['rating'].values[0]
                    similarity = user_similarities.loc[user_id, other_user_id]
                    rating_sum += rating * similarity

            # Calculate the predicted rating for the item
            predictions[item] = rating_sum / similarity_sum

    # Store the predictions for the current user
    recommendations[user_id] = predictions

# Convert the recommendations dictionary to a pandas dataframe
recommendations_df = pd.DataFrame(recommendations)
recommendations_df = recommendations_df.T
recommendations_df = recommendations_df.round(2)

# Fill NaN values and 0 values with 0
recommendations_df = recommendations_df.fillna(0).replace(0, 0)

print(recommendations_df)

     215   22    311   286   82    140   112   63    51    21   ...   178  \
7   0.70  0.00  0.83  0.17  1.11  0.21  0.00  0.19  0.12  0.32  ...  0.17   
8   0.52  0.04  0.46  0.19  1.26  0.52  0.04  0.21  0.05  0.25  ...  0.04   
11  0.16  0.00  0.43  0.00  0.96  0.28  0.00  0.17  0.09  0.00  ...  0.13   
12  0.64  0.08  0.58  0.25  1.20  0.52  0.08  0.24  0.09  0.33  ...  0.06   
13  0.49  0.00  0.33  0.18  1.31  0.59  0.00  0.19  0.00  0.22  ...  0.00   
14  0.49  0.00  0.33  0.18  1.15  0.46  0.00  0.19  0.00  0.22  ...  0.00   
15  0.48  0.06  0.52  0.24  1.30  0.54  0.06  0.23  0.07  0.31  ...  0.04   
16  0.93  0.22  1.05  0.39  1.11  0.39  0.22  0.32  0.26  0.54  ...  0.17   
17  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  ...  0.00   
18  0.48  0.06  0.52  0.24  1.30  0.54  0.06  0.23  0.07  0.31  ...  0.04   
19  0.49  0.00  0.33  0.18  1.15  0.59  0.00  0.19  0.00  0.22  ...  0.00   
20  0.42  0.00  0.35  0.14  1.23  0.52  0.00  0.19  0.02  0.18  ...  0.03   

In [12]:
# Hitung MAE
absolute_errors = []
for user_id in unique_user_ids:
    user_actual_ratings = df_profile_ratings.loc[df_profile_ratings['user_id'] == user_id, ['product_id', 'rating']]
    user_actual_ratings = user_actual_ratings.dropna(subset=['product_id', 'rating'])  # Hapus baris dengan data kosong
    user_actual_ratings = user_actual_ratings.set_index('product_id')['rating'].to_dict()

    user_predicted_ratings = recommendations_df.loc[user_id].to_dict()

    for product_id in user_actual_ratings.keys():
        if product_id in user_predicted_ratings:
            actual_rating = user_actual_ratings[product_id]
            predicted_rating = user_predicted_ratings[product_id]
            absolute_error = abs(actual_rating - predicted_rating)
            absolute_errors.append(absolute_error)

# Hitung MAE hanya jika ada nilai dalam absolute_errors
if absolute_errors:
    mae = np.mean(absolute_errors)
    print(f"Mean Absolute Error (MAE): {mae:.3f}")
else:
    print("Tidak ada data untuk menghitung MAE.")

Mean Absolute Error (MAE): 4.180


In [13]:
# Iterate over each user ID
for user_id in unique_user_ids:
    # Look up the user's name in the df_profile_ratings dataframe
    user_name = df_profile_ratings.loc[df_profile_ratings['user_id'] == user_id, 'name'].values[0]

    # Select the row for the specified user
    user_recs = recommendations_df.loc[user_id]

    # Sort the series in descending order and select the top 10 values
    top_recs = user_recs.sort_values(ascending=False).iloc[:10].index.tolist()

    # Print the recommended product IDs and names
    print("Top 10 recommended products for user_id = {} : ({})".format(user_id, user_name))
    print("Product IDs:")
    print(top_recs)
    print("\n")

Top 10 recommended products for user_id = 7 : (Dini Sipahutar)
Product IDs:
[82, 58, 311, 42, 215, 135, 316, 168, 47, 315]


Top 10 recommended products for user_id = 8 : (Gladys)
Product IDs:
[82, 58, 316, 42, 62, 168, 68, 315, 140, 135]


Top 10 recommended products for user_id = 11 : (Suandika Napitupulu)
Product IDs:
[58, 316, 82, 62, 168, 47, 46, 87, 274, 122]


Top 10 recommended products for user_id = 12 : (Elisa Tambunan)
Product IDs:
[82, 58, 316, 215, 68, 42, 311, 135, 140, 62]


Top 10 recommended products for user_id = 13 : (Emy Sonia Sinambela)
Product IDs:
[82, 58, 316, 42, 62, 309, 140, 78, 168, 68]


Top 10 recommended products for user_id = 14 : (Stefhani Kezia)
Product IDs:
[58, 82, 316, 42, 62, 168, 78, 68, 141, 199]


Top 10 recommended products for user_id = 15 : (Josep Phyto Napitupulu)
Product IDs:
[82, 58, 316, 42, 168, 140, 309, 311, 315, 215]


Top 10 recommended products for user_id = 16 : (Samuel Simanjuntak)
Product IDs:
[82, 311, 58, 215, 42, 68, 135, 316,