In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tabulate import tabulate

In [2]:
# Load preprocessed data
preprocessed_product_path = "../data/preprosessing_data/preprocessed_products.csv"
products_df = pd.read_csv(preprocessed_product_path)

In [3]:
# Define columns to consider for content-based filtering
columns_to_consider = ['category_id', 'subcategory_id', 'skin_type']

In [4]:
# Combine selected columns into a single string for TF-IDF calculation
products_df['combined_features'] = products_df[columns_to_consider].apply(lambda x: ' '.join(x.astype(str)), axis=1)

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the combined features to TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(products_df['combined_features'])

In [5]:
from tabulate import tabulate

# Initialize variables to count TP and FP
TP = 0
FP = 0

# Define threshold for relevance
threshold = 0.5

# Loop through each product as query
for i in range(len(products_df)):
    # Define query vector
    query_vector = tfidf_matrix[i]

    # Calculate cosine similarity with all other products
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)

    # Sort the cosine similarities
    similar_indices = cosine_similarities.argsort()[0][-17:-1]  # Get indices of top 16 most similar items

    # Prepare data for recommendation table
    recommendation_table_data = []
    for idx in similar_indices:
        if cosine_similarities[0][idx] > 0:  # Ensuring similarity is greater than 0
            similarity_score = round(cosine_similarities[0][idx], 2)
            product_name = products_df['name'][idx]
            recommendation_table_data.append([similarity_score, product_name])
            if similarity_score >= threshold:
                TP += 1
            else:
                FP += 1

    # Sort recommendation table data based on similarity score
    recommendation_table_data = sorted(recommendation_table_data, key=lambda x: x[0], reverse=True)

    # Prepare data for TF-IDF weights table
    tfidf_table_data = []
    feature_names = tfidf_vectorizer.get_feature_names_out()
    feature_weights = tfidf_matrix[i].toarray()[0]
    for feature, weight in zip(feature_names, feature_weights):
        tfidf_table_data.append([feature, round(weight, 2)])

    # Print recommendations for the current product
    print("\nRecommendations for product:", products_df['name'][i])
    print(tabulate(recommendation_table_data, headers=["Similarity Score", "Product Name"][:16]))
    print("\nTF-IDF Weights:")
    print(tabulate(tfidf_table_data, headers=["Feature", "Weight"]))

# Calculate Precision
precision = (TP / (FP + TP)) * 100
print("\n" + 100 * "=")
print("Precision:", precision)


Recommendations for product: ['zinc', 'refreshing', 'cool', 'anti', 'ketombe']
  Similarity Score  Product Name
------------------  --------------------------------------------------------------------------------
              1     ['zinc', 'active', 'fresh', 'anti', 'ketombe']
              1     ['zinc', 'clean', 'active', 'anti', 'ketombe']
              1     ['wardah', 'anti', 'dandruff', 'shampoo', 'anti', 'ketombe']
              1     ['clear', 'ice', 'cool', 'menthol', 'anti', 'ketombe']
              1     ['serasoft', 'dandruff', 'treatment', 'anti', 'ketombe']
              1     ['selsun', 'yellow', 'double', 'impcat', 'rambut', 'ketombe']
              1     ['lifebuoy', 'shampoo', 'biru', 'anti', 'dandruff']
              1     ['clear', 'shampoo', 'cool', 'sport', 'menthol', 'men', 'anti', 'ketombe']
              1     ['head', 'shoulder', 'cool', 'menthol', 'anti', 'dandruff']
              1     ['sunsilk', 'lidah', 'buaya', 'tea', 'tree', 'anti', 'ketombe']
      