In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import kagglehub
import os
from sklearn.metrics import average_precision_score

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    df = df.sample(frac=0.01, random_state=42).reset_index(drop=True)
    df = df[['name', 'main_category', 'sub_category', 'ratings']].fillna('')
    df['text_features'] = df['name'] + ' ' + df['main_category'] + ' ' + df['sub_category']
    return df

In [4]:
def train_recommender(df):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['text_features'])
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    return vectorizer, similarity_matrix

In [5]:
def recommend(product_name, df, vectorizer, similarity_matrix, top_n=5):
    if product_name not in df['name'].values:
        return []
    idx = df[df['name'] == product_name].index[0]
    scores = list(enumerate(similarity_matrix[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    recommendations = [df.iloc[i[0]]['name'] for i in scores]
    return recommendations

In [6]:
def evaluate_recommendations(df, vectorizer, similarity_matrix):
    y_true_list = []
    y_scores_list = []
    sample_size = min(100, len(df))
    sampled_products = df['name'].sample(sample_size)

    for product in sampled_products:
        recommendations = recommend(product, df, vectorizer, similarity_matrix, top_n=5)
        relevant_items = df[df['main_category'] == df[df['name'] == product]['main_category'].values[0]]['name'].tolist()
        y_true = [1 if rec in relevant_items else 0 for rec in recommendations]
        y_scores = list(range(len(recommendations), 0, -1))

        if sum(y_true) > 0:
            y_true_list.append(y_true)
            y_scores_list.append(y_scores)

    map_score = np.mean([average_precision_score(y_true, y_score) for y_true, y_score in zip(y_true_list, y_scores_list)])
    return map_score

In [7]:
if __name__ == "__main__":
    path = kagglehub.dataset_download("lokeshparab/amazon-products-dataset")
    dataset_path = "/root/.cache/kagglehub/datasets/lokeshparab/amazon-products-dataset/versions/2"
    file_path = os.path.join(path, 'Amazon-Products.csv')
    df = load_data(file_path)

    vectorizer, similarity_matrix = train_recommender(df)

    # Evaluación del modelo con Mean Average Precision (MAP)
    map_score = evaluate_recommendations(df, vectorizer, similarity_matrix)
    print(f"Mean Average Precision (MAP): {map_score}")

    # Ejemplo de recomendación
    product_name = df['name'].iloc[0]  # Primer producto del dataset
    recommendations = recommend(product_name, df, vectorizer, similarity_matrix)
    print(f"Recomendaciones para {product_name}: {recommendations}")

Mean Average Precision (MAP): 0.9496527777777777
Recomendaciones para MyBlush Women Top: ['ONLY Women Dress', 'Puma Women T-Shirt', 'TAGAS Women Western Top', "GO COLORS Women's Regular Bottom", 'Max Women Dress']


In [8]:
# Guardar dataset en un csv
df.to_csv('amazon_products.csv', index=False)