In [1]:
import html

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [8]:
# Toy frame: every item is tagged with a category label
data = {
    'Category': ['A', 'A', 'B', 'B', 'A'],
    'Item': ['Apple', 'Banana', 'Carrot', 'Date', 'Grape'],
}
df = pd.DataFrame(data)

# Collapse each category's items into one comma-separated string
items_by_category = df.groupby('Category')['Item']
result = items_by_category.agg(', '.join)

print(result)

Category
A    Apple, Banana, Grape
B            Carrot, Date
Name: Item, dtype: object


In [29]:
# Read at most 10,000 JSON-lines records from each input file
products_df = pd.read_json(
    'meta_Sports_and_Outdoors.jsonl',
    lines=True,
    nrows=10000,
)
reviews_df = pd.read_json(
    'Sports_and_Outdoors.jsonl',
    lines=True,
    nrows=10000,
)


In [9]:
print(products_df.columns)
print(reviews_df.columns)

Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'parent_asin', 'bought_together'],
      dtype='object')
Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase'],
      dtype='object')


In [30]:
# Join the product metadata with the reviews on their shared 'parent_asin' key
merged_df = pd.merge(
    products_df,
    reviews_df,
    on='parent_asin',
)

# Drop the trailing 'category_id' column (currently disabled)
#merged_df.drop(columns=['category_id'], inplace=True)

In [34]:
print(merged_df.columns)


Index(['main_category', 'title_x', 'average_rating', 'rating_number',
       'features', 'description', 'price', 'images_x', 'videos', 'store',
       'categories', 'details', 'parent_asin', 'bought_together', 'subtitle',
       'author', 'rating', 'title_y', 'text', 'images_y', 'asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase'],
      dtype='object')


In [32]:
print(merged_df.head())
print(len(merged_df))


    main_category                                            title_x  \
0  AMAZON FASHION  FOCO NFL Resin 11.5" Team Logo Outdoor Garden ...   
1     Amazon Home  Copco Freezeable Double Wall Insulated Tritan ...   
2  AMAZON FASHION       FOCO Womens NFL Team Color Faux Fur Moccasin   
3  AMAZON FASHION  90 Degree By Reflex High Waist Fleece Lined Le...   
4  AMAZON FASHION  90 Degree By Reflex High Waist Fleece Lined Le...   

   average_rating  rating_number  \
0             4.7            137   
1             4.1            428   
2             4.4           1236   
3             4.4           7489   
4             4.4           7489   

                                            features  \
0  [11.5-Inch tall, Hand painted, Officially lice...   
1  [GENEROUS CAPACITY: holds 18-fluid ounces of l...   
2  [Gripped outsole, Embroidered logo, Material -...   
3  [Pull On closure, Pocket Option. These brushed...   
4  [Pull On closure, Pocket Option. These brushed...   

             

In [36]:
# Group the merged product/review rows by their 'parent_asin' value
grouped_df = merged_df.groupby('parent_asin')

In [None]:
# Print the first two rows of every 'parent_asin' group
for parent_asin, frame in grouped_df:
    heading = f"First 2 entries for {parent_asin!r}"
    print(heading)
    print("------------------------")
    preview = frame.head(2)
    print(preview, end="\n\n")

In [64]:
# import json
# from json import JSONEncoder
# import numpy
# class NumpyArrayEncoder(JSONEncoder):
#     def default(self, obj):
#         if isinstance(obj, numpy.ndarray):
#             return obj.tolist()
#         return JSONEncoder.default(self, obj)
        
# Per group: print the group key, the first row's id and title, then the
# (user_id, rating) pairs of the group as a JSON array of records
for parent_asin, frame in grouped_df:
    asin_pos = frame.columns.get_loc('parent_asin')
    title_pos = frame.columns.get_loc('title_x')
    print(parent_asin, frame.iloc[0, asin_pos], frame.iloc[0, title_pos])

    reviews_json = frame[['user_id', 'rating']].to_json(orient='records')
    print(reviews_json)

B000069K78 B000069K78 Polar RC3 GPS Sports Watch
[{"user_id":"AEBYEWLSJLC7OEDDSRZEQWY5WAZA","rating":5}]
B000HZGKVG B000HZGKVG Moultrie Universal Digital Timer Feeder, Pack of 1
[{"user_id":"AEK7VZMQTZU7BFZQQDYXNNJ2NMUA","rating":4}]
B00142MD62 B00142MD62 SRAM Supercork Bicycle Bar Tape
[{"user_id":"AGCI7FAH4GL5FI65HYLKWTMFZ2CQ","rating":5}]
B001Q3LTXC B001Q3LTXC Aquasphere Alpha Unisex Adult Swimming Fins - Lightweight Foam Unparalleled Comfort, Improves Posture, Strength, Technique, & Flexibility in Traininng & Competition
[{"user_id":"AGDLD6R6Z6WB5KNG6Z2L6LNRMLZQ","rating":4}]
B0029VOR0M B0029VOR0M Champion Sports Exercise Medicine Balls, 8 Sizes, Leather with No-Slip Grip - Weighted Med Ball Set for Weight Training, Stability, Plyometrics, Cross Training, Core Strength - Heavy Workout Ball
[{"user_id":"AEU47A5R7NFWXCQLJMZA34AM63EA","rating":5}]
B002BK4YJ0 B002BK4YJ0 Hunter Manufacturers Philadelphia Phillies Dog Collar & Leash Set
[{"user_id":"AEVMPQORMKIMWMBQJRKJJVXI3FIA","rating"

In [56]:
import json
import yaml
from elasticsearch import Elasticsearch, helpers
from sentence_transformers import SentenceTransformer

In [80]:
def get_client_es():
    """
    Build an Elasticsearch client from the settings stored in config.yml.

    The YAML file is parsed with yaml.safe_load; its "cloud_url" entry is
    passed as the first constructor argument and its "api_key" entry as the
    api_key keyword.
    """
    with open("config.yml", "r") as config_file:
        settings = yaml.safe_load(config_file)

    endpoint = settings["cloud_url"]
    credentials = settings["api_key"]
    return Elasticsearch(endpoint, api_key=credentials)

In [96]:
def generate_bulk_actions(index_name, grouped_df):
    """
    Yield one Elasticsearch bulk action per group of grouped_df.

    Args:
        index_name: Name of the target index, copied into each action's "_index".
        grouped_df: Iterable of (parent_asin, frame) pairs, e.g. the result of
            DataFrame.groupby('parent_asin'). Every frame must provide the
            columns 'main_category', 'title_x', 'user_id' and 'rating'.

    Yields:
        dict: {"_index": index_name, "_id": parent_asin, "_source": <JSON string>}.
        The JSON document holds 'main_category' and 'title_x' taken from the
        first row of the group, plus 'reviews': an array with one
        {"user_id": ..., "rating": ...} object per row of the group.

    No description embeddings are generated here.
    """
    for parent_asin, frame in grouped_df:
        first_row = frame.iloc[0][['main_category', 'title_x']]

        # Go through pandas' JSON writer so NaN and numpy scalars turn into
        # plain JSON values before the document is assembled.
        document = json.loads(first_row.to_json())

        # Attach the reviews as a real list. Storing the already-serialized
        # string here would get it JSON-encoded a second time, and the index
        # would receive one escaped string instead of an array of objects.
        document['reviews'] = json.loads(
            frame[['user_id', 'rating']].to_json(orient='records')
        )

        yield {
            "_index": index_name,
            "_id": parent_asin,
            # Kept as a compact JSON string, as before.
            "_source": json.dumps(document, separators=(',', ':')),
        }

In [97]:
df = pd.DataFrame(list(generate_bulk_actions("foo", grouped_df)))
print(df)

    _index         _id                                            _source
0      foo  B000069K78  {"main_category":"Sports & Outdoors","title_x"...
1      foo  B000HZGKVG  {"main_category":"Sports & Outdoors","title_x"...
2      foo  B00142MD62  {"main_category":"Sports & Outdoors","title_x"...
3      foo  B001Q3LTXC  {"main_category":"Sports & Outdoors","title_x"...
4      foo  B0029VOR0M  {"main_category":"Sports & Outdoors","title_x"...
..     ...         ...                                                ...
238    foo  B0C1HN2MP3  {"main_category":"Sports & Outdoors","title_x"...
239    foo  B0C51VB7X3  {"main_category":"Sports & Outdoors","title_x"...
240    foo  B0C5MD619H  {"main_category":"AMAZON FASHION","title_x":"X...
241    foo  B0C5NR1ZB6  {"main_category":"Sports & Outdoors","title_x"...
242    foo  B0C655KXW4  {"main_category":"Sports & Outdoors","title_x"...

[243 rows x 3 columns]


In [73]:
def index_grouped_df(grouped_df, index_name):
    """
    Bulk-index one document per group of grouped_df into Elasticsearch.

    Args:
        grouped_df: (parent_asin, frame) groups, forwarded to
            generate_bulk_actions.
        index_name: Name of the index the documents are written to.

    Prints how many documents were indexed and how many errors were reported.
    """
    actions = generate_bulk_actions(index_name, grouped_df)
    # helpers.bulk returns (success_count, errors). With the default
    # stats_only=False the second item is a list of error details, so report
    # its length instead of printing the raw list.
    success, errors = helpers.bulk(get_client_es(), actions)
    print(f"Batch indexed: {success} successful, {len(errors)} failed")

In [98]:
index_grouped_df(grouped_df, "gear_products")

Batch indexed: 243 successful, [] failed


In [35]:
# Join the 'user_id' values of each 'parent_asin' group into one comma-separated string
user_ids_by_product = merged_df.groupby('parent_asin')['user_id']
denorm_df = user_ids_by_product.agg(', '.join)
print(denorm_df)

parent_asin
B000069K78                         AEBYEWLSJLC7OEDDSRZEQWY5WAZA
B000HZGKVG                         AEK7VZMQTZU7BFZQQDYXNNJ2NMUA
B00142MD62                         AGCI7FAH4GL5FI65HYLKWTMFZ2CQ
B001Q3LTXC                         AGDLD6R6Z6WB5KNG6Z2L6LNRMLZQ
B0029VOR0M                         AEU47A5R7NFWXCQLJMZA34AM63EA
                                    ...                        
B0C1HN2MP3    AFAIJYOUO3NAWLBDIKTQSC3DASWA, AFPTVMYVXZNH3U4F...
B0C51VB7X3    AFCV25KA7XSAJGTZRDML5B7UYOVQ, AFRFWDOOO3YXIEPZ...
B0C5MD619H                         AFJBKPK5W56XWSNPQU2WW66ISWYQ
B0C5NR1ZB6    AHPFHP43AXWRYZZ4HPNCW7I7J3ZQ, AEUAL3EJKUSTNB4Y...
B0C655KXW4                         AGZMKHWSCB3UXDGFUPFRZSL4EAWQ
Name: user_id, Length: 243, dtype: object


In [6]:
# Drop the rows whose 'title' value is NaN
# NOTE(review): the merged_df built from products_df/reviews_df earlier in this
# notebook shows 'title_x' and 'title_y' columns but no 'title'; this cell
# appears to expect a differently-shaped merged_df -- confirm which frame it
# is meant to run on.
merged_df.dropna(subset=['title'], inplace=True)

In [None]:
unique_values = merged_df['category_name'].unique().tolist()
print(unique_values)

In [15]:
# Category names to keep when filtering on 'category_name'
cats_to_filter = [
    "Boys' Clothing",
    "Boys' Shoes",
    "Boys' Watches",
    "Girls' Clothing",
    "Girls' Shoes",
    "Girls' Watches",
    "Men's Shoes",
    "Women's Shoes",
    "Backpacks",
    "Electrical Equipment",
    "Electronic Components",
    "Gift Cards",
    "Headphones & Earbuds",
    "Luggage",
    "Outdoor Recreation",
    "Portable Audio & Video",
    "Safety & Security",
    "Sports & Fitness",
    "Sports & Outdoor Play Toys",
    "Sports & Outdoors",
    "Sports Nutrition Products",
    "Travel Accessories",
    "Travel Duffel Bags",
    "Travel Tote Bags",
    "Vehicle Electronics",
    "Wearable Technology",
    "Wellness & Relaxation Products",
]
print(cats_to_filter[0])

Boys' Clothing


In [16]:
# Keep only the rows whose 'category_name' is one of the selected categories
in_selected_cats = merged_df['category_name'].isin(cats_to_filter)
filtered_df = merged_df.loc[in_selected_cats]
print(filtered_df.head())

             asin                                              title  \
21427  B01NCNMEV6  American Heritage 8” Moc Toe Work Boots For Me...   
21428  B07CZ221PW             Men's Speedcross 5 Trail Running Shoes   
21429  B09VCJ7PGR           Women's Terrex AX4 Sneaker - Hiking Shoe   
21430  B01BPL5UAM                                           mens Hvc   
21431  B091ZGPRCW            Unisex-Adult Adilette Clog Slide Sandal   

                                                  imgUrl  \
21427  https://m.media-amazon.com/images/I/71fCpf+5k0...   
21428  https://m.media-amazon.com/images/I/71nJ48O6aF...   
21429  https://m.media-amazon.com/images/I/81IusNGNYB...   
21430  https://m.media-amazon.com/images/I/51R8w-Wqxj...   
21431  https://m.media-amazon.com/images/I/61DouGlK6m...   

                                 productURL  stars  reviews   price  \
21427  https://www.amazon.com/dp/B01NCNMEV6    4.4        0  253.85   
21428  https://www.amazon.com/dp/B07CZ221PW    4.7        0  139

In [7]:
# Select the column needed for the model: the product titles
product_titles = merged_df['title']

In [8]:
# Step 1: Prepare the data with TF-IDF
# English stop words are dropped; the vectorizer is fitted on the product titles.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(product_titles)


In [9]:
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (1426336, 428801)


In [10]:
# Step 2: Fit the NearestNeighbors model
# Brute-force search using the cosine metric over the TF-IDF matrix.
nn_model = NearestNeighbors(metric='cosine', algorithm='brute')
nn_model.fit(tfidf_matrix)

In [11]:
# Step 3: Product recommendation function
def recommend_products(product_name, num_recommendations=5):
    """
    Return the products whose titles are closest to ``product_name``.

    Args:
        product_name: Free-text product title used as the query.
        num_recommendations: Number of neighbours to request (default 5).

    Returns:
        DataFrame with the columns 'title', 'stars', 'price', 'category_name'
        and 'distance' (the distance reported by nn_model), one row per
        neighbour.
    """
    # Project the query title into the fitted TF-IDF space
    name_vec = vectorizer.transform([product_name])

    # kneighbors returns one row per query; there is a single query, so use row 0
    neighbor_dist, neighbor_pos = nn_model.kneighbors(
        name_vec, n_neighbors=num_recommendations
    )

    # Positional lookup -- assumes row i of the matrix nn_model was fitted on
    # corresponds to row i of merged_df; confirm merged_df is unchanged since the fit
    hits = merged_df.iloc[neighbor_pos[0]].assign(distance=neighbor_dist[0])

    output_columns = ['title', 'stars', 'price', 'category_name', 'distance']
    return hits[output_columns]


In [12]:
# Try out the product recommendation function
product_name = "Xpedition 30 Inch Multi-Pocket Upright Rolling Duffel Bag "
recommended_products = recommend_products(product_name, num_recommendations=5)
print(recommended_products)

                                                    title  stars  price  \
373016  Xpedition 30 Inch Multi-Pocket Upright Rolling...    4.3  31.84   
14      Xpedition 30 Inch Multi-Pocket Upright Rolling...    4.3  42.00   
373208        8 Pocket Rolling Duffel Bag, Black, 22 inch    4.5  64.09   
373299              Rolling Duffel Bag, Charcoal, 30-Inch    4.0  35.49   
372997                                 Rolling Duffel Bag    4.2  20.49   

             category_name  distance  
373016  Travel Duffel Bags  0.000000  
14               Suitcases  0.000000  
373208  Travel Duffel Bags  0.441438  
373299  Travel Duffel Bags  0.450364  
372997  Travel Duffel Bags  0.452113  


In [13]:
# Step 3: Recommend products by ID, including the image URL
# NOTE: a later cell defines another function with this same name; once that
# cell has run, it replaces this definition.
def recommend_products_by_id(product_id, num_recommendations=5):
    """
    Return the products most similar to the product with the given ID.

    Args:
        product_id: Value to match against merged_df['asin'].
        num_recommendations: Number of neighbours to request (default 5).

    Returns:
        DataFrame with the columns 'title', 'stars', 'price', 'category_name',
        'imgUrl' and 'distance', one row per neighbour.

    Raises:
        IndexError: If no row of merged_df matches product_id.
    """
    # Resolve the ID to a title; the first matching row wins
    id_mask = merged_df['asin'] == product_id
    product_name = merged_df.loc[id_mask, 'title'].values[0]

    # Project the title into the fitted TF-IDF space
    name_vec = vectorizer.transform([product_name])

    # Single query, so only row 0 of each returned array is used
    neighbor_dist, neighbor_pos = nn_model.kneighbors(
        name_vec, n_neighbors=num_recommendations
    )

    # Fetch the neighbours by position and attach their distances
    hits = merged_df.iloc[neighbor_pos[0]].assign(distance=neighbor_dist[0])

    output_columns = ['title', 'stars', 'price', 'category_name', 'imgUrl', 'distance']
    return hits[output_columns]



In [14]:
from IPython.display import display, HTML

def recommend_products_by_id(product_id, num_recommendations=5):
    """
    Display the products most similar to the product with the given ID.

    The product's title is looked up in merged_df through the 'asin' column,
    turned into a TF-IDF query for nn_model, and every neighbour is rendered
    as one HTML card (title, stars, price, category, image, distance).

    Args:
        product_id: Value to match against merged_df['asin'].
        num_recommendations: Number of neighbours to display (default 5).

    Raises:
        IndexError: If no row of merged_df matches product_id.
    """
    def _esc(value):
        # Values come straight from the dataset; escape them before they are
        # embedded in markup so quotes or angle brackets cannot break the card.
        return html.escape(str(value))

    # Resolve the ID to a title; fail with a readable message when it is unknown
    matching_titles = merged_df.loc[merged_df['asin'] == product_id, 'title'].values
    if len(matching_titles) == 0:
        raise IndexError(f"No product with asin {product_id!r} found in merged_df")
    product_name = matching_titles[0]

    # Project the title into the fitted TF-IDF space
    query_vector = vectorizer.transform([product_name])

    # Look up the nearest neighbours of the query
    distances, indices = nn_model.kneighbors(query_vector, n_neighbors=num_recommendations)

    # Fetch the recommended rows by position and attach their distances
    recommendations = merged_df.iloc[indices[0]].copy()
    recommendations['distance'] = distances[0]

    # Render each product as ONE HTML fragment. Every display() call produces
    # its own output element, so emitting "<div>" and "</div>" in separate
    # calls leaves the bordered box empty instead of wrapping the content.
    for _, row in recommendations.iterrows():
        title = _esc(row['title'])
        card = (
            "<div style='border: 1px solid #ccc; border-radius: 5px; padding: 10px; margin-bottom: 20px;'>"
            f"<h3>{title}</h3>"
            f"<p><b>Stars:</b> {_esc(row['stars'])}</p>"
            f"<p><b>Price:</b> ${_esc(row['price'])}</p>"
            f"<p><b>Category:</b> {_esc(row['category_name'])}</p>"
            f'<img src="{_esc(row["imgUrl"])}" alt="{title}" style="width:200px;height:200px; margin-top: 10px;">'
            f"<p style='margin-top: 10px;'><b>Distance:</b> {_esc(row['distance'])}</p>"
            "</div>"
        )
        display(HTML(card))


In [15]:

# Try out the ID-based recommendation function
product_id = "B01DJLKZBA"  # Replace with a specific product ID
recommend_products_by_id(product_id, num_recommendations=5)


In [16]:
from IPython.display import display, HTML

def recommend_products_by_name(product_name, num_recommendations=5):
    """
    Display the products whose titles are closest to ``product_name``.

    The name is turned into a TF-IDF query for nn_model and every neighbour
    is rendered as one HTML card (title, stars, price, category, image,
    distance).

    Args:
        product_name: Free-text product title used as the query.
        num_recommendations: Number of neighbours to display (default 5).
    """
    def _esc(value):
        # Values come straight from the dataset; escape them before they are
        # embedded in markup so quotes or angle brackets cannot break the card.
        return html.escape(str(value))

    # Project the name into the fitted TF-IDF space
    query_vector = vectorizer.transform([product_name])

    # Look up the nearest neighbours of the query
    distances, indices = nn_model.kneighbors(query_vector, n_neighbors=num_recommendations)

    # Fetch the recommended rows by position and attach their distances
    recommendations = merged_df.iloc[indices[0]].copy()
    recommendations['distance'] = distances[0]

    # Render each product as ONE HTML fragment. Every display() call produces
    # its own output element, so emitting "<div>" and "</div>" in separate
    # calls leaves the bordered box empty instead of wrapping the content.
    for _, row in recommendations.iterrows():
        title = _esc(row['title'])
        card = (
            "<div style='border: 1px solid #ccc; border-radius: 5px; padding: 10px; margin-bottom: 20px;'>"
            f"<h3>{title}</h3>"
            f"<p><b>Stars:</b> {_esc(row['stars'])}</p>"
            f"<p><b>Price:</b> ${_esc(row['price'])}</p>"
            f"<p><b>Category:</b> {_esc(row['category_name'])}</p>"
            f'<img src="{_esc(row["imgUrl"])}" alt="{title}" style="width:200px;height:200px; margin-top: 10px;">'
            f"<p style='margin-top: 10px;'><b>Distance:</b> {_esc(row['distance'])}</p>"
            "</div>"
        )
        display(HTML(card))



In [17]:
# Call the function with a specific product name
product_name = "Xpedition 30 Inch Multi-Pocket Upright Rolling Duffel Bag"  # Replace with a specific product name
recommend_products_by_name(product_name, num_recommendations=5)
