In [2]:
import os
import json
import ast
import pandas as pd
from typing import Dict, List, Tuple

from utils.dataset import parse_args, seed_everything, amazon_dataset2fullname
from utils.text import clean_metadata

In [None]:
def _flatten_categories(categories_value) -> str:
    if isinstance(categories_value, str):
        return categories_value
    if isinstance(categories_value, list):
        parts: List[str] = []
        for item in categories_value:
            if isinstance(item, str):
                if item.strip():
                    parts.append(item.strip())
            elif isinstance(item, list):
                parts.extend([str(x).strip() for x in item if str(x).strip()])
            else:
                s = str(item).strip()
                if s:
                    parts.append(s)
        return ', '.join(parts)
    if isinstance(categories_value, dict):
        parts = [str(v).strip() for v in categories_value.values() if str(v).strip()]
        return ', '.join(parts)
    return ''


def _build_text(obj: dict, dataset_full_name: str) -> str:
    """Render an item's metadata as a single attribute sentence.

    Args:
        obj: Mapping that may contain 'title', 'price', 'brand', 'categories'
            and 'description'. Missing keys are treated as empty.
        dataset_full_name: Name interpolated into the sentence prefix.

    Returns:
        The prefix sentence followed by ``"<name> is <value>"`` segments
        joined by ``'; '``, in the order title, price, brand, categories,
        description. Empty attributes are omitted. Returns ``''`` when every
        attribute is empty.
    """

    def _as_text(value) -> str:
        # NaN is truthy, so ``value or ''`` alone would let a missing value
        # through as the literal text 'nan'. ``value != value`` is only true
        # for NaN.
        if isinstance(value, float) and value != value:
            return ''
        return str(value or '').strip()

    attributes = (
        ('title', _as_text(obj.get('title', ''))),
        ('price', _as_text(obj.get('price', ''))),
        ('brand', _as_text(obj.get('brand', ''))),
        ('categories', _flatten_categories(obj.get('categories', ''))),
        ('description', _as_text(obj.get('description', ''))),
    )

    segments: List[str] = [f"{name} is {value}" for name, value in attributes if value]

    if not segments:
        return ''
    return f"The {dataset_full_name} item has the following attributes: \n " + '; '.join(segments)

In [4]:
dataset_name = 'beauty'
dataset_full_name = amazon_dataset2fullname[dataset_name]

meta_path = os.path.join(dataset_name, f'meta_{dataset_full_name}.json')

asin_to_text = {}
asin_to_image = {}

# 1) Parse the metadata file line by line and build a DataFrame.
string_cols = ['asin', 'title', 'price', 'brand', 'categories', 'description', 'imUrl']
rows: List[Dict[str, object]] = []
# Explicit encoding so decoding does not depend on the platform locale.
with open(meta_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            # Fall back to Python-literal parsing for lines that are not strict JSON.
            try:
                obj = ast.literal_eval(line)
            except (ValueError, SyntaxError, TypeError):
                continue

        # A line may parse to a non-mapping (bare list, number, ...); skip it
        # instead of failing on `.get` below.
        if not isinstance(obj, dict):
            continue

        asin = obj.get('asin') or obj.get('ASIN')
        if not asin:
            continue

        record: Dict[str, object] = {'asin': asin}
        for field in string_cols[1:]:
            value = obj.get(field, '')
            # null / NaN would otherwise be stringified to 'None' / 'nan' by
            # the astype(str) step and end up in the generated text.
            if value is None or (isinstance(value, float) and value != value):
                value = ''
            record[field] = value
        rows.append(record)

# Passing `columns` keeps the expected columns present even when no row parsed.
df = pd.DataFrame(rows, columns=string_cols)

for col in string_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()

# 2) Text preprocessing (clean_metadata is project-defined; its exact
#    transformations are not visible in this notebook).
df = clean_metadata(df)

# 3) Format each row and fill the asin -> image URL / asin -> text lookups.
text_fields = ('title', 'price', 'brand', 'categories', 'description')
# Plain dict records avoid building a Series per row as iterrows() does.
for row in df.to_dict('records'):
    asin = row['asin']
    image_url = row.get('imUrl', '')
    if isinstance(image_url, str) and image_url:
        asin_to_image[asin] = image_url

    obj_clean = {field: row.get(field, '') for field in text_fields}
    text_formatted = _build_text(obj_clean, dataset_full_name)
    if text_formatted:
        asin_to_text[asin] = text_formatted


If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  s = BeautifulSoup(s, 'html.parser').get_text(separator=' ')


In [5]:
# Display the DataFrame returned by clean_metadata.
df

Unnamed: 0,asin,title,price,brand,categories,description,imUrl
0,0205616461,Bio-Active Anti-Aging Serum (Firming Ultra-Hyd...,,,"[[Beauty, Skin Care, Face, Creams & Moisturize...","As we age, our once youthful, healthy skin suc...",http://ecx.images-amazon.com/images/I/41DecrGO...
1,0558925278,Eco Friendly Ecotools Quality Natural Bamboo C...,,,"[[Beauty, Tools & Accessories, Makeup Brushes ...",Mineral Powder Brush--Apply powder or mineral ...,http://ecx.images-amazon.com/images/I/51L%2BzY...
2,0733001998,Mastiha Body Lotion,,,"[[Beauty, Skin Care, Body, Moisturizers, Lotio...","From the Greek island of Chios, this Mastiha b...",http://ecx.images-amazon.com/images/I/311WK5y1...
3,0737104473,Hello Kitty Lustre Lipstick (See sellers comme...,,,"[[Beauty, Makeup, Lips, Lipstick]]",Limited edition Hello Kitty Lipstick featuring...,http://ecx.images-amazon.com/images/I/31u6Hrzk...
4,0762451459,Stephanie Johnson Mermaid Round Snap Mirror,19.98,,"[[Beauty, Tools & Accessories, Mirrors, Makeup...","The mermaid is an elusive (okay, mythical) cre...",http://ecx.images-amazon.com/images/I/41y2%2BF...
...,...,...,...,...,...,...,...
259199,B00LP2YB8E,2t 2t Edge Crystal Rhinestones Bridal Wedding ...,,,"[[Beauty, Hair Care, Styling Tools, Styling Ac...",Color: White Fullness72 inches Center Gathered...,http://ecx.images-amazon.com/images/I/41E630m-...
259200,B00LOS7MEE,French Manicure Gel Nail Polish Set - Set of 4...,,,"[[Beauty, Makeup, Nails, Nail Polish]]","The secret to long lasting colors, healthy nai...",http://ecx.images-amazon.com/images/I/41skHL1O...
259201,B00LPVG6V0,ResQ Organics Face & Body Wash - Aloe Vera Man...,,,"[[Beauty, Skin Care, Face, Creams & Moisturize...",ResQ Organics Face & Body Wash - With Aloe Ver...,http://ecx.images-amazon.com/images/I/31C1w4Ku...
259202,B00LTDUHJQ,2 Tier Tulle Elbow Wedding Veil with Ribbon Ed...,,,"[[Beauty, Hair Care, Styling Tools, Styling Ac...",Color: White 2 Tier Fullness 72 inches Sewn on...,http://ecx.images-amazon.com/images/I/51%2B%2B...


In [6]:
# Shape of the array of distinct values in the 'asin' column.
df['asin'].unique().shape

(259204,)

In [15]:
import pickle
import numpy as np

# Pickle files read by this cell:
#   gme_qwen2vl2b_text_fp16.pkl
#   gme_qwen2vl2b_image_fp16.pkl
# NOTE: pickle.load can execute arbitrary code; only load files you trust.

loaded = []
for pkl_path in (
    'beauty/gme_qwen2vl2b_text_fp16.pkl',
    'beauty/gme_qwen2vl2b_image_fp16.pkl',
):
    with open(pkl_path, 'rb') as f:
        loaded.append(pickle.load(f))

text_arr, image_arr = loaded



In [None]:
# L2 norm of each array, taken along the last axis.
l2_norm_text, l2_norm_image = (
    np.linalg.norm(arr, axis=-1) for arr in (text_arr, image_arr)
)

print(l2_norm_text)
print(l2_norm_image)

# Optional L2 normalization (currently disabled):
# text_arr = text_arr / np.linalg.norm(text_arr, axis=-1, keepdims=True)
# image_arr = image_arr / np.linalg.norm(image_arr, axis=-1, keepdims=True)

[1. 1. 1. ... 1. 1. 1.]
[1.     0.9995 1.     ... 1.     1.     1.    ]


  image_arr = image_arr / np.linalg.norm(image_arr, axis=-1, keepdims=True)


In [13]:
# Inspect the array loaded from the text pickle file.
text_arr

array([[-0.01717  ,  0.002562 , -0.00605  , ..., -0.03607  ,  0.00167  ,
         0.01541  ],
       [ 0.00443  ,  0.01315  , -0.005157 , ..., -0.0508   , -0.003157 ,
         0.010994 ],
       [-0.02972  ,  0.01888  , -0.07007  , ..., -0.01898  ,  0.01587  ,
         0.01412  ],
       ...,
       [-0.0161   ,  0.045    ,  0.00545  , ...,  0.00767  ,  0.01776  ,
         0.01776  ],
       [-0.0042   ,  0.06335  ,  0.005356 , ...,  0.00692  , -0.0002335,
        -0.02011  ],
       [-0.02356  ,  0.05283  ,  0.004704 , ...,  0.01718  ,  0.00644  ,
        -0.00956  ]], dtype=float16)