 # Save sample images and JSON

In [None]:
from tqdm import tqdm
import os
import json
import requests

In [3]:
# Load the review dataset from disk. The rest of the notebook iterates it as a
# mapping of parent_asin -> list of review dicts.
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform's locale default.
file_name = "review.json"
with open(file_name, "r", encoding="utf-8") as f:
    review_dict = json.load(f)

In [5]:
# Number of top-level entries in the dataset (one key per parent_asin).
len(review_dict)

4792357

In [36]:
# Download and save image function
def download_and_save_image(url, img_file_path, retries=3):
    """Fetch ``url`` and write the response body to ``img_file_path``.

    Up to ``retries`` attempts are made. An attempt succeeds only when the
    server answers with HTTP status 200 and the file is written without an
    error. Any exception raised during an attempt is printed and the next
    attempt is started.

    Args:
        url: Address of the image to download.
        img_file_path: Destination file path; opened in binary write mode.
        retries: Maximum number of attempts.

    Returns:
        True as soon as one attempt succeeds, False when every attempt failed
        (or when ``retries`` allows no attempt at all).
    """
    for _attempt in range(retries):
        try:
            reply = requests.get(url, timeout=10)  # 10 second timeout per request
            if reply.status_code != 200:
                # Not a usable answer: move on to the next attempt.
                continue
            with open(img_file_path, 'wb') as out_file:
                out_file.write(reply.content)
        except Exception as err:
            print(f"Error downloading image: {err}")
        else:
            # Request and write both completed without raising.
            return True
    return False

# Set up paths for saving images and JSON
def create_directories(parent_asin):
    """Create the image folder for one product and return its output paths.

    Args:
        parent_asin: Product identifier used as the folder name under
            ``sample/``.

    Returns:
        A ``(img_path, json_path)`` tuple: the directory that receives the
        images and the path of the review JSON file for this product.
    """
    json_path = f'sample/{parent_asin}/review.json'
    # Images are kept in their own 'cgi' sub-folder.
    img_path = f'sample/{parent_asin}/cgi'
    # exist_ok=True: calling this again for the same product is not an error.
    os.makedirs(img_path, exist_ok=True)
    return img_path, json_path


# save image and json
#
# - Images are downloaded and stored per parent_asin.
# - Only products whose reviews contain 50 or more images in total are saved.
# - Each image is stored as {parent_asin}_{review index}_{image index}.jpg.
# - The JSON file keeps every review that has an 'images' list; each image
#   entry additionally records its local file path under 'save_path'
#   (the path is recorded even when the download itself failed).
for parent_asin, reviews in review_dict.items():

    if not isinstance(reviews, list) or not all(isinstance(r, dict) for r in reviews):
        print(f"Invalid format for reviews of {parent_asin}")
        continue

    # Count only proper image lists: a null or malformed 'images' value must
    # not crash the count, and such reviews are skipped by the download loop
    # below anyway.
    total_image_count = sum(
        len(r['images']) for r in reviews if isinstance(r.get('images'), list)
    )

    # Guard clause: products with too few images are not sampled.
    if total_image_count < 50:
        continue

    img_path, json_path = create_directories(parent_asin)

    sample_dict = {parent_asin: []}

    print('current parent_asin: ', parent_asin)

    for i, review in enumerate(tqdm(reviews)):
        images = review.get('images')
        if not isinstance(images, list):
            continue

        # Build fresh image dicts so that adding 'save_path' does not mutate
        # the entries of review_dict; a shallow copy of the review alone would
        # still share the nested image dicts with the original.
        saved_images = []
        for j, image in enumerate(images):
            if not isinstance(image, dict) or 'medium_image_url' not in image:
                # Keep the entry untouched instead of aborting a long run.
                print(f"No medium_image_url for image {j} of review {i} ({parent_asin})")
                saved_images.append(image)
                continue

            img_file_path = f'{img_path}/{parent_asin}_{i}_{j}.jpg'
            new_image = dict(image)
            new_image['save_path'] = img_file_path
            saved_images.append(new_image)

            download_successful = download_and_save_image(image['medium_image_url'], img_file_path)
            if not download_successful:
                print(f"Failed to download image {image['medium_image_url']}")

        new_review = dict(review)
        new_review['images'] = saved_images
        sample_dict[parent_asin].append(new_review)

    # Save as JSON file
    with open(json_path, 'w') as json_file:
        json.dump(sample_dict, json_file, indent=4)

    print("JSON file saved:", json_path)


current parent_asin:  B09NSZ5QMF


100%|██████████| 254/254 [00:03<00:00, 69.17it/s] 


JSON file saved: sample/B09NSZ5QMF/review.json
current parent_asin:  B07RGM3DYC


100%|██████████| 299/299 [00:05<00:00, 54.06it/s] 


JSON file saved: sample/B07RGM3DYC/review.json
current parent_asin:  B07BWS4CSM


100%|██████████| 1167/1167 [00:17<00:00, 66.93it/s]


JSON file saved: sample/B07BWS4CSM/review.json
current parent_asin:  B07HJ84J9M


100%|██████████| 680/680 [00:25<00:00, 26.84it/s]


JSON file saved: sample/B07HJ84J9M/review.json
current parent_asin:  B01DDC83C8


100%|██████████| 830/830 [02:43<00:00,  5.07it/s]


JSON file saved: sample/B01DDC83C8/review.json
current parent_asin:  B07V6PKCCG


100%|██████████| 1528/1528 [00:20<00:00, 73.11it/s] 


JSON file saved: sample/B07V6PKCCG/review.json
current parent_asin:  B07TVHSDMQ


 72%|███████▏  | 122253/169965 [1:21:20<31:44, 25.05it/s]  


ConnectionError: HTTPSConnectionPool(host='images-na.ssl-images-amazon.com', port=443): Max retries exceeded with url: /images/I/A11d65YRe+L._SL800_.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f8e048c9b80>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [32]:
# Load the product metadata for the Clothing, Shoes and Jewelry category.
# NOTE(review): load_dataset is not imported in the visible cells; presumably
# `from datasets import load_dataset` — confirm.
meta_dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Clothing_Shoes_and_Jewelry",
    split="full",
    trust_remote_code=True,
)
# Print the first record as a quick look at the available fields.
print(meta_dataset[0])

Loading dataset shards:   0%|          | 0/31 [00:00<?, ?it/s]

{'main_category': 'AMAZON FASHION', 'title': "BALEAF Women's Long Sleeve Zip Beach Coverup UPF 50+ Sun Protection Hooded Cover Up Shirt Dress with Pockets", 'average_rating': 4.2, 'rating_number': 422, 'features': ['90% Polyester, 10% Spandex', 'Zipper closure', 'Machine Wash', 'Long sleeve sun protection coverups--UPF 50+ blocks the sun from burning', 'Zipped v-neckline--fashionable V neck and smooth 1/4 zipper allows to staying place as you like', 'Two drop-in side pockets--hold your phone or keys well，no worries of falling out', 'Hoodie with non-slip drawcord--Enhancing hooded design is convenient to wrap your face and enough space to put your head and hair easily', 'A flattering coverups company you spend all day on the beach，traveling with lovers or busying around house. Recommended For everyday leisure or daily exercise'], 'description': [], 'price': '31.99', 'images': {'hi_res': ['https://m.media-amazon.com/images/I/61KIZjb54AL._AC_UL1500_.jpg', 'https://m.media-amazon.com/image

In [None]:
# Save the metadata record of one product as sample/{parent_asin}/meta.json.
# NOTE: parent_asin is not defined in this cell; it is whatever value an
# earlier cell left behind (e.g. the last product visited by the review loop).
save_path = f'sample/{parent_asin}/meta.json'

# Indexing a Hugging Face Dataset with a string selects a *column*, so
# meta_dataset[parent_asin] cannot look a product up by its id. Find the row
# position through the id column first, then fetch that row as a dict.
# assumes the metadata rows carry a 'parent_asin' field — TODO confirm
row_index = next(
    (idx for idx, asin in enumerate(meta_dataset["parent_asin"]) if asin == parent_asin),
    None,
)
if row_index is None:
    raise KeyError(f"No metadata found for parent_asin {parent_asin!r}")

meta_dict = meta_dataset[row_index]

# The product folder may not exist yet if no reviews were sampled for it.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, "w") as f:
    json.dump(meta_dict, f)