In [None]:
%%writefile review.yaml
# Sample review-API request URLs, one per marketplace (Tiki first, then Shopee).
api:
    # Tiki reviews endpoint: limit=20, selected by product_id.
    - https://tiki.vn/api/v2/reviews?limit=20&include=comments,contribute_info,attribute_vote_summary&product_id=260811707
    # Shopee ratings endpoint: an item is addressed by itemid together with shopid.
    - https://shopee.vn/api/v2/item/get_ratings?itemid=17489310712&shopid=35445890

In [None]:
%%writefile claude.yaml

# Product catalogue grouped by marketplace, followed by the review-API URL templates.
# NOTE(review): main() in the scraper cell opens 'urls.yaml' and iterates `urls` as a
# flat list of product-page URL strings, so this nested layout does not appear to be
# what that code consumes — confirm which loader reads this file.
urls:
  # Shopee products: each entry carries an itemid/shopid pair plus a display name.
  shopee:
    - itemid: 27603632331
      shopid: 945535886
      name: "Kit Bàn Phím Cơ Layout Alice Cidoo ABM066"
    - itemid: 29560903606
      shopid: 88201679
      name: "iPhone 16 Pro Max 256GB"
    - itemid: 27312277956
      shopid: 188032506
      name: "Máy in nhiệt mini N43BT ECOKING"
    - itemid: 5873954476
      shopid: 88201679
      name: "MacBook Air M1 2020"
    - itemid: 27253416536
      shopid: 718577709
      name: "Máy tính đồng bộ Acer Extensa"
    - itemid: 24509035804
      shopid: 743787121
      name: "Bộ PC Gaming Robot CGO"

  # Tiki products: each entry carries a product_id plus a display name.
  # NOTE(review): the first three product_id values below (197665886, 271973464,
  # 271362157) equal the `spid` query values in the scraper's logged Tiki URLs, whose
  # `p<digits>.html` IDs are different numbers — verify whether these should be
  # product IDs or spids before feeding them to the reviews API.
  tiki:
    - product_id: 197665886
      name: "Apple AirPods 3 2022"
    - product_id: 271973464
      name: "iPhone 15 Pro Max"
    - product_id: 271362157
      name: "Kindle Paperwhite 5 11th gen"
    - product_id: 273697142
      name: "Đồng hồ thông minh Android H10 2023"
    - product_id: 248280563
      name: "Dell Inspiron 15 3511"
    - product_id: 57422397
      name: "Android TV Sony 4K 55 inch KD-55X9500H"
    - product_id: 274057335
      name: "LG StanbyME Go 27 inch"
    - product_id: 275078723
      name: "HP Elite Mini 600 G9"
    - product_id: 93181715
      name: "PC XGame8184 i3-8100"
    - product_id: 273259161
      name: "Samsung Galaxy A05s"
    - product_id: 276425742
      name: "Máy lạnh Nagakawa Inverter NIS-C09R2T29"
    - product_id: 275127045
      name: "Máy giặt LG AI DD FV1410S4W1"
    - product_id: 274038337
      name: "POCO C65"
    - product_id: 275717177
      name: "Realme C61"
    - product_id: 197216310
      # NOTE(review): this name looks like a placeholder — confirm the real product title.
      name: "Product from API URL"

# Review-API URL templates, one per marketplace. The {product_id}, {itemid} and
# {shopid} markers are presumably substituted per product by the consumer — confirm.
api_endpoints:
  tiki: "https://tiki.vn/api/v2/reviews?limit=20&include=comments,contribute_info,attribute_vote_summary&product_id={product_id}"
  shopee: "https://shopee.vn/api/v2/item/get_ratings?itemid={itemid}&shopid={shopid}"

In [3]:
import yaml
import re
import requests
import json
import os
from time import sleep
from datetime import datetime

# Browser fingerprint shared by both marketplaces so requests look like Chrome.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

# Base header set; sent as-is with Tiki requests.
HEADERS = {
    'User-Agent': _USER_AGENT,
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'en-US,en;q=0.9',
    'Referer': 'https://tiki.vn/',
    'Connection': 'keep-alive',
}

# Shopee variant: the base set with the Referer swapped and Shopee's x-* headers
# appended. Overriding 'Referer' keeps its original position in the dict.
SHOPEE_HEADERS = {
    **HEADERS,
    'Referer': 'https://shopee.vn/',
    'x-api-source': 'pc',
    'x-requested-with': 'XMLHttpRequest',
    'x-shopee-language': 'vi',
}

def extract_ids(url):
    """Extract the marketplace identifiers embedded in a product-page URL.

    Recognised URL shapes:
      * Tiki:   ``...-p<product_id>.html?spid=<spid>`` (both parts required)
      * Shopee: ``...-i.<shopid>.<itemid>``, optionally followed by a query
        string, a fragment or a trailing slash

    Args:
        url: Product page URL. Surrounding whitespace is ignored.

    Returns:
        dict | None: For Tiki, a dict with 'platform', 'spid', 'product_id'
        and 'itemID'; for Shopee, a dict with 'platform', 'shopid', 'itemid'
        and 'itemID'. All IDs are strings, and 'itemID' is the platform name
        joined to the product/item ID with an underscore. Returns None when
        the input is not a string, belongs to neither site, or does not
        contain the expected IDs.
    """
    # Config entries may be malformed (None, nested mappings, ...); treat
    # anything that is not a string as "no IDs" rather than raising.
    if not isinstance(url, str):
        return None
    url = url.strip()

    if 'tiki.vn' in url:
        # Tiki needs both the seller-product id (spid) and the product id.
        spid_match = re.search(r'spid=(\d+)', url)
        product_match = re.search(r'p(\d+)\.html', url)

        if spid_match and product_match:
            product_id = product_match.group(1)
            return {
                'platform': 'tiki',
                'spid': spid_match.group(1),
                'product_id': product_id,
                'itemID': f"tiki_{product_id}",
            }

    elif 'shopee.vn' in url:
        # Match "i.<shopid>.<itemid>" as one unit. The lookahead accepts the
        # end of the string or the start of a path/query/fragment, so shared
        # links such as "...-i.1.2?sp_atk=..." still resolve.
        ids_match = re.search(r'i\.(\d+)\.(\d+)(?=[/?#]|$)', url)
        if ids_match:
            shopid, itemid = ids_match.groups()
            return {
                'platform': 'shopee',
                'shopid': shopid,
                'itemid': itemid,
                'itemID': f"shopee_{itemid}",
            }
    return None

def get_reviews(platform, ids, timeout=30):
    """Fetch reviews for one product and tag every entry with its itemID.

    Args:
        platform: 'tiki' or 'shopee'. Any other value yields None.
        ids: Mapping with the identifiers for the product. Tiki reads 'spid',
            'product_id' and 'itemID'; Shopee reads 'itemid', 'shopid' and
            'itemID'.
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The decoded JSON payload, with ids['itemID'] written into each review
        (Tiki: every item of payload['data']; Shopee: every item of
        payload['data']['ratings']). Returns None when the platform is
        unknown, the request fails, the status code is not 200, or the body
        is not valid JSON.
    """
    if platform == 'tiki':
        url = (
            f"https://tiki.vn/api/v2/reviews"
            f"?limit=20"
            f"&include=comments,contribute_info,attribute_vote_summary"
            f"&sort=score|desc,id|desc,stars|all"
            f"&page=1"
            f"&spid={ids['spid']}"
            f"&product_id={ids['product_id']}"
        )
        headers = HEADERS
    elif platform == 'shopee':
        url = f"https://shopee.vn/api/v2/item/get_ratings?itemid={ids['itemid']}&shopid={ids['shopid']}"
        headers = SHOPEE_HEADERS
    else:
        return None

    # A network failure on one product should not abort the whole crawl; the
    # caller already treats None as "failed to fetch".
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request error for {ids['itemID']}: {exc}")
        return None

    if response.status_code != 200:
        return None

    # Blocked requests can come back as HTML; response.json() signals that
    # with a ValueError subclass.
    try:
        data = response.json()
    except ValueError:
        return None

    if not isinstance(data, dict):
        return data

    # 'data' (and Shopee's nested 'ratings') may be null in error responses,
    # so never assume a container is present.
    payload = data.get('data')
    if platform == 'tiki':
        entries = payload if isinstance(payload, list) else []
    else:
        entries = payload.get('ratings') if isinstance(payload, dict) else []

    for entry in entries or []:
        if isinstance(entry, dict):
            entry['itemID'] = ids['itemID']
    return data

def save_reviews(platform, ids, data, base_dir="D:\\FIA1471\\data\\review"):
    """Write one product's review payload to ``<base_dir>/<platform>/<itemID>.json``.

    Args:
        platform: Sub-directory name under base_dir (e.g. 'tiki', 'shopee').
        ids: Mapping that contains 'itemID'; it becomes the file name stem.
        data: JSON-serialisable payload to store.
        base_dir: Root output directory. The default is the absolute Windows
            path this notebook has always used, kept for backward
            compatibility; pass another directory to write elsewhere.

    Returns:
        str: Path of the file that was written.
    """
    platform_dir = os.path.join(base_dir, platform)
    # Create the platform folder (and any missing parents) on first use.
    os.makedirs(platform_dir, exist_ok=True)

    # itemID already carries the platform prefix, so names stay unique.
    filepath = os.path.join(platform_dir, f"{ids['itemID']}.json")

    # ensure_ascii=False keeps Vietnamese review text readable on disk.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    return filepath

def main(config_path='urls.yaml', delay=2):
    """Crawl reviews for every product URL listed in a YAML config file.

    The file is expected to hold a top-level ``urls`` list of product-page
    URLs. For each URL the IDs are extracted, the reviews are fetched and the
    payload is saved; progress is printed along the way.

    Args:
        config_path: Path of the YAML file to read.
        delay: Seconds to pause after each fetch attempt to avoid rate limiting.
    """
    with open(config_path, 'r', encoding='utf-8') as file:
        # safe_load returns None for an empty file; fall back to an empty config.
        data = yaml.safe_load(file) or {}

    urls = data.get('urls') if isinstance(data, dict) else None

    for url in urls or []:
        print(f"\nProcessing URL: {url}")

        ids = extract_ids(url)
        if not ids:
            # No request was made, so there is nothing to throttle.
            print("Could not extract IDs from URL")
            continue

        print(f"Extracted IDs: {ids}")

        reviews = get_reviews(ids['platform'], ids)
        if reviews:
            filepath = save_reviews(ids['platform'], ids, reviews)
            print(f"Saved reviews to: {filepath}")
        else:
            print("Failed to fetch reviews")

        # Pause after every request, including failed ones: a failure is often
        # the first sign of rate limiting, so skipping the pause makes it worse.
        sleep(delay)

if __name__ == "__main__":
    main()


Processing URL: https://tiki.vn/apple-airpods-3-2022-sac-lightning-mpny3-p197665885.html?spid=197665886
Extracted IDs: {'platform': 'tiki', 'spid': '197665886', 'product_id': '197665885', 'itemID': 'tiki_197665885'}
Saved reviews to: D:\FIA1471\data\review\tiki\tiki_197665885.json

Processing URL: https://tiki.vn/apple-iphone-15-pro-max-p271973414.html?spid=271973464
Extracted IDs: {'platform': 'tiki', 'spid': '271973464', 'product_id': '271973414', 'itemID': 'tiki_271973414'}
Saved reviews to: D:\FIA1471\data\review\tiki\tiki_271973414.json

Processing URL: https://tiki.vn/combo-may-doc-sach-all-new-kindle-paperwhite-5-11th-gen-va-bao-da-for-kids-ban-kids-khong-quang-cao-hang-nhap-khau-p271362153.html?spid=271362157
Extracted IDs: {'platform': 'tiki', 'spid': '271362157', 'product_id': '271362153', 'itemID': 'tiki_271362153'}
Saved reviews to: D:\FIA1471\data\review\tiki\tiki_271362153.json

Processing URL: https://tiki.vn/dong-ho-thong-minh-android-lap-sim-4g-nghe-goi-doc-lap-dinh-v

In [8]:
import glob
import os

# List every saved review file. The pattern is built with os.path.join rather
# than a hand-written mix of '\\' and '/', so it also works outside Windows;
# on Windows the resulting paths are unchanged.
glob.glob(os.path.join('data', 'review', '**', '*.json'), recursive=True)

['data\\review\\shopee\\shopee_24509035804.json',
 'data\\review\\shopee\\shopee_27253416536.json',
 'data\\review\\shopee\\shopee_27312277956.json',
 'data\\review\\shopee\\shopee_27603632331.json',
 'data\\review\\shopee\\shopee_29560903606.json',
 'data\\review\\shopee\\shopee_5873954476.json',
 'data\\review\\tiki\\tiki_197665885.json',
 'data\\review\\tiki\\tiki_248280562.json',
 'data\\review\\tiki\\tiki_26140236.json',
 'data\\review\\tiki\\tiki_271362153.json',
 'data\\review\\tiki\\tiki_271973414.json',
 'data\\review\\tiki\\tiki_273258825.json',
 'data\\review\\tiki\\tiki_273880674.json',
 'data\\review\\tiki\\tiki_274037360.json',
 'data\\review\\tiki\\tiki_274057334.json',
 'data\\review\\tiki\\tiki_275078722.json',
 'data\\review\\tiki\\tiki_275127044.json',
 'data\\review\\tiki\\tiki_275716541.json',
 'data\\review\\tiki\\tiki_276425741.json',
 'data\\review\\tiki\\tiki_57422387.json']

In [9]:
import glob
import json
import os

def process_json_files(pattern):
    """Stamp each matching JSON file with an ``id`` taken from its file name.

    Files are expected to be named ``<platform>_<id>.json`` and to hold a JSON
    object at the top level. Each such file is rewritten in place with an
    extra ``"id"`` key holding the ``<id>`` part as a string. Files that do
    not fit (no underscore in the name, or a non-object payload) are reported
    and left untouched instead of aborting the whole run.

    Args:
        pattern: Glob pattern of the files to process; ``**`` is expanded
            recursively.
    """
    # Collect the JSON files that match the pattern.
    for file_path in glob.glob(pattern, recursive=True):
        filename = os.path.basename(file_path)
        stem, _ext = os.path.splitext(filename)

        # Everything after the first underscore is the ID ("tiki_123" -> "123").
        _platform, separator, file_id = stem.partition('_')
        if not separator or not file_id:
            print(f"Skipped {filename} - no ID found in file name")
            continue

        # Read the current content of the file.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # A list or scalar payload has no place to attach the key.
        if not isinstance(data, dict):
            print(f"Skipped {filename} - top-level JSON is not an object")
            continue

        # Add the ID field to the payload.
        data['id'] = file_id

        # Write the file back with the updated content.
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"Processed {filename} - Added ID: {file_id}")

# Run the ID-stamping pass over every saved review file. The pattern is built
# with os.path.join instead of mixing '\\' and '/', so it also works outside
# Windows; on Windows it matches the same files as before.
pattern = os.path.join('data', 'review', '**', '*.json')
process_json_files(pattern)

Processed shopee_24509035804.json - Added ID: 24509035804
Processed shopee_27253416536.json - Added ID: 27253416536
Processed shopee_27312277956.json - Added ID: 27312277956
Processed shopee_27603632331.json - Added ID: 27603632331
Processed shopee_29560903606.json - Added ID: 29560903606
Processed shopee_5873954476.json - Added ID: 5873954476
Processed tiki_197665885.json - Added ID: 197665885
Processed tiki_248280562.json - Added ID: 248280562
Processed tiki_26140236.json - Added ID: 26140236
Processed tiki_271362153.json - Added ID: 271362153
Processed tiki_271973414.json - Added ID: 271973414
Processed tiki_273258825.json - Added ID: 273258825
Processed tiki_273880674.json - Added ID: 273880674
Processed tiki_274037360.json - Added ID: 274037360
Processed tiki_274057334.json - Added ID: 274057334
Processed tiki_275078722.json - Added ID: 275078722
Processed tiki_275127044.json - Added ID: 275127044
Processed tiki_275716541.json - Added ID: 275716541
Processed tiki_276425741.json - 

In [15]:
from pymongo import ReplaceOne
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import json
from pathlib import Path

# NOTE(review): wildcard import kept so any other names config provides stay
# available to later cells; this cell itself only uses URI.
from config import *

import glob

# Collect every saved review file. Building the pattern from path parts avoids
# the hand-mixed '\\' and '/' separators, so it also works outside Windows.
data = glob.glob(str(Path('data', 'review', '**', '*.json')), recursive=True)

uri = URI
client = MongoClient(uri, server_api=ServerApi('1'))
db = client['datashop']
review = db['review']

# Load each JSON file into one document and record which file it came from.
documents = []
for file_path in data:
    with open(file_path, 'r', encoding='utf-8') as f:
        document = json.load(f)
    document['file_name'] = Path(file_path).name
    documents.append(document)

# Upsert keyed on file_name instead of insert_many, so re-running this cell
# refreshes existing documents rather than inserting duplicates.
inserted = 0
refreshed = 0
if documents:  # bulk_write rejects an empty operation list
    result = review.bulk_write([
        ReplaceOne({'file_name': doc['file_name']}, doc, upsert=True)
        for doc in documents
    ])
    inserted = result.upserted_count
    refreshed = result.matched_count

print(f'Inserted {inserted} documents into the review collection.')
if refreshed:
    print(f'Refreshed {refreshed} existing documents in the review collection.')

Inserted 20 documents into the review collection.


In [14]:
# Read all saved review files and prepare one document per file.
# The pattern is built from path parts so it also works outside Windows.
data = glob.glob(str(Path('data', 'review', '**', '*.json')), recursive=True)

# Path.read_text opens and closes each file itself; the previous bare open()
# inside the comprehension left every file handle unclosed.
documents = [
    {**json.loads(Path(file_path).read_text(encoding='utf-8')), 'file_name': Path(file_path).name}
    for file_path in data
]

# Preview the top-level keys of the first document (nothing is shown when no files matched).
documents[0].keys() if documents else None

dict_keys(['error', 'data', 'id', 'file_name'])

In [17]:
# Sanity check: count the documents in the review collection (the empty filter matches all of them).
review.count_documents({})

20