In [3]:
# Scrapes the reviews of every product URL listed in "product_URLs.csv", one URL per iteration, and writes a new CSV file for each URL.

import requests
import pandas as pd
import re
import csv

def extract_ids(url):
    """Extract the shop ID and item ID from a product URL.

    The IDs are expected in the form ``i.<shop_id>.<item_id>``. The form
    followed by the ``?sp_atk`` tracking parameter is tried first; if it is
    absent, the pair is also accepted when it is followed by any other query
    string, a fragment, or the end of the URL.

    Args:
        url: Product page URL as a string.

    Returns:
        A ``(shop_id, item_id)`` tuple of digit strings, or ``(None, None)``
        when no ID pair can be found.
    """
    # Preferred form: ID pair immediately followed by the "?sp_atk" parameter.
    match = re.search(r'i\.(\d+)\.(\d+)\?sp_atk', url)

    if match is None:
        # Fallback for links copied without the tracking parameter: the pair
        # must still be terminated by "?", "#" or the end of the string so
        # that partial numbers are never returned.
        match = re.search(r'i\.(\d+)\.(\d+)(?=[?#]|$)', url)

    if match is None:
        return None, None

    return match.group(1), match.group(2)
    
def read_specific_row(csv_file, row_number):
    """Return the first cell of one row of a CSV file.

    Args:
        csv_file: Path of the CSV file to read.
        row_number: 1-based number of the row to fetch.

    Returns:
        The first cell of the requested row, or ``None`` when the row is
        empty or the file has fewer than ``row_number`` rows.
    """
    # 'utf-8-sig' drops a leading byte-order mark if the file has one (and
    # reads BOM-less UTF-8 unchanged); with plain 'utf-8' the BOM would stay
    # attached to the first cell of the first row.
    with open(csv_file, 'r', newline='', encoding='utf-8-sig') as file:
        for count, row in enumerate(csv.reader(file), 1):     # Start counting from 1
            if count == row_number:
                return row[0] if row else None

    return None

def scrape_product(shop_id, item_id):
    """Fetch the ratings of one product, page by page, from the ratings API.

    Requests pages of ``limit`` ratings, starting at ``offset`` 0 and moving
    the offset forward by one page after each response, until the API
    reports an error or returns no more ratings.

    Args:
        shop_id: Shop ID sent as the ``shopid`` query parameter.
        item_id: Item ID sent as the ``itemid`` query parameter.

    Returns:
        A list of the rating dicts returned by the API, in the order
        received. Each dict's ``'comment'`` has its newlines replaced by
        spaces (a missing or null comment becomes ``''``).

    Raises:
        requests.RequestException: If a request fails or times out.
        ValueError: If a response body is not valid JSON.
    """
    # NOTE(review): this endpoint is on shopee.com.my while the sample product
    # URLs are on shopee.ph - confirm the intended region.
    url = "https://shopee.com.my/api/v2/item/get_ratings"           # API
    limit = 4       # Number of ratings requested per page
    offset = 0      # Index of the first rating of the next page

    reviews = []

    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        session.headers.update({"Cookie": "_gcl_au=xxxx; SPC_IA=-1; SPC_EC=-; SPC_F=xxxx; SPC_U=-; SPC_T_ID=xxxx; SPC_T_IV=xxxx; SPC_SI=xxxx; _ga=xxxx; _gid=xxxx; cto_lwid=xxxx; _fbp=xxxx; _hjid=xxxx; SPC_SIxxxx=xxxx"})

        while True:

            params = {
                "itemid": item_id,
                "shopid": shop_id,
                "offset": offset,
                "limit": limit,
                "filter": "0",
                "flag": "0",
                "sort": "0",
                "append": "0",
                "before_bundle": "",
                "language": "en",
            }

            # A timeout keeps a stalled connection from hanging the scraper forever.
            response = session.get(url, params=params, timeout=30).json()

            error = response.get("error")
            if error:
                print(f"Error: {error}")
                break

            # "data" may be missing or null; treat that the same as "no ratings".
            ratings = (response.get("data") or {}).get("ratings")
            if not ratings:
                break

            print(offset)
            print(ratings)

            for rating in ratings:
                # The comment can be absent or null; flatten it to one line.
                rating['comment'] = (rating.get('comment') or '').replace('\n', ' ')
                reviews.append(rating)

            # Advance exactly one page per request. Doing this once per rating
            # would move the offset by limit * len(ratings) and skip reviews.
            offset += limit

    return reviews


def main():
    """Scrape the reviews of every URL in 'product_URLs.csv' into CSV files.

    Rows are read one at a time, starting at row 1, until a row is missing or
    empty. The reviews of row N are saved to
    'scraped_reviews_<N + 6>.csv' (numbering starts at 7), keeping only the
    shop ID, item ID, user ID, star rating and comment columns.

    A row whose URL contains no shop/item IDs is reported and skipped; its
    file number is still consumed so that file names stay aligned with row
    numbers. A product without reviews produces a CSV with only the header.
    """
    # CSV file holding the list of product URLs
    csv_file = 'product_URLs.csv'
    row_number = 1                                                  # Change this to the desired row number

    base_filename = 'scraped_reviews_'
    base_filenumber = 7

    # Columns to keep in the final CSV file
    columns_to_keep = ['shopid', 'itemid', 'userid', 'rating_star', 'comment']

    current_URL = read_specific_row(csv_file, row_number)

    while current_URL is not None:                                  # While the CSV file still contains a url based on the row number
        print(row_number)
        print(current_URL)

        csv_filename = f'{base_filename}{base_filenumber}.csv'

        # Get the shop_id & item_id of the current URL
        shop_id, item_id = extract_ids(current_URL)

        if shop_id is None or item_id is None:
            # Without both IDs there is nothing to query; move on to the next row.
            print(f"Skipping row {row_number}: could not find shop/item IDs in the URL.")
        else:
            print(shop_id + " " + item_id)

            # Scrape the current URL
            scraped_data = scrape_product(shop_id, item_id)
            print(f"Retrieved {len(scraped_data)} reviews.")

            # reindex() selects the wanted columns and, unlike df[[...]], does
            # not raise when the result is empty or a column is absent.
            df = pd.DataFrame(scraped_data).reindex(columns=columns_to_keep)
            print(df.columns)

            # Save the DataFrame as a CSV file. File numbering starts from 7.
            df.to_csv(csv_filename, index=False)

        # Prepare for the next iteration
        base_filenumber += 1
        row_number += 1
        current_URL = read_specific_row(csv_file, row_number)


# Run the scraper only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()




1
﻿https://shopee.ph/itel-P55-RAM-24GB(8-16GB)-ROM-128GB-90HZ-Refresh-50MP-Dual-Camera-5000mAh-18W-i.655822401.24706665159?sp_atk=b9f4aaad-7d97-458d-9ecf-31d95232a059&xptdk=b9f4aaad-7d97-458d-9ecf-31d95232a059
655822401 24706665159
0
[{'orderid': 160970031209590, 'itemid': 24706665159, 'cmtid': 14155766905, 'ctime': 1707465452, 'rating': 1, 'userid': 124795138, 'shopid': 655822401, 'comment': 'Performance:Not yet tested\nProduct Quality:Supperb Value for money\nBest Feature:Smoothness\n\nitem is protected,mula sa pag open ok nman di ko PA lang nalagnay ng Sim,I think wlang Problem,complete accessories,fast deliver 2 days lang,thank you till next,', 'rating_star': 5, 'status': 2, 'mtime': 1709015327, 'editable': 1, 'opt': 2, 'filter': 7, 'mentioned': [], 'is_hidden': False, 'can_follow_up': None, 'follow_up': None, 'submit_time': 1707465452, 'author_username': 'castlejhay', 'author_portrait': 'af52c3227dd675465d0f36e5815cce31', 'author_shopid': 124793314, 'anonymous': False, 'images': [

KeyboardInterrupt: 