Import Libraries

In [49]:
import os 
import time
import json
import math
from datetime import datetime
from google_play_scraper import reviews, Sort
import pandas as pd
from tqdm.notebook import tqdm
from langdetect import detect, LangDetectException

Configuration

In [50]:
# --- What to scrape ---------------------------------------------------------
APP_ID = "com.gojek.app"        # Google Play package identifier of the target app
lang_filter = "id"              # language code a review must be detected as; falsy disables the filter
NUM_REVIEWS = 100_000           # stop once this many filtered reviews have been saved

# --- How to scrape ----------------------------------------------------------
BATCH_SIZE = 200                # reviews requested per call
max_retries = 5                 # attempts per batch before giving up
sleep_between_batches = 1.0  # seconds

# --- Where to store the result ---------------------------------------------
OUTPUT_DIR = "../data/raw"
BATCH_PREFIX = "gojek_reviews"  # batch files are named <prefix>_<index>.csv



In [51]:
# Create the batch output directory up front (no error if it already exists),
# because the batch CSV files are written into OUTPUT_DIR later on.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def language(text):
    """Tell whether ``text`` is detected as the configured ``lang_filter`` language.

    Args:
        text: Candidate review text.

    Returns:
        bool: ``True`` only when ``text`` is a non-empty string and ``detect``
        reports a language equal to ``lang_filter``. Empty values, non-string
        values, and texts for which detection raises ``LangDetectException``
        all yield ``False``.
    """
    # Reject empty / falsy values first, then anything that is not a string.
    if not text:
        return False
    if not isinstance(text, str):
        return False

    try:
        detected = detect(text)
    except LangDetectException:
        # Detection failed for this text; treat it as "not the target language".
        return False
    return detected == lang_filter

In [52]:
def fetch_batch(continuation_token=None, count=BATCH_SIZE):
    """Fetch one page of the newest reviews for ``APP_ID``, retrying on failure.

    Up to ``max_retries`` attempts are made. After a failed attempt that is not
    the last one, the function waits ``2 ** attempt`` seconds before trying
    again. No wait (and no "Retrying" message) follows the final failure.

    Args:
        continuation_token: Token returned by a previous call, or ``None`` to
            start from the first page.
        count: Number of reviews to request.

    Returns:
        tuple: ``(result, next_token)`` exactly as returned by ``reviews``.

    Raises:
        RuntimeError: If every attempt failed. The last underlying exception is
            attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            result, next_token = reviews(
                APP_ID,
                lang="id",
                country="id",
                sort=Sort.NEWEST,
                count=count,
                continuation_token=continuation_token
            )
            return result, next_token
        except Exception as e:
            last_error = e
            if attempt >= max_retries:
                # Out of attempts: do not announce a retry or sleep needlessly.
                print(f"Error fetching reviews: {e}. No retries left.")
                break
            wait = 2 ** attempt  # exponential back-off: 2, 4, 8, ... seconds
            print(f"Error fetching reviews: {e}. Retrying ({attempt + 1}/{max_retries})...")
            time.sleep(wait)
    raise RuntimeError("Max retries exceeded while fetching reviews.") from last_error

In [53]:
def review(r):
    """Flatten one raw review mapping into a CSV-friendly record.

    Args:
        r: Mapping with the keys ``reviewId``, ``userName``, ``content``,
            ``score``, ``thumbsUpCount``, ``at``, ``replyContent``,
            ``repliedAt`` and ``reviewCreatedVersion``. ``at`` (and
            ``repliedAt`` when truthy) must provide ``isoformat()``.

    Returns:
        dict: The selected fields, with timestamps converted to ISO-8601
        strings (``repliedAt`` is ``None`` when there is no reply time) and
        ``reviewCreatedVersion`` exposed under the key ``version``.
    """
    # Fields copied through unchanged, in output column order.
    record = {
        key: r[key]
        for key in ("reviewId", "userName", "content", "score", "thumbsUpCount")
    }
    record["at"] = r["at"].isoformat()
    record["replyContent"] = r["replyContent"]

    replied_at = r["repliedAt"]
    if replied_at:
        record["repliedAt"] = replied_at.isoformat()
    else:
        record["repliedAt"] = None

    record["version"] = r["reviewCreatedVersion"]
    return record

In [54]:
def save_batch(df, batch_idx):
    """Write one batch of reviews to a numbered CSV file inside ``OUTPUT_DIR``.

    The file name is ``<BATCH_PREFIX>_<batch_idx>.csv`` with the index
    zero-padded to four digits. The DataFrame index is not written.

    Args:
        df: DataFrame holding the reviews of this batch.
        batch_idx: Integer sequence number of the batch.
    """
    basename = f"{BATCH_PREFIX}_{batch_idx:04d}.csv"
    fname = os.path.join(OUTPUT_DIR, basename)
    df.to_csv(fname, index=False)
    print(f"Saved {len(df)} reviews to {fname}")

In [55]:
# Look for batch files left by an earlier run so that batch numbering and the
# running review total continue from them instead of starting at zero.
exist = sorted(
    name
    for name in os.listdir(OUTPUT_DIR)
    if name.startswith(BATCH_PREFIX) and name.endswith(".csv")
)
if not exist:
    count_scraped = 0
    start_batch_idx = 0
    print("No existing batches found, starting fresh.")
else:
    # File names sort by their zero-padded index, so the last one is the newest.
    last_batch = exist[-1]
    batch_idx = int(last_batch.replace(BATCH_PREFIX + "_", "").replace(".csv", ""))
    # Count the rows already on disk across all batch files.
    count_scraped = sum(
        len(pd.read_csv(os.path.join(OUTPUT_DIR, name))) for name in exist
    )
    print(f"Found {len(exist)} existing batches, last batch index {batch_idx}, total reviews scraped {count_scraped}")
    start_batch_idx = batch_idx + 1

No existing batches found, starting fresh.


In [56]:
# Page through the newest reviews until NUM_REVIEWS language-matching reviews
# have been saved, the source runs out of reviews, or fetching fails.
#
# NOTE(review): pagination always starts from the first page (token is None),
# even when earlier batch files were found, so a resumed run may download
# reviews that are already on disk — confirm whether that is acceptable.
continuation_token = None
batch_idx = start_batch_idx
pbar = tqdm(total=NUM_REVIEWS, initial=count_scraped, desc="Scraping Reviews")
try:
    while count_scraped < NUM_REVIEWS:
        try:
            items, continuation_token = fetch_batch(continuation_token, BATCH_SIZE)
        except RuntimeError as e:
            print(e)
            break

        # An empty page means there is nothing left to fetch. Keep this separate
        # from "the language filter rejected everything on this page".
        if not items:
            print("No more reviews returned, reached the end of reviews.")
            break

        normalized = []
        for it in items:
            rec = review(it)
            if lang_filter:
                content = rec["content"] or ""
                if content.strip() == "":
                    continue
                if language(content):
                    normalized.append(rec)
            else:
                normalized.append(rec)

        if normalized:
            df_batch = pd.DataFrame(normalized)
            save_batch(df_batch, batch_idx)
            count_scraped += len(df_batch)
            pbar.update(len(df_batch))
            batch_idx += 1
        else:
            # A page with no matching review must not abort the whole scrape;
            # later pages may still contain reviews in the desired language.
            print("No reviews in the desired language found in this batch, skipping it.")

        if not continuation_token:
            print("No more continuation token, reached the end of reviews.")
            break

        # Pause only when another request is actually going to be made.
        time.sleep(sleep_between_batches)
finally:
    # Close the progress bar even if the loop is interrupted or raises.
    pbar.close()
print(f"Scraping completed. Total reviews scraped: {count_scraped}")

Scraping Reviews:   0%|          | 0/100000 [00:00<?, ?it/s]

Saved 132 reviews to ../data/raw\gojek_reviews_0000.csv
Saved 139 reviews to ../data/raw\gojek_reviews_0001.csv
Saved 147 reviews to ../data/raw\gojek_reviews_0002.csv
Saved 137 reviews to ../data/raw\gojek_reviews_0003.csv
Saved 136 reviews to ../data/raw\gojek_reviews_0004.csv
Saved 155 reviews to ../data/raw\gojek_reviews_0005.csv
Saved 136 reviews to ../data/raw\gojek_reviews_0006.csv
Saved 141 reviews to ../data/raw\gojek_reviews_0007.csv
Saved 142 reviews to ../data/raw\gojek_reviews_0008.csv
Saved 125 reviews to ../data/raw\gojek_reviews_0009.csv
Saved 129 reviews to ../data/raw\gojek_reviews_0010.csv
Saved 132 reviews to ../data/raw\gojek_reviews_0011.csv
Saved 121 reviews to ../data/raw\gojek_reviews_0012.csv
Saved 141 reviews to ../data/raw\gojek_reviews_0013.csv
Saved 150 reviews to ../data/raw\gojek_reviews_0014.csv
Saved 141 reviews to ../data/raw\gojek_reviews_0015.csv
Saved 122 reviews to ../data/raw\gojek_reviews_0016.csv
Saved 149 reviews to ../data/raw\gojek_reviews_0

In [59]:
import os
import pandas as pd

# Settings used by the merge step below.
# NOTE(review): this rebinds BATCH_PREFIX to a value ending in "_", while the
# scraping configuration sets it to "gojek_reviews". Re-running the scraping
# cells after this cell would therefore produce differently named batch files
# — confirm that the rebinding is intended.
OUTPUT_DIR = "../data/raw"
BATCH_PREFIX = "gojek_reviews_"

def batch_merge(output_fname="../data/gojek_reviews_full.csv", input_dir=None, prefix=None):
    """Merge all batch CSV files into one de-duplicated CSV file.

    Every file in ``input_dir`` whose name starts with ``prefix`` and ends with
    ``.csv`` is read with all columns as strings. Files that cannot be read are
    reported and skipped. The combined rows are de-duplicated on ``reviewId``
    (first occurrence kept), rows with missing or whitespace-only ``content``
    are dropped, and ``score`` (when present) is converted to ``int``. The
    result is written to ``output_fname`` encoded as UTF-8 with BOM.

    Args:
        output_fname: Destination CSV path. Its parent directory is created
            when the path has one; a bare file name is written to the current
            working directory.
        input_dir: Directory containing the batch files. Defaults to the
            notebook-level ``OUTPUT_DIR``.
        prefix: File-name prefix of the batch files. Defaults to the
            notebook-level ``BATCH_PREFIX``.

    Returns:
        pandas.DataFrame | None: The merged frame, or ``None`` when no batch
        file was found or none could be read.
    """
    # Resolve the defaults at call time so the current notebook settings apply.
    if input_dir is None:
        input_dir = OUTPUT_DIR
    if prefix is None:
        prefix = BATCH_PREFIX

    all_files = sorted(
        os.path.join(input_dir, f)
        for f in os.listdir(input_dir)
        if f.startswith(prefix) and f.endswith(".csv")
    )

    if not all_files:
        print("No batch files found to merge.")
        return None

    df_list = []
    for f in all_files:
        try:
            df = pd.read_csv(f, dtype=str)
        except Exception as e:
            # Best effort: report the unreadable file and keep merging the rest.
            print(f"Error reading {f}: {e}")
            continue
        df_list.append(df)
        print(f"Loaded {os.path.basename(f)} ({len(df)} rows)")

    if not df_list:
        print("No valid dataframes to merge.")
        return None

    merged = pd.concat(df_list, ignore_index=True).drop_duplicates(subset=["reviewId"])
    has_text = merged["content"].notna() & (merged["content"].str.strip() != "")
    # Copy the filtered rows so the column assignment below works on an
    # independent frame instead of a slice of ``merged``.
    full_df = merged[has_text].copy()

    if "score" in full_df.columns:
        full_df["score"] = full_df["score"].astype(int)

    # Make sure the output folder exists; a bare file name has no folder part
    # and os.makedirs("") would raise.
    output_dir = os.path.dirname(output_fname)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    full_df.to_csv(output_fname, index=False, encoding="utf-8-sig")

    # Report the number of files actually loaded, not merely found.
    print(f"\nMerged {len(df_list)} batches → {output_fname}")
    print(f"Total unique reviews: {len(full_df):,}")
    return full_df


# Run the merge with the default output path and keep the resulting DataFrame.
merged_df = batch_merge()


Loaded gojek_reviews_0000.csv (132 rows)
Loaded gojek_reviews_0001.csv (139 rows)
Loaded gojek_reviews_0002.csv (147 rows)
Loaded gojek_reviews_0003.csv (137 rows)
Loaded gojek_reviews_0004.csv (136 rows)
Loaded gojek_reviews_0005.csv (155 rows)
Loaded gojek_reviews_0006.csv (136 rows)
Loaded gojek_reviews_0007.csv (141 rows)
Loaded gojek_reviews_0008.csv (142 rows)
Loaded gojek_reviews_0009.csv (125 rows)
Loaded gojek_reviews_0010.csv (129 rows)
Loaded gojek_reviews_0011.csv (132 rows)
Loaded gojek_reviews_0012.csv (121 rows)
Loaded gojek_reviews_0013.csv (141 rows)
Loaded gojek_reviews_0014.csv (150 rows)
Loaded gojek_reviews_0015.csv (141 rows)
Loaded gojek_reviews_0016.csv (122 rows)
Loaded gojek_reviews_0017.csv (149 rows)
Loaded gojek_reviews_0018.csv (133 rows)
Loaded gojek_reviews_0019.csv (135 rows)
Loaded gojek_reviews_0020.csv (131 rows)
Loaded gojek_reviews_0021.csv (126 rows)
Loaded gojek_reviews_0022.csv (125 rows)
Loaded gojek_reviews_0023.csv (129 rows)
Loaded gojek_rev