In [1]:
import asyncio
import json
import os
import random
import sys
from datetime import datetime

import pandas as pd
from dotenv import load_dotenv
from playwright.async_api import async_playwright, TimeoutError

In [6]:
import pandas as pd
import os

# Load all_instagram_data.csv into `df`; the relative path is resolved against
# the kernel's current working directory.
# NOTE(review): none of the cells visible below reference `df`
# (refresh_shortcodes reads its own CSV) — confirm it is still needed.
df = pd.read_csv("all_instagram_data.csv")

In [None]:
###############################################################################
# 1)  Helper: pull /info/ for ONE shortcode  (re-uses GLOBAL_SEM + jitter)
###############################################################################
async def _fetch_shortcode(context, code: str, idx: int, total: int, handle=None):
    """
    Open one post in a fresh tab and capture its /api/v1/media/…/info/ response.

    Same job as _fetch_one, but 'code' is already the Instagram shortcode, so
    the post is addressed through its handle-free permalink (/p/<code>/).
    The previous URL interpolated an undefined `handle` name, which made every
    call fail inside the generic ``except`` branch.

    Parameters
    ----------
    context : Playwright browser context used to open the tab.
    code    : Instagram shortcode of the post.
    idx     : caller-supplied position of this job; echoed back unchanged.
    total   : number of jobs in the batch (kept for signature compatibility,
              not used here).
    handle  : optional account handle; when given, the URL becomes
              /<handle>/p/<code>/ instead of the bare permalink.

    Returns
    -------
    idx, code, data|None
        ``data`` is the decoded JSON body, or None on timeout / any error
        (failures are printed, never raised).
    """
    # anti-bot jitter — happens *before* the semaphore is taken, so it staggers
    # task start-up rather than the gap between consecutive page loads
    await asyncio.sleep(random.uniform(*JITTER_RANGE))

    owner = f"{handle}/" if handle else ""
    url = f"https://www.instagram.com/{owner}p/{code}/?img_index=1"

    async with GLOBAL_SEM:                       # <- caps the number of open tabs
        page = await context.new_page()
        try:
            # Register the listener first, then navigate, so the response
            # cannot slip past between goto() and the wait.
            async with page.expect_response(
                lambda r: "/api/v1/media/" in r.url and "/info/" in r.url,
                timeout=5_000,
            ) as ri:
                await page.goto(url)

            resp = await ri.value
            data = await resp.json()
            return idx, code, data

        except TimeoutError:
            print(f"⏱️ Timeout: /p/{code}")
            return idx, code, None

        except Exception as e:
            # best-effort scrape: report and move on to the next shortcode
            print(f"❌ {code}: {e}")
            return idx, code, None

        finally:
            # always release the tab, whatever happened above
            await page.close()


###############################################################################
# 2)  New entry-point: scrape every shortcode in a CSV
###############################################################################
async def refresh_shortcodes(
    csv_path: str,
    output_json: str,
    context: "BrowserContext",
):
    """
    Reads a CSV with a 'shortcode' column and refreshes the metadata for each
    post via the /info/ endpoint.

    Parameters
    ----------
    csv_path    : CSV file containing a 'shortcode' column; missing values are
                  dropped and duplicates are fetched only once.
    output_json : JSON file mapping shortcode -> payload.  If it already
                  exists its entries are loaded first and kept unless a fresh
                  payload for the same shortcode replaces them.
    context     : browser context handed to ``_fetch_shortcode`` for each post.

    Notes
    -----
    Shortcodes whose fetch returned None (timeout / error) are skipped and any
    earlier entry for them is left untouched.  The file is written in a
    ``finally`` block, so results gathered before an interruption or an
    unexpected error are saved instead of being discarded; outstanding fetch
    tasks are cancelled at that point so no tabs keep running in the background.
    """
    df    = pd.read_csv(csv_path)
    codes = df["shortcode"].dropna().unique().tolist()
    total = len(codes)
    print(f"🔄 Refreshing {total} shortcodes …")

    # keep previous results if they exist
    store = {}
    if os.path.exists(output_json):
        with open(output_json, encoding="utf-8") as f:
            store = json.load(f)

    # ── FAN-OUT ──────────────────────────────────────────────────────────
    tasks = [
        asyncio.create_task(_fetch_shortcode(context, c, i, total))
        for i, c in enumerate(codes, start=1)
    ]

    # ── FAN-IN ───────────────────────────────────────────────────────────
    done = 0
    try:
        for fut in asyncio.as_completed(tasks):
            _, code, data = await fut
            done += 1
            print(f"📊 finished {done}/{total}")

            if data is not None:    # None means timeout / error
                store[code] = data  # same shape as before
    finally:
        # Stop whatever is still in flight (no-op for finished tasks) …
        for task in tasks:
            task.cancel()

        # … persist everything collected so far, even on the error path …
        with open(output_json, "w", encoding="utf-8") as f:
            json.dump(store, f, indent=2)

        # … and let cancelled tasks run their own cleanup (page.close()).
        await asyncio.gather(*tasks, return_exceptions=True)

    print(f"✅  Saved {len(store)} refreshed posts → {output_json}")


In [10]:

print("Starting shortcode refresh …")
async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    device  = p.devices["Pixel 5"]
    context = await browser.new_context(**device)
    page    = await context.new_page()

    # (optional) cookie load / login exactly as in your old routine
    await signon(page, username, password)

    # refresh every shortcode
    await refresh_shortcodes(
        csv_path="/path/to/your/shortcodes.csv",
        output_json="/path/to/refreshed.json",
        context=context,
    )

    await browser.close()



Starting shortcode refresh …


TargetClosedError: BrowserType.launch: Target page, context or browser has been closed
Browser logs:

╔════════════════════════════════════════════════════════════════════════════════════════════════╗
║ Looks like you launched a headed browser without having a XServer running.                     ║
║ Set either 'headless: true' or use 'xvfb-run <your-playwright-app>' before running Playwright. ║
║                                                                                                ║
║ <3 Playwright Team                                                                             ║
╚════════════════════════════════════════════════════════════════════════════════════════════════╝
Call log:
  - <launching> /home/asdf/.cache/ms-playwright/chromium-1161/chrome-linux/chrome --disable-field-trial-config --disable-background-networking --disable-background-timer-throttling --disable-backgrounding-occluded-windows --disable-back-forward-cache --disable-breakpad --disable-client-side-phishing-detection --disable-component-extensions-with-background-pages --disable-component-update --no-default-browser-check --disable-default-apps --disable-dev-shm-usage --disable-extensions --disable-features=AcceptCHFrame,AutoExpandDetailsElement,AvoidUnnecessaryBeforeUnloadCheckSync,CertificateTransparencyComponentUpdater,DeferRendererTasksAfterInput,DestroyProfileOnBrowserClose,DialMediaRouteProvider,ExtensionManifestV2Disabled,GlobalMediaControls,HttpsUpgrades,ImprovedCookieControls,LazyFrameLoading,LensOverlay,MediaRouter,PaintHolding,ThirdPartyStoragePartitioning,Translate --allow-pre-commit-input --disable-hang-monitor --disable-ipc-flooding-protection --disable-popup-blocking --disable-prompt-on-repost --disable-renderer-backgrounding --force-color-profile=srgb --metrics-recording-only --no-first-run --enable-automation --password-store=basic --use-mock-keychain --no-service-autorun --export-tagged-pdf --disable-search-engine-choice-screen --unsafely-disable-devtools-self-xss-warnings --no-sandbox --user-data-dir=/tmp/playwright_chromiumdev_profile-clsMTC --remote-debugging-pipe --no-startup-window
  -   - <launched> pid=8538
  -   - [pid=8538][err] [8538:8538:0425/160921.055198:ERROR:ozone_platform_x11.cc(245)] Missing X server or $DISPLAY
  -   - [pid=8538][err] [8538:8538:0425/160921.056474:ERROR:env.cc(257)] The platform failed to initialize.  Exiting.
