In [4]:
import asyncio
import os
import json
import pandas as pd
from datetime import datetime
from playwright.async_api import async_playwright, TimeoutError
from dotenv import load_dotenv
import random

########## CONFIGURATION ##########

#set seed to unix timestamp
random.seed(int(datetime.now().timestamp()))

# Load environment variables
load_dotenv(override=True)
username = os.getenv("USERNAME")
password = os.getenv("PASSWORD")

# Asyncio settings
# Set the maximum number of concurrent pages
MAX_PARALLEL_PAGES = 4
# Set the range for random jitter
# This is used to add a random delay before each request to avoid being blocked
JITTER_RANGE = (1, 2)


MAX_SCROLL_ATTEMPTS = 30
REPEAT_SCROLL_ELEMENTS_LIMIT = 5

GLOBAL_SEM = asyncio.Semaphore(MAX_PARALLEL_PAGES)

#######

In [3]:
build_shortcode_index("../scraper/ig_scraperv3/json/all_instagram_data.json")

🔎 Built handle-lookup for 2963 shortcodes


{'DIe1T1bg2Wf': 'cristiano',
 'DIXBP7gA9uA': 'cristiano',
 'DIL-t6qoUNX': 'cristiano',
 'DIKJQTdgCtD': 'cristiano',
 'DICffjKAj3o': 'cristiano',
 'DH9ZbAnAUF2': 'cristiano',
 'DH0eE6RgBLg': 'cristiano',
 'DHy-NAqAJ-d': 'cristiano',
 'DHj4YLnI0ui': 'cristiano',
 'DHg6maaxCjL': 'cristiano',
 'DHcDuWfIDRz': 'cristiano',
 'DHZUBFdgBPP': 'cristiano',
 'DHWttpnAWgt': 'cristiano',
 'DHMegD4gG_E': 'cristiano',
 'DHCGhubAPkQ': 'cristiano',
 'DG8bD0Co7_6': 'cristiano',
 'DG2yopigqIn': 'cristiano',
 'DG1Rz95AIPn': 'cristiano',
 'DGxqbv9oVTD': 'cristiano',
 'DIUiaE9BgZJ': 'kyliejenner',
 'DIP18DMva3s': 'kyliejenner',
 'DH84u-TxBfT': 'kyliejenner',
 'DHoSX0DJbf6': 'kyliejenner',
 'DHl_kUwSh6_': 'kyliejenner',
 'DHY1dVYJuf7': 'kyliejenner',
 'DHWE1X4RQDV': 'kyliejenner',
 'DHToGbyJKls': 'kyliejenner',
 'DHRHD84RQL2': 'kyliejenner',
 'DHPQkZupTmn': 'kyliejenner',
 'DHOQFKpRpHg': 'kyliejenner',
 'DHLmYtnxcWM': 'kyliejenner',
 'DHJgU5Iv2v2': 'kyliejenner',
 'DHCaYBBy9UD': 'kyliejenner',
 'DG36MQeS7AK':

In [2]:
import os, json, re

def build_shortcode_index(old_json_path: str) -> dict[str, str]:
    """
    Returns mapping {shortcode: account_handle} using the old data file.

    If the same shortcode appears under multiple handles, the first one wins.
    """
    index = {}
    if not os.path.exists(old_json_path):
        print(f"⚠️ old-json file '{old_json_path}' not found; cannot build account lookup.")
        return index

    with open(old_json_path) as f:
        old = json.load(f)

    pattern = re.compile(r"/p/([^/]+)/")   # capture whatever’s between '/p/' and the next '/'

    for handle, posts in old.items():
        for post_path in posts.keys():    # e.g. "/cristiano/p/DIe1T1bg2Wf/"
            m = pattern.search(post_path)
            if not m:
                continue
            code = m.group(1)              # "DIe1T1bg2Wf"
            index.setdefault(code, handle)

    print(f"🔎 Built handle-lookup for {len(index)} shortcodes")
    return index


########

In [5]:
# Define login function
async def signon(page, username, password):
    try:
        await page.goto("https://www.instagram.com/accounts/login/", wait_until="networkidle")
        await page.wait_for_selector('input[name="username"]', timeout=10000)
        await page.fill('input[name="username"]', username)
        await page.fill('input[name="password"]', password)
        await page.wait_for_timeout(500)
        print("Hitting login button")

        # Wait until the element with the aria-label "Log in" is visible.
        await page.wait_for_selector("div[aria-label='Log in']", state="visible")
        # Now attempt a click on it.
        await page.click("div[aria-label='Log in']")

        #wait for networkidle
        await page.wait_for_load_state("networkidle")
        
        #look for this: <span class="x1lliihq x193iq5w x6ikm8r x10wlt62 xlyipyv xuxw1ft">Save info</span>
        # the button is its 5th parent
        await page.wait_for_selector("span:has-text('Save info')", timeout=15000)
        # Click the button
        await page.click("span:has-text('Save info')", timeout=5000)
        
        # await page.wait_for_load_state("networkidle")
        
        
        if "login" in page.url:
            raise ValueError("Login failed: Incorrect username or password.")
        print("✅ Successfully logged in")
    except TimeoutError:
        raise TimeoutError("Login timed out. Check credentials or network connection.")


In [7]:
async def _fetch_shortcode(context, handle: str, code: str,
                           idx: int, total: int):
    """
    Hits   https://www.instagram.com/{handle}/p/{code}?img_index=1
    """
    await asyncio.sleep(random.uniform(*JITTER_RANGE))

    async with GLOBAL_SEM:
        page = await context.new_page()
        try:
            async with page.expect_response(
                lambda r: "/api/v1/media/" in r.url and "/info/" in r.url,
                timeout=5_000
            ) as ri:
                url = f"https://www.instagram.com/{handle}/p/{code}?img_index=1"
                await page.goto(url)

            data = await (await ri.value).json()
            return idx, code, data

        except TimeoutError:
            print(f"⏱️ Timeout: /{handle}/p/{code}")
            return idx, code, None

        except Exception as e:
            print(f"❌ {code}: {e}")
            return idx, code, None

        finally:
            await page.close()


async def refresh_shortcodes(
    csv_path: str,
    old_json_path: str,
    output_json: str,
    context: "BrowserContext",
):
    df      = pd.read_csv(csv_path)

    print(f"🔎 Found {len(df)} shortcodes in {csv_path}")
    codes   = df["shortcode"].dropna().unique().tolist()
    total   = len(codes)
    print(f"Filtered to {total} unique shortcodes")

    # ---------- NEW: build reverse index --------------------------
    handle_by_code = build_shortcode_index(old_json_path)

    # keep previous refresh if any
    store = {}
    if os.path.exists(output_json):
        with open(output_json) as f:
            store = json.load(f)

    # ---------- FAN-OUT -------------------------------------------
    tasks = []
    skipped = 0
    for i, code in enumerate(codes, start=1):
        handle = handle_by_code.get(code)
        if not handle:
            skipped += 1
            continue                      # we don't know whose post this is
        tasks.append(
            asyncio.create_task(_fetch_shortcode(context, handle, code, i, total))
        )

    if skipped:
        print(f"⚠️  {skipped} shortcodes had no handle in old JSON and were skipped")

    # ---------- FAN-IN --------------------------------------------
    for done_count, fut in enumerate(asyncio.as_completed(tasks), start=1):
        idx, code, data = await fut
        print(f"📊 finished {done_count}/{len(tasks)}")

        if data:
            store[code] = data

    with open(output_json, "w") as f:
        json.dump(store, f, indent=2)

    print(f"✅  Saved {len(store)} refreshed posts → {output_json}")


In [None]:

print("Starting shortcode refresh …")
async with async_playwright() as p:
    browser = await p.chromium.launch(headless=False)
    device  = p.devices["Pixel 5"]
    context = await browser.new_context(**device)
    page    = await context.new_page()

    # (optional) cookie load / login exactly as in your old routine
    await signon(page, username, password)

    # refresh every shortcode
    await refresh_shortcodes(
        csv_path="all_instagram_data.csv",
        output_json="refreshed.json",
        old_json_path="../scraper/ig_scraperv3/json/all_instagram_data.json",
        context=context,
    )

    await browser.close()



Starting shortcode refresh …
Hitting login button
✅ Successfully logged in
🔎 Found 1318 shortcodes in all_instagram_data.csv
Filtered to 1300 unique shortcodes
⚠️ old-json file 'all_instagram_data.json' not found; cannot build account lookup.
⚠️  1300 shortcodes had no handle in old JSON and were skipped
✅  Saved 499 refreshed posts → refreshed.json
