In [2]:
!python3 -m pip install playwright pandas python-dotenv
!python3 -m playwright install


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [5]:
import asyncio
import os
import json
import pandas as pd
from playwright.async_api import async_playwright, TimeoutError
from dotenv import load_dotenv

# Read the account credentials from the environment after loading .env;
# override=True is passed so values from .env replace variables already set.
load_dotenv(override=True)
username, password = (os.environ.get(key) for key in ("USERNAME", "PASSWORD"))

# Define login function
async def signon(page, username, password):
    """Log in to Instagram through the web login form.

    Args:
        page: Playwright page used to drive the login flow.
        username: Account user name typed into the form.
        password: Account password typed into the form.

    Raises:
        ValueError: If either credential is missing or empty, or if the page
            URL still contains "login" after the form has been submitted.
        TimeoutError: If a step of the login flow times out.
    """
    # Fail fast with a clear message: both values come from os.getenv() and are
    # None when the .env file lacks them, which would otherwise be handed
    # straight to page.fill().
    if not username or not password:
        raise ValueError("Login failed: USERNAME and PASSWORD must both be set.")

    try:
        await page.goto("https://www.instagram.com/accounts/login/", wait_until="networkidle")
        await page.wait_for_selector('input[name="username"]', timeout=10000)
        await page.fill('input[name="username"]', username)
        await page.fill('input[name="password"]', password)
        await page.wait_for_timeout(500)
        # NOTE(review): selector kept as-is; confirm it still matches the
        # login form's submit control.
        await page.locator('[role="button"][aria-label-id="replay"]').click()
        # Fixed pause to let the post-login navigation happen before checking the URL.
        await page.wait_for_timeout(5000)
        if "login" in page.url:
            raise ValueError("Login failed: Incorrect username or password.")
        print("✅ Successfully logged in")
    except TimeoutError as exc:
        # Keep the original timeout attached so the failing step stays visible.
        raise TimeoutError("Login timed out. Check credentials or network connection.") from exc

# Define helper to get total posts
async def get_total_posts(page):
    """Read the post count shown in the profile header of the current page.

    The text of the first element matching ``header section ul li span`` is
    fetched from the page and parsed in Python.

    Args:
        page: Playwright page already showing a profile.

    Returns:
        The post count as an int, or ``float('inf')`` when the element is
        missing or its text contains no number.
    """
    import re

    raw_text = await page.evaluate("""
        () => {
            const element = document.querySelector('header section ul li span');
            return element ? element.innerText : null;
        }
    """)
    if not raw_text:
        return float('inf')

    # Digits with any number of thousands separators, an optional decimal part
    # and an optional K/M/B suffix directly attached (e.g. "1,234,567", "1.2M"),
    # should the page render the count in abbreviated form.
    match = re.search(r"(\d[\d,]*(?:\.\d+)?)([kKmMbB]\b)?", str(raw_text))
    if match is None:
        return float('inf')

    number = float(match.group(1).replace(",", ""))
    suffix = (match.group(2) or "").lower()
    multiplier = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}.get(suffix, 1)
    return int(round(number * multiplier))

# Scrape posts for a single user
async def scrape_instagram_posts(userhandle: str, max_posts: int, context, page):
    """Collect post links from a profile and fetch each post's media-info JSON.

    The profile page is scrolled until enough distinct links containing
    ``/p/`` have been gathered or the scroll budget is used up. Each link is
    then opened in its own page and the JSON body of the response whose URL
    contains both ``/api/v1/media/`` and ``/info/`` is recorded.

    Args:
        userhandle: Profile handle appended to the Instagram base URL.
        max_posts: Upper bound on the number of posts to fetch.
        context: Browser context used to open one page per post.
        page: Page used to browse and scroll the profile.

    Returns:
        Dict mapping each post href to its decoded JSON payload. Posts that
        hit a timeout are reported and left out.
    """
    await page.goto(f"https://www.instagram.com/{userhandle}/")
    await page.wait_for_load_state("networkidle")

    total_posts = await get_total_posts(page)
    scrape_limit = min(max_posts, total_posts)
    print(f"🔎 {userhandle}: Total posts {total_posts}, scraping up to {scrape_limit}...")

    MAX_SCROLL_ATTEMPTS = 20
    # A dict serves as an insertion-ordered set of hrefs.
    unique_posts = {}

    for scroll_attempts in range(MAX_SCROLL_ATTEMPTS):
        # Same guard the loop is entered with: stop once the limit is covered.
        if not len(unique_posts) < scrape_limit:
            break

        for anchor in await page.query_selector_all("a:has(div._aagu)"):
            href = await anchor.get_attribute("href")
            if href and "/p/" in href:
                unique_posts.setdefault(href, None)
        print(f"🔄 Scrolled {scroll_attempts + 1}x — Collected: {len(unique_posts)}")

        # Skip the extra scroll and pause when nothing more is needed.
        if len(unique_posts) >= scrape_limit:
            break
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await asyncio.sleep(1.5)

    post_hrefs = list(unique_posts)[:scrape_limit]
    results = {}

    def is_media_info(response):
        # Predicate selecting the media-info API response for a post.
        return "/api/v1/media/" in response.url and "/info/" in response.url

    for i, href in enumerate(post_hrefs, start=1):
        new_page = await context.new_page()
        try:
            async with new_page.expect_response(is_media_info, timeout=5000) as response_info:
                await new_page.goto(f"https://www.instagram.com{href}")
            response = await response_info.value
            results[href] = await response.json()
            print(f"✅ ({i}/{len(post_hrefs)}) {href}")
        except TimeoutError:
            print(f"⏱️ Timeout: {href}")
        finally:
            # The per-post page is closed whether or not the fetch succeeded.
            await new_page.close()

    return results

# Batch scrape users from a CSV and save to JSON
async def scrape_users_from_csv(csv_path: str, max_posts_per_user: int, output_json: str):
    """Scrape every handle listed in a CSV file and accumulate results in a JSON file.

    Handles are read from the first column of a header-less CSV. Results
    already present in ``output_json`` are loaded first, and handles found
    there are skipped, so an interrupted run can be resumed. The session is
    restored from ``cookies.json`` when that file exists; otherwise a login is
    performed with the module-level ``username`` / ``password`` and the
    resulting cookies are written to ``cookies.json``.

    Args:
        csv_path: Path to the CSV file of handles (no header row).
        max_posts_per_user: Maximum number of posts to fetch per handle.
        output_json: Path of the JSON file that stores the accumulated results.
    """
    df = pd.read_csv(csv_path, header=None)
    usernames = df[0].dropna().unique().tolist()

    if os.path.exists(output_json):
        with open(output_json, 'r') as f:
            all_results = json.load(f)
    else:
        all_results = {}

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            device = p.devices["Pixel 5"]
            context = await browser.new_context(**device)
            page = await context.new_page()

            if os.path.exists("cookies.json"):
                print("🔄 Loading cookies...")
                with open("cookies.json", "r") as f:
                    cookies = json.load(f)
                await context.add_cookies(cookies)
            else:
                print("🔐 Logging in...")
                await signon(page, username, password)
                cookies = await context.cookies()
                with open("cookies.json", "w") as f:
                    json.dump(cookies, f)

            for user in usernames:
                if user in all_results:
                    print(f"⏩ Skipping {user} (already scraped)")
                    continue
                try:
                    result = await scrape_instagram_posts(user, max_posts_per_user, context, page)
                    all_results[user] = result
                    # Write to a sibling temp file and swap it in, so an
                    # interruption mid-dump cannot leave a truncated
                    # output_json that breaks json.load on the next run.
                    tmp_path = f"{output_json}.tmp"
                    with open(tmp_path, 'w') as f:
                        json.dump(all_results, f, indent=2)
                    os.replace(tmp_path, output_json)
                except Exception as e:
                    # Best effort: report the failure and move on to the next handle.
                    print(f"❌ Error scraping {user}: {e}")
        finally:
            # Close the browser even when login or setup raises.
            await browser.close()

        print(f"\n✅ All scraping complete. Results saved to {output_json}")



In [6]:
# Run the batch scrape: handles come from usernames.csv, at most 20 posts are
# requested per user, and results are stored in all_instagram_data.json.
await scrape_users_from_csv("usernames.csv", max_posts_per_user=20, output_json="all_instagram_data.json")

🔄 Loading cookies...
🔎 cristiano: Total posts 3869, scraping up to 20...
🔄 Scrolled 1x — Collected: 16
🔄 Scrolled 2x — Collected: 16
🔄 Scrolled 3x — Collected: 26
✅ (1/20) /cristiano/p/DIe1T1bg2Wf/
✅ (2/20) /cristiano/p/DIXBP7gA9uA/
✅ (3/20) /cristiano/p/DIL-t6qoUNX/
✅ (4/20) /cristiano/p/DIKJQTdgCtD/
✅ (5/20) /cristiano/p/DICffjKAj3o/
✅ (6/20) /cristiano/p/DH9ZbAnAUF2/
✅ (7/20) /cristiano/p/DH0eE6RgBLg/
✅ (8/20) /cristiano/p/DHy-NAqAJ-d/
✅ (9/20) /cristiano/p/DHj4YLnI0ui/
✅ (10/20) /cristiano/p/DHg6maaxCjL/
✅ (11/20) /cristiano/p/DHcDuWfIDRz/
✅ (12/20) /cristiano/p/DHZUBFdgBPP/
✅ (13/20) /cristiano/p/DHWttpnAWgt/
⏱️ Timeout: /cristiano/p/DHTt2KDoBTH/
✅ (15/20) /cristiano/p/DHMegD4gG_E/
✅ (16/20) /cristiano/p/DHCGhubAPkQ/
✅ (17/20) /cristiano/p/DG8bD0Co7_6/
✅ (18/20) /cristiano/p/DG2yopigqIn/
✅ (19/20) /cristiano/p/DG1Rz95AIPn/
✅ (20/20) /cristiano/p/DGxqbv9oVTD/
🔎 kyliejenner: Total posts 7190, scraping up to 20...
🔄 Scrolled 1x — Collected: 21
✅ (1/20) /kyliejenner/p/DIUiaE9BgZJ/
✅ (