## Scraping Instagram:

This notebook demonstrates how to download captions from public Instagram profiles using `Instaloader`. The process includes fetching posts, optionally filtering by date, and saving captions to JSON files.

I am saving the captions of posts from the LA food pages I follow and like best. These captions are what I analyze in the next [notebook](./restaurant_extractor.ipynb) to extract the restaurant information.

In [None]:
import instaloader
import json
from typing import List
from datetime import datetime
import concurrent.futures

In [None]:
def fetch_posts(profile):
    """
    Collect every post of a profile into a list.

    Parameters:
    - profile: instaloader.Profile: The profile whose posts are read via `get_posts()`.

    Returns:
    - List[instaloader.Post]: All posts yielded by `profile.get_posts()`, in iteration order.
    """
    # Drain the iterator eagerly so callers get a concrete list.
    collected = []
    collected.extend(profile.get_posts())
    return collected

def filter_post_by_date(post, from_datetime, to_datetime):
    """
    Return a post's caption when the post falls inside a date range.

    Parameters:
    - post: instaloader.Post: The post to inspect; its `date` and `caption` attributes are read.
    - from_datetime: datetime: Lower bound of the range (inclusive).
    - to_datetime: datetime: Upper bound of the range (inclusive).

    Returns:
    - str: `post.caption` if `post.date` lies within the range, otherwise None.
    """
    created_at = post.date

    # Guard clause: anything outside the inclusive window is rejected up front.
    if not (from_datetime <= created_at <= to_datetime):
        return None

    return post.caption

def get_instagram_captions_by_date(username: str, from_date: str, to_date: str) -> List[str]:
    """
    Get the captions of all posts from a public Instagram profile within a specified date range.

    Both boundary days are included in full: a post made at any time of day on
    `to_date` is considered inside the range.

    Parameters:
    - username: str: The username of the public Instagram profile.
    - from_date: str: The start date in "MM/DD/YYYY" format.
    - to_date: str: The end date in "MM/DD/YYYY" format.

    Returns:
    - List[str]: Captions of the posts within the date range, in the same order
      the posts are returned by `fetch_posts`. Posts for which
      `filter_post_by_date` yields None (out of range, or no caption) are
      omitted. An empty list is returned if any error occurs; the error is
      printed rather than raised.

    Example usage:
        captions = get_instagram_captions_by_date('username', '01/01/2021', '12/31/2021')
        print(captions)
    """
    try:
        # Parse the date strings into datetime objects
        from_datetime = datetime.strptime(from_date, "%m/%d/%Y")
        # strptime yields midnight at the START of the day; extend the upper
        # bound to the last microsecond so posts made during the end day count.
        to_datetime = datetime.strptime(to_date, "%m/%d/%Y").replace(
            hour=23, minute=59, second=59, microsecond=999999
        )
        # NOTE(review): both bounds are naive datetimes; this assumes post.date
        # is naive as well (presumably UTC) - confirm against Instaloader docs.

        # Create an instance of Instaloader
        loader = instaloader.Instaloader()

        # Load the profile
        profile = instaloader.Profile.from_username(loader.context, username)

        # The per-post check is a plain date comparison, so a thread pool adds
        # nothing; iterating sequentially also keeps the captions in a
        # deterministic order instead of task-completion order.
        captions = []
        for post in fetch_posts(profile):
            caption = filter_post_by_date(post, from_datetime, to_datetime)
            if caption is not None:
                captions.append(caption)

        return captions
    except instaloader.exceptions.ProfileNotExistsException:
        print(f"The profile '{username}' does not exist.")
        return []
    except instaloader.exceptions.PrivateProfileNotFollowedException:
        print(f"The profile '{username}' is private and cannot be accessed.")
        return []
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and keep going
        # so one failing account does not abort a batch run.
        print(f"An error occurred: {e}")
        return []

def get_instagram_captions(username: str) -> List[str]:
    """
    Collect the caption of every post on a public Instagram profile.

    Parameters:
    - username: str: The username of the public Instagram profile.

    Returns:
    - List[str]: One entry per post, holding that post's `caption` value, in
      the order the posts are yielded by `profile.get_posts()`.
      NOTE(review): `post.caption` is presumably None for posts without a
      caption, in which case None entries appear in the list - confirm.
    """
    # Build a loader and resolve the profile from its username
    loader = instaloader.Instaloader()
    profile = instaloader.Profile.from_username(loader.context, username)

    # One caption per post, no filtering
    return [post.caption for post in profile.get_posts()]

def save_captions_to_file(captions: List[str], filename: str) -> None:
    """
    Write a list of captions to disk as a single JSON array.

    The file is opened in write mode, so any existing content is replaced.
    Output is UTF-8, indented by four spaces, with non-ASCII characters
    written as-is rather than escaped.

    Parameters:
    - captions: List[str]: The list of captions to save.
    - filename: str: The name of the file to save the captions to.
    """
    with open(filename, 'w', encoding='utf-8') as out_file:
        json.dump(captions, fp=out_file, indent=4, ensure_ascii=False)


def save_instagram_captions(username: str, filename: str) -> None:
    """
    Get the captions of all posts from a public Instagram profile and save them to a file.

    Captions are streamed to disk one post at a time. Each caption is written
    as a JSON-encoded value followed by a comma and a newline, so the file is
    line-oriented and is NOT a single valid JSON document on its own. The file
    is opened in append mode: running this again for the same file adds the
    captions after the existing content instead of replacing it.

    The parent directory of `filename` is created if it does not exist yet.

    Parameters:
    - username: str: The username of the public Instagram profile.
    - filename: str: The name of the file to save the captions to.
    """
    # Local import keeps this block self-contained.
    import os

    # Create an instance of Instaloader
    L = instaloader.Instaloader()

    # Load the profile
    profile = instaloader.Profile.from_username(L.context, username)

    # open() does not create missing directories; make sure the target folder
    # (e.g. "posts/") exists so the first run does not fail.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Open the file in append mode
    with open(filename, 'a', encoding='utf-8') as f:
        # Iterate over each post in the profile
        for post in profile.get_posts():
            # Write the caption to the file as a new line
            f.write(json.dumps(post.caption) + ',\n')

In [None]:
# Accounts scraped in full: every caption is appended to a per-account file
# under posts/.
newer_account_names = [
    "dinertheory",
    "la.ethnic.eats",
    "lacoffeelist",
    "thelacountdown",
]

for account in newer_account_names:
    output_path = f"posts/{account}.json"
    save_instagram_captions(account, output_path)

In [None]:
# Accounts for which only captions inside a fixed date window are kept; each
# account's captions are written to "<account>_captions.json".
older_account_names = [
    "infatuation_la",
    "kcrwgoodfood",
    "ricklox",
]

for account in older_account_names:
    captions = get_instagram_captions_by_date(
        account, from_date='01/01/2023', to_date='05/14/2024'
    )
    save_captions_to_file(captions, filename=f"{account}_captions.json")