In [5]:
import csv
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
import os
from PIL import Image
from io import BytesIO
import uuid
from dotenv import load_dotenv




In [25]:
# Credentials come from the process environment; either value is None when
# its variable is not set.
# NOTE(review): load_dotenv is imported at the top of the notebook, but no
# visible cell calls it - confirm the variables are exported before this
# cell runs, or that a .env file is loaded elsewhere.
USERNAME = os.environ.get("INSTAGRAM_USERNAME")
PASSWORD = os.environ.get("INSTAGRAM_PASSWORD")

# Site root and the account whose page is opened by navigate_to_profile().
BASE_URL = "https://www.instagram.com"
PROFILE = "loker_it"

# NOTE(review): CSV_FILE is not referenced by any function visible in this
# notebook (extract_and_download_posts writes to "posts.csv") - confirm
# which file name is intended.
CSV_FILE = "instagram_posts.csv"

# Seconds slept by scroll_to_end() after each scroll or click.
SCROLL_PAUSE_TIME = 2


In [27]:
def login():
    """Sign in to Instagram with the configured USERNAME and PASSWORD.

    Uses the module-level ``driver`` and ``wait`` objects: opens the login
    page, waits for the username field, types the credentials and submits the
    form by sending RETURN after the password.

    Raises:
        ValueError: if USERNAME or PASSWORD is unset or empty. Checked before
            the browser navigates, so the failure names the missing variable
            instead of surfacing as a TypeError from send_keys().
    """
    missing = [
        env_name
        for env_name, value in (
            ("INSTAGRAM_USERNAME", USERNAME),
            ("INSTAGRAM_PASSWORD", PASSWORD),
        )
        if not value
    ]
    if missing:
        raise ValueError(
            "Missing Instagram credentials; set the environment variable(s): "
            + ", ".join(missing)
        )

    driver.get(f"{BASE_URL}/accounts/login/")
    wait.until(EC.presence_of_element_located((By.NAME, "username"))).send_keys(USERNAME)
    driver.find_element(By.NAME, "password").send_keys(PASSWORD + Keys.RETURN)
    time.sleep(5)  # Wait for login to complete

def navigate_to_profile():
    """Load the page of the configured PROFILE in the shared ``driver``."""
    profile_url = f"{BASE_URL}/{PROFILE}/"
    driver.get(profile_url)

def download_image(url, filename, timeout=30):
    """Stream the resource at ``url`` into the file ``filename``.

    Args:
        url: Address of the image to fetch.
        filename: Destination path; its parent directory is created when it
            does not exist yet.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the run indefinitely.

    Request failures (connection errors, timeouts, HTTP error statuses) are
    reported with a printed message instead of being raised.
    """
    try:
        # The context manager releases the streamed connection even when
        # writing the file fails part-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            target_dir = os.path.dirname(filename)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            with open(filename, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

        print(f"Image downloaded: {filename}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download image: {e}")

def get_posts():
    """Wait for the ``article div img`` elements and return them.

    The lookup goes through the module-level ``wait``; the caller treats the
    returned elements as the profile's posts.
    """
    image_locator = (By.CSS_SELECTOR, "article div img")
    return wait.until(EC.presence_of_all_elements_located(image_locator))

def combine_images_vertically(images_list, timeout=30):
    """Download the given images and stack them top to bottom in one image.

    Args:
        images_list: URLs of the images, in the order they should appear.
        timeout: Seconds passed to each ``requests.get`` call.

    Returns:
        A new RGB image as wide as the widest input and as tall as all inputs
        together. Each input is pasted at the left edge, below the previous
        one.

    Raises:
        ValueError: if ``images_list`` is empty.
        requests.exceptions.RequestException: if a download fails or returns
            an HTTP error status.
    """
    if not images_list:
        raise ValueError("images_list must contain at least one image URL")

    imgs = []
    for url in images_list:
        response = requests.get(url, timeout=timeout)
        # Fail on HTTP errors here; otherwise the error page's body would be
        # handed to Image.open and surface as an unrelated decoding error.
        response.raise_for_status()
        imgs.append(Image.open(BytesIO(response.content)))

    max_width = max(img.width for img in imgs)
    total_height = sum(img.height for img in imgs)
    combined_image = Image.new('RGB', (max_width, total_height))

    y_offset = 0
    for img in imgs:
        combined_image.paste(img, (0, y_offset))
        y_offset += img.height
    return combined_image

def scroll_to_end(driver, wait):
    """Scroll the page down repeatedly until its height stops changing.

    After every scroll the document height is compared with the previous
    value. When it has stopped growing, a 'Load More' button is looked for
    through ``wait`` and clicked if present. If the button never becomes
    clickable, the page is scrolled back to the top and the height is checked
    one last time; an unchanged height ends the loop.

    Args:
        driver: Object whose ``execute_script`` runs JavaScript in the page.
        wait: Object whose ``until`` waits for the 'Load More' button.
    """

    def page_height():
        return driver.execute_script("return document.body.scrollHeight")

    last_height = page_height()

    while True:
        # Jump to the bottom and give the page time to load more content.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)

        new_height = page_height()
        if new_height != last_height:
            # The page grew; remember the new height and scroll again.
            last_height = new_height
            continue

        # Height is stable: try the 'Load More' button before giving up.
        try:
            load_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[text()='Load More']")))
            load_more_button.click()
            time.sleep(SCROLL_PAUSE_TIME)
            # New content is expected after the click, so re-read the baseline.
            last_height = page_height()
        except TimeoutException:
            print("Reached the end of the page or there is no 'Load More' button.")
            driver.execute_script("window.scrollTo(0, 0);")
            time.sleep(SCROLL_PAUSE_TIME)

            new_height = page_height()
            if new_height == last_height:
                print("Confirmed the end of the page after scrolling back to the top.")
                break
            # Content appeared while scrolling up; keep going from the new height.
            last_height = new_height
            
def _collect_dialog_image_urls():
    """Return the distinct image URLs of the post open in the viewer dialog.

    Reads the dialog's image, then keeps clicking the button labelled 'Next',
    recording each image URL not seen before, until an element lookup fails
    with NoSuchElementException.
    """
    dialog_image = (By.CSS_SELECTOR, "div[role='dialog'] img")
    image_urls = [driver.find_element(*dialog_image).get_attribute('src')]
    while True:
        try:
            driver.find_element(By.XPATH, "//button[@aria-label='Next']").click()
            time.sleep(1)  # Wait for the next image to load
            image_url = driver.find_element(*dialog_image).get_attribute('src')
        except NoSuchElementException:
            return image_urls  # No more images in the carousel
        if image_url not in image_urls:
            image_urls.append(image_url)


def extract_and_download_posts(retry_limit=5):
    """Walk the profile's posts in the viewer dialog and archive those from 2022.

    The first post on the profile page is opened and the posts are then
    visited one after another with the dialog's next-post button. For every
    post whose timestamp year is 2022, the image is saved under
    ``downloaded_images/`` (several images are stacked into one file) and a
    row ``Image URL, Timestamp, Filename`` is appended to ``posts.csv``.
    Posts from later years are skipped. The first post from before 2022, or a
    missing next-post button, ends the run.

    Uses the module-level ``driver`` and ``wait`` objects. A failure while
    handling one post is printed and the walk continues with the next post.
    Exceptions raised by get_posts() or while opening the first post are not
    caught here.

    Args:
        retry_limit: Accepted for backward compatibility; currently unused.
    """
    next_post_xpath = "//div[contains(@class,'_aaqg _aaqh')]//button[@type='button']"
    output_dir = "downloaded_images"

    # Every image is written into this directory; without it each save fails
    # with FileNotFoundError and no post is ever archived.
    os.makedirs(output_dir, exist_ok=True)

    # Navigate to the profile
    navigate_to_profile()
    time.sleep(2)

    post_counter = 0
    with open("posts.csv", mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # If the file is new, write the header
        if file.tell() == 0:
            writer.writerow(['Image URL', 'Timestamp', 'Filename'])

        posts = get_posts()
        if posts:
            first_post = posts[0]
            driver.execute_script("arguments[0].scrollIntoView();", first_post)
            wait.until(EC.element_to_be_clickable(first_post))
            driver.execute_script("arguments[0].click();", first_post)
            time.sleep(2)  # wait for the post to open
        else:
            print("No posts found on the profile page.")

        # `posts` is not modified below: the loop runs until a `break` when a
        # post was opened, and is skipped entirely when there was none.
        while posts:
            try:
                # Wait until the image is loaded in the opened post
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[role='dialog'] img")))

                timestamp_element = driver.find_element(By.CSS_SELECTOR, "div[role='dialog'] time")
                timestamp = timestamp_element.get_attribute('datetime')
                year = timestamp.split('-')[0]

                if year < '2022':
                    print(f"Encountered a post from {year}, which is before 2022. Ending the process.")
                    break

                # Compare the year itself: a substring test on the whole
                # timestamp could match digits outside the year.
                if year == '2022':
                    image_urls = _collect_dialog_image_urls()

                    filename_prefix = timestamp.split('T')[0] + "_post_" + PROFILE + str(post_counter)
                    if len(image_urls) > 1:
                        # Combine images if there are multiple
                        combined_image = combine_images_vertically(image_urls)
                        combined_filename = f"{filename_prefix}_combined.jpg"
                        combined_image.save(f"{output_dir}/{combined_filename}")
                        writer.writerow(['; '.join(image_urls), timestamp, combined_filename])
                    else:
                        # Download and save the single image
                        single_filename = f"{filename_prefix}_image.jpg"
                        download_image(image_urls[0], f"{output_dir}/{single_filename}")
                        writer.writerow([image_urls[0], timestamp, single_filename])

                    post_counter += 1

                    # Log progress intermittently
                    if post_counter % 20 == 0:
                        print(f"Processed {post_counter} posts.")

            except Exception as e:
                print(f"An error occurred at post {post_counter}: {e}. Moving to the next post.")

            # Single navigation step for saved, skipped and failed posts.
            # Moving on only after the current post was handled means the
            # final post is processed before the missing button ends the run.
            try:
                driver.find_element(By.XPATH, next_post_xpath).click()
                time.sleep(1)
            except NoSuchElementException:
                print("Reached the end of the posts or no next button found. Ending the process.")
                break
            except Exception as next_post_exception:
                print(f"Failed to navigate to the next post: {next_post_exception}. Exiting...")
                break

        print("Finished extracting posts.")

In [None]:
# Initialise the driver using ChromeDriverManager: the value returned by
# ChromeDriverManager().install() is handed to the Chrome Service.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
# `driver` and `wait` are read as module-level globals by the functions
# defined above, so this cell must run before any of them is called.
# 3 is the timeout argument given to WebDriverWait.
wait = WebDriverWait(driver, 3)
login()

In [28]:
# Start the process. The driver-initialisation cell must have been run first:
# extract_and_download_posts() uses the module-level `driver` and `wait`.
extract_and_download_posts()