Question 1: Using both a coding-based and a non-coding-based approach, scrape a total of 75 pins
from Pinterest about something you are passionate about. Make sure your scraped data contains both
images and users’ reactions (e.g., number of shares, comments, repins) to the images (10 points).

I worked specifically on food images and scraped 66 image URLs using a Python-based Pinterest scraper. In addition to that, I used Apify — a no-code platform — to scrape more images for the second dataset.

In [None]:
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import json
import pandas as pd
from random import randint
import requests
from selenium import webdriver
from time import sleep

# Pinterest scraper: for each search topic, load the search page in Selenium,
# scroll to load pins, then fetch each pin page with `requests` and pull the
# image URL plus engagement counts out of the JSON embedded in the page.
# Results for all topics are written to 'TIF_Results.csv'.

# List of topics to scrape
topics = [
    "traditional indian food",
    "south indian food",
    "north indian thali",
    "indian sweets",
    "street food india",
    "homemade indian dishes"
]

# Number of scrolls per search page and maximum number of pins kept per topic
scroll_depth = 35
image_target = 20

# Seconds to wait for a pin page before giving up on it; without a timeout a
# single stalled connection would hang the whole run.
request_timeout = 30


def _collect_pin_urls(page_source):
    """Return absolute pin URLs found in *page_source*, de-duplicated in page order."""
    search_soup = BeautifulSoup(page_source, 'html.parser')
    hrefs = (anchor.get('href') for anchor in search_soup.find_all("a"))
    # Anchors without an href attribute yield None, which must be skipped
    # before the substring test.
    pin_urls = [
        f"https://www.pinterest.com{href}"
        for href in hrefs
        if href and "/pin/" in href
    ]
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # pins scraped are the first ones on the page rather than an arbitrary set.
    return list(dict.fromkeys(pin_urls))


def _extract_pin_data(pin_soup):
    """Return the "CloseupDetailQuery" pin payload embedded in a pin page, or {}.

    Scans every <script> tag whose text mentions "requestParameters", parses it
    as JSON and returns response -> data -> v3GetPinQuery -> data from the first
    script whose request name is "CloseupDetailQuery".
    """
    for script in pin_soup.find_all("script"):
        script_text = script.get_text()
        if "requestParameters" not in script_text:
            continue
        try:
            raw_json = json.loads(script_text)
            if raw_json["requestParameters"]["name"] == "CloseupDetailQuery":
                return raw_json["response"]["data"]["v3GetPinQuery"]["data"] or {}
        except (ValueError, KeyError, TypeError):
            # Not JSON, or JSON of a different shape: try the next script.
            continue
    return {}


def _build_pin_record(topic, url, img_url, data):
    """Build the output row for one pin from its extracted *data* payload.

    The row always contains "topic", "url" and "img_url"; the remaining
    columns are only added when *data* is non-empty.
    """
    record = {
        "topic": topic,
        "url": url,
        "img_url": img_url,
    }
    if not data:
        return record

    record["title"] = data.get("gridTitle", "")
    record["description"] = data.get("closeupUnifiedDescription", "")
    record["share_count"] = data.get("shareCount", 0)
    record["repin_count"] = data.get("repinCount", 0)

    # `or {}` / `or []` also covers keys that are present but null in the JSON.
    poster_raw = data.get("pinner") or {}
    record["poster_username"] = poster_raw.get("username", "")
    record["poster_followers"] = poster_raw.get("followerCount", 0)

    agg_pin_data = data.get("aggregatedPinData") or {}
    record["comment_count"] = agg_pin_data.get("commentCount", 0)

    # One column per reaction type plus the total across all types.
    reactions = 0
    for reaction in data.get("reactionCountsData") or []:
        count = reaction["reactionCount"]
        record[f"react_type_{reaction['reactionType']}"] = count
        reactions += count
    record["reactions"] = reactions
    return record


# Exclude Chrome's "enable-logging" switch (workaround for the Bluetooth
# console-error noise)
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-logging"])

# Initialize Selenium driver
driver = webdriver.Chrome(options=options)

# Rows collected across all topics
all_output = []

try:
    for topic in topics:
        print(f"Scraping images for: {topic}")

        # Build the Pinterest search URL for the current topic
        query = topic.replace(" ", "%20")
        search_url = "https://pinterest.com/search/pins/?q=" + query
        driver.get(search_url)

        # Scroll the page `scroll_depth` times to load more pins
        for _ in range(scroll_depth):
            driver.execute_script("window.scrollTo(1,100000)")
            print(f"Scrolling for {topic}...")
            sleep(randint(1, 4))

        urls = _collect_pin_urls(driver.page_source)

        # Spoof user-agent to avoid being blocked (a fresh one per topic)
        ua = UserAgent()
        headers = {'User-Agent': str(ua.chrome)}

        # Rows collected for the current topic
        topic_output = []

        for url in urls:
            if len(topic_output) >= image_target:  # Stop when images target is met
                break

            print(f"Scraping URL for {topic}: {url}")
            try:
                response = requests.get(url, headers=headers, timeout=request_timeout)
            except requests.RequestException as exc:
                # Skip a pin that cannot be fetched instead of aborting the
                # whole run and losing everything collected so far.
                print(f"Request failed for {url}: {exc}")
                sleep(randint(5, 30))
                continue

            pin_soup = BeautifulSoup(response.content, 'html.parser')

            # Image URL from the page; this class string is tied to the page
            # markup, and img_url is None whenever no such <img> is found.
            img = pin_soup.find("img", class_="hCL kVc L4E MIw")
            img_url = img.get("src") if img else None

            data = _extract_pin_data(pin_soup)
            topic_output.append(_build_pin_record(topic, url, img_url, data))

            # Random wait to avoid getting blocked
            sleep(randint(5, 30))

        # Add the current topic's rows to the main output list
        all_output.extend(topic_output)
finally:
    # Always close the browser, even if scraping fails part-way through
    driver.quit()

# Convert the results to a DataFrame and save to CSV
pin_df = pd.DataFrame(all_output)
pin_df.to_csv('TIF_Results.csv', index=True, index_label="Index")

print("Scraping completed.")

