In [None]:
%pip install selenium
%pip install webdriver-manager
%pip install bs4
%pip install pandas

Collecting selenium
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.2-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.0/476.0 kB[0m [31m44.

In [None]:
import time
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Setup Chrome options
options = Options()
options.add_argument('--headless')  # Run in headless mode (no GUI)
options.add_argument('--disable-gpu')
# Chrome's --window-size switch takes "width,height" (comma-separated);
# the previous "1920x1080" value is not in that documented format.
options.add_argument('--window-size=1920,1080')
options.add_argument('--no-sandbox')  # Turn off the Chrome sandbox
options.add_argument('--disable-dev-shm-usage')  # Do not rely on /dev/shm for shared memory

# Initialize the web driver with options
driver = webdriver.Chrome(options=options)

# URL for scraping
base_url = "https://appfigures.com/top-apps/ios-app-store/united-states/iphone/top-free"
# One dict per scraped app is appended to this list later in the cell
scraped_data = []

# Visit the scraping URL
driver.get(base_url)
time.sleep(5)  # Initial wait for the page to load

# Scroll parameters
SCROLL_PAUSE_TIME = 1  # Seconds to wait after each scroll step
SCROLL_INCREMENT = 300  # Amount to scroll down each time
MAX_ATTEMPTS = 20  # Consecutive scrolls without new content before giving up

# Helper: snapshot of the app containers present in the page right now
def get_current_app_elements():
    """Return the list of app container elements currently found in the DOM.

    The lookup is performed on the module-level ``driver`` every time the
    function is called, so successive calls reflect whatever the page has
    rendered at that moment.
    """
    # NOTE(review): the class token looks auto-generated and may change when
    # the site is redeployed - confirm the selector still matches.
    app_container_xpath = '//*[@id="app-root"]/span/div[4]//div[contains(@class, "s-1362551351-0")]'
    return driver.find_elements(By.XPATH, app_container_xpath)

# Nudge the page downward in small steps; stop once MAX_ATTEMPTS consecutive
# steps have failed to change the number of loaded app containers.
previous_data_count = 0
attempts = 0

while attempts < MAX_ATTEMPTS:
    # How many app containers does the page hold at this moment?
    current_data_count = len(get_current_app_elements())
    print(f"Attempt {attempts + 1}: Loaded elements = {current_data_count}")

    # An unchanged count extends the stall streak; any change resets it.
    # Recording the count unconditionally is a no-op when nothing changed.
    attempts = attempts + 1 if current_data_count == previous_data_count else 0
    previous_data_count = current_data_count

    # Advance the viewport, then give the page time to react
    driver.execute_script("window.scrollBy(0, arguments[0]);", SCROLL_INCREMENT)
    time.sleep(SCROLL_PAUSE_TIME)

print(f"Page loading completed. Total number of applications loaded: {previous_data_count}")

# Extract data from the fully loaded page


def _stripped_attribute(element, name):
    """Return attribute ``name`` of ``element`` with surrounding whitespace removed.

    Returns None when ``element`` is None, the attribute is absent, or the
    lookup fails, so a missing field never aborts the scrape.
    """
    if element is None:
        return None
    try:
        value = element.get_attribute(name)
    except Exception:
        return None
    return None if value is None else value.strip()


def _stripped_text(search_context, xpath):
    """Return the stripped text of the first node matching ``xpath``.

    ``search_context`` is anything exposing ``find_element`` (the driver or an
    element). Returns None when the node cannot be found or read.
    """
    try:
        return search_context.find_element(By.XPATH, xpath).text.strip()
    except Exception:
        return None


# The downloads and revenue figures are read from two absolute paths on the
# detail page that differ only in one positional index (1 and 2 respectively).
_DETAIL_METRIC_XPATH = (
    '//*[@id="react-components-overlay-provider-root"]/span/span/div/div/div[1]'
    '/div[3]/div[1]/div[2]/div[1]/div[{index}]/div/div[2]/div[1]/div'
)

# try/finally guarantees the browser is shut down even if a navigation or
# lookup raises part-way through the run; rows collected so far remain in
# `scraped_data`.
try:
    app_elements = get_current_app_elements()

    # Loop-invariant values: the listing page URL (base for relative links)
    # and the handle of the tab we must return to after each detail visit.
    listing_url = driver.current_url
    main_window = driver.current_window_handle

    # Iterate over each app container
    for rank, element in enumerate(app_elements, start=1):
        # Title and link are attributes of the same anchor, so find it once
        try:
            anchor = element.find_element(By.XPATH, './/a[contains(@class, "s-4262409-0")]')
        except Exception:
            anchor = None
        app_title = _stripped_attribute(anchor, 'title')
        app_link = _stripped_attribute(anchor, 'href')

        # Resolve relative hrefs (including root-relative ones) against the listing URL
        if app_link and not app_link.startswith('http'):
            app_link = urljoin(listing_url, app_link)

        # Developer name is whatever follows the last "·" separator, if any
        developer_text = _stripped_text(element, './/div[contains(@class, "s1376732636-0")]')
        developer_account = (
            developer_text.split("·")[-1].strip() if developer_text is not None else None
        )

        ios_app_store_id = None
        estimated_downloads = None
        estimated_revenue = None

        # If app link is available, open it in a new tab to get detailed information
        if app_link:
            driver.switch_to.new_window('tab')  # opens the tab and focuses it
            try:
                driver.get(app_link)
                time.sleep(5)  # Wait for the page to load fully

                # Parse page source with BeautifulSoup to read the App Store ID
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                id_node = soup.select_one("span.s-1674543659-0.s1901059984-1")
                if id_node is not None:
                    ios_app_store_id = id_node.get_text().strip()

                estimated_downloads = _stripped_text(driver, _DETAIL_METRIC_XPATH.format(index=1))
                estimated_revenue = _stripped_text(driver, _DETAIL_METRIC_XPATH.format(index=2))
            finally:
                # Always close the detail tab and return to the listing tab,
                # otherwise a failed page would leave the loop on the wrong tab
                driver.close()
                driver.switch_to.window(main_window)

        # Append data to the list
        scraped_data.append({
            'scraping_url': base_url,
            'scraping_timestamp': pd.Timestamp.now(),
            'country': 'United States',
            'device': 'iPhone',
            'category': 'Top Overall',
            'segment': 'Free',  # Fixed to "Free" since this is the "top-free" category
            'rank': rank,
            'app_title': app_title,
            'developer_account': developer_account,
            'app_link': app_link,
            'ios_app_store_id': ios_app_store_id,
            'estimated_downloads': estimated_downloads,
            'estimated_revenue': estimated_revenue
        })

        # Print statement for progress
        print(f"Rank: {rank}, App Title: {app_title}")
finally:
    # Close the driver
    driver.quit()

# Create a pandas DataFrame from the scraped data
df = pd.DataFrame(scraped_data)

