In [None]:
%pip install selenium
%pip install webdriver-manager
%pip install bs4
%pip install pandas



In [None]:
import requests
from random import randint
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException
from google.cloud import storage
from datetime import datetime
import pandas as pd
import os
import re
import time
import logging
from google.cloud import logging as cloud_logging

# Set up Google Cloud Logging
cloud_client = cloud_logging.Client()
cloud_client.setup_logging()

# Create a custom logger for Google Cloud
logger = logging.getLogger('ApptweakScraper')
logger.setLevel(logging.INFO)

# ScrapeOps API configuration
# The key is read from the SCRAPEOPS_API_KEY environment variable so the
# secret does not have to live in the notebook; the original placeholder
# literal is kept as the fallback when the variable is not set.
SCRAPEOPS_API_KEY = os.environ.get('SCRAPEOPS_API_KEY', 'ScrapeOPS_API_KEY')
SCRAPEOPS_PROXY_URL = 'https://proxy.scrapeops.io/v1/'

def get_user_agent_list():
    """Fetch the list of User-Agent strings from the ScrapeOps headers API.

    Returns:
        The value stored under the ``'result'`` key of the JSON response,
        or an empty list when that key is absent.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an HTTP error status.
    """
    response = requests.get(
        'http://headers.scrapeops.io/v1/user-agents',
        # Let requests build the query string so the key is URL-encoded.
        params={'api_key': SCRAPEOPS_API_KEY},
        # requests has no default timeout; without one a stalled
        # connection would block the notebook indefinitely.
        timeout=30,
    )
    # Surface auth/quota problems here instead of as a confusing failure later.
    response.raise_for_status()
    json_response = response.json()
    return json_response.get('result', [])

def get_random_user_agent(user_agent_list):
    """Return one entry of *user_agent_list*, chosen uniformly at random.

    Args:
        user_agent_list: Non-empty sequence of User-Agent strings.

    Raises:
        ValueError: if *user_agent_list* is empty.
    """
    if not user_agent_list:
        # randint(0, -1) would also raise ValueError, but with a message
        # about an empty range that hides the real cause.
        raise ValueError("user_agent_list is empty; cannot pick a User-Agent")
    return user_agent_list[randint(0, len(user_agent_list) - 1)]

def get_scrapeops_proxy_url():
    """Return the ScrapeOps proxy URL with the API key as its query string."""
    return "{}?api_key={}".format(SCRAPEOPS_PROXY_URL, SCRAPEOPS_API_KEY)

# Set up Chrome options: run without a visible window, without the sandbox,
# and without relying on /dev/shm.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Retrieve User-Agent List From ScrapeOps and pin one random entry for the
# whole browser session (it is chosen once, here, not per request).
user_agent_list = get_user_agent_list()
chrome_options.add_argument(f'user-agent={get_random_user_agent(user_agent_list)}')

# ScrapeOps Proxy Configuration
# NOTE(review): the value passed to --proxy-server is an API endpoint URL
# carrying an `api_key` query string rather than a plain proxy address —
# confirm that Chrome actually routes traffic through ScrapeOps with it.
scrapeops_proxy = get_scrapeops_proxy_url()
chrome_options.add_argument(f'--proxy-server={scrapeops_proxy}')

# Function to generate Parquet filename
def generate_parquet_filename(search):
    """Build a Parquet file name for *search*.

    The name has the form ``YYYY_MM_DD__<epoch seconds>__<search>.parquet``
    where the date is the local calendar date, the epoch value is whole
    seconds since 1970-01-01 UTC, and *search* is lower-cased with every
    character other than ASCII letters and digits removed.

    Args:
        search: The search term the file belongs to.

    Returns:
        The file name (no directory component).
    """
    # Read the clock once so the date and the epoch part describe the same
    # instant; this also avoids the deprecated datetime.utcnow().
    timestamp = time.time()
    date_string = datetime.fromtimestamp(timestamp).strftime("%Y_%m_%d")
    seconds_since_epoch = str(int(timestamp))
    clean_search = re.sub(r'[^a-zA-Z0-9]', '', search).lower()
    return f'{date_string}__{seconds_since_epoch}__{clean_search}.parquet'

# Function to upload the Parquet file to GCS
def upload_to_gcs(dataframe, bucket_name, folder_name, file_name):
    """Write *dataframe* as Parquet and upload it to ``folder_name/file_name``.

    The frame is first written to ``/tmp/<file_name>`` (without the index),
    uploaded from there, and the temporary file is removed afterwards.

    Args:
        dataframe: pandas DataFrame to persist.
        bucket_name: Name of the target GCS bucket.
        folder_name: Object-name prefix inside the bucket.
        file_name: Object base name; also used for the temporary local file.
    """
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(f"{folder_name}/{file_name}")

    temp_file = f"/tmp/{file_name}"
    try:
        dataframe.to_parquet(temp_file, index=False)
        blob.upload_from_filename(temp_file)
    finally:
        # Remove the local copy even when writing or uploading fails, so
        # failed runs do not accumulate files in /tmp.
        if os.path.exists(temp_file):
            os.remove(temp_file)
    logger.info(f"File '{file_name}' uploaded to GCS bucket '{bucket_name}' in folder '{folder_name}'.")

# Function to download specific Parquet files from GCS based on the search character
def download_specific_parquets(bucket_name, folder_name, search_character):
    """Load every Parquet file for *search_character* found under *folder_name*.

    A blob is selected when it is listed under the prefix *folder_name* and
    its name ends with ``__<search_character>.parquet``.

    Args:
        bucket_name: Name of the GCS bucket to read from.
        folder_name: Object-name prefix to list.
        search_character: Search term encoded at the end of the file names.

    Returns:
        A ``(combined_df, parquet_files)`` tuple: the row-wise concatenation
        of all matching files (an empty DataFrame when none match) and the
        list of matching blob names.
    """
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    suffix = f'__{search_character}.parquet'
    parquet_files = [file.name for file in bucket.list_blobs(prefix=folder_name) if file.name.endswith(suffix)]

    existing_dfs = []
    for file_name in parquet_files:
        local_file = f"/tmp/{os.path.basename(file_name)}"
        try:
            bucket.blob(file_name).download_to_filename(local_file)
            existing_dfs.append(pd.read_parquet(local_file))
        finally:
            # Clean up the local copy even if the download or parse failed.
            if os.path.exists(local_file):
                os.remove(local_file)

    if existing_dfs:
        combined_df = pd.concat(existing_dfs, ignore_index=True)
    else:
        combined_df = pd.DataFrame()

    return combined_df, parquet_files

# Function to delete old Parquet files from GCS
def delete_parquet_files_from_gcs(bucket_name, folder_name, parquet_files):
    """Delete each blob named in *parquet_files* from *bucket_name*.

    *parquet_files* holds full blob names, so *folder_name* is accepted but
    not used when addressing the blobs. One info line is logged per deletion.
    """
    gcs_bucket = storage.Client().bucket(bucket_name)
    for file_name in parquet_files:
        gcs_bucket.blob(file_name).delete()
        logger.info(f"Deleted file: {file_name} from GCS.")

# Set up WebDriver (a Chrome session configured by chrome_options)
driver = webdriver.Chrome(options=chrome_options)

# Open the website; `url` is also stored with every scraped row
url = 'https://www.apptweak.com/en/free-tools/keyword-auto-suggestions'
driver.get(url)

# Shared explicit-wait helper: each wait.until(...) call may block for up to
# 7200 seconds (2 hours) before raising TimeoutException.
# NOTE(review): confirm that such a long timeout is intentional.
wait = WebDriverWait(driver, 7200)

# Get today's date in format YYYY_MM_DD (local time); used as the dated
# sub-folder name in GCS
today_date = datetime.today().strftime('%Y_%m_%d')

# Initialize GCS bucket and folder information
bucket_name = 'apptweak_scraper'
batch_folder_name = 'first_batch_of_searches'

# Column layout shared by every scraped row and by the DataFrame built from them
RESULT_COLUMNS = ['search', 'store', 'country', 'language', 'next_word_letter', 'rank', 'suggestion', 'scraping_url', 'scraping_timestamp']


# The helpers below are defined once, before the loop that uses them. They
# used to be defined inside the loop body *after* their first call, which
# raises NameError on a fresh kernel.

# Function to select options from a dropdown
def select_from_dropdown(dropdown_id, option_text):
    """Select *option_text* in the <select> element with id *dropdown_id*.

    The selection itself is best-effort: any error raised while selecting is
    logged and swallowed. Waiting for the element is not covered by that
    handler and can still raise TimeoutException.
    """
    select = Select(wait.until(EC.presence_of_element_located((By.ID, dropdown_id))))
    try:
        select.select_by_visible_text(option_text)
        logger.info(f"Selected '{option_text}' from dropdown '{dropdown_id}'.")
    except Exception as e:
        logger.error(f"Error selecting '{option_text}' from dropdown '{dropdown_id}': {e}")


# Function to get options from a dropdown
def get_dropdown_options(dropdown_id):
    """Return the visible text of every option of the <select> *dropdown_id*."""
    select = Select(wait.until(EC.presence_of_element_located((By.ID, dropdown_id))))
    return [option.text for option in select.options]


# Function to get available languages for a specific country
def get_available_languages_for_country(country_name):
    """Select *country_name* and return the options of the language dropdown."""
    select_from_dropdown('country', country_name)
    time.sleep(1)  # Wait for the language dropdown to update based on the selected country
    return get_dropdown_options('language')


# Function to run one query on the page and collect its suggestions
def scrape_suggestions(search, store_name, country_name, language_name):
    """Submit *search* for one store/country/language and return the rows.

    Each row is a list ordered like RESULT_COLUMNS. The first <li> of every
    result list is skipped, and ranks start at 1 for the second <li>.
    """
    # Wait for the 'keyword' input field to be available and enter the letter or digit
    keyword_input = wait.until(EC.presence_of_element_located((By.ID, 'keyword')))
    keyword_input.clear()
    keyword_input.send_keys(search)
    keyword_input.send_keys(Keys.RETURN)

    # Select the store, country, and language
    select_from_dropdown('store', store_name)
    select_from_dropdown('country', country_name)
    select_from_dropdown('language', language_name)

    # Wait for the progress bar to disappear
    try:
        wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, '.progress-bar__inner')))
    except TimeoutException:
        logger.warning(f"Progress bar did not disappear before the wait timed out for search '{search}'.")

    # Click the "Suggest" button, retrying when another element covers it.
    # If all attempts are intercepted, scraping still continues with
    # whatever results the page shows.
    suggest_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.js-keyword-auto-suggestion')))
    for attempt in range(1, 4):
        try:
            suggest_button.click()
            break
        except ElementClickInterceptedException:
            logger.warning(f"Click on 'Suggest' button was intercepted for search '{search}' (attempt {attempt} of 3).")
            time.sleep(2)

    # Wait for search results section
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.free-tools__kw-lists-container')))

    # Select all <ul> elements with id starting with 'result_'
    ul_elements = driver.find_elements(By.XPATH, "//ul[starts-with(@id, 'result_')]")

    rows = []
    for index, ul in enumerate(ul_elements):
        # The first list gets no letter; the following lists are labelled
        # 'a', 'b', 'c', ... by position.
        # NOTE(review): presumably each later list holds suggestions for the
        # search term followed by that letter — confirm against the page.
        next_word_letter = chr(96 + index) if index != 0 else None

        li_elements = ul.find_elements(By.TAG_NAME, 'li')
        for rank, li in enumerate(li_elements[1:], start=1):  # Ignore the first <li>
            rows.append([search, store_name, country_name, language_name, next_word_letter, rank, li.text.strip(), url, datetime.now().strftime('%Y_%m_%d %H:%M:%S')])
    return rows


try:
    # Dated GCS folder that holds today's files
    dated_folder = batch_folder_name + '/' + today_date

    # Iterate over all letters (e.g., 'a' to 'z', '0' to '9')
    for search in 'b':  # You can loop through 'abcdefghijklmnopqrstuvwxyz0123456789'
        # Download existing data from GCS for the specific search character
        existing_data, parquet_files = download_specific_parquets(bucket_name, dated_folder, search)

        # Retrieve available options for stores and countries; only the
        # country entries at positions 2..49 are processed.
        store_options = get_dropdown_options('store')
        country_options = get_dropdown_options('country')[2:50]

        # Rows are gathered in a plain list and turned into a DataFrame once,
        # instead of re-concatenating a growing DataFrame on every query.
        scraped_rows = []
        for store_name in store_options:
            for country_name in country_options:
                # Get available languages for the current country
                language_options = get_available_languages_for_country(country_name)

                for language_name in language_options:
                    logger.info(f"Processing: Store='{store_name}', Country='{country_name}', Language='{language_name}'")
                    scraped_rows.extend(scrape_suggestions(search, store_name, country_name, language_name))

        master_df = pd.DataFrame(scraped_rows, columns=RESULT_COLUMNS)

        # Combine existing data with new data and remove duplicates
        combined_df = pd.concat([existing_data, master_df], ignore_index=True).drop_duplicates()

        # Save the combined DataFrame to Parquet and upload to GCS
        parquet_filename = generate_parquet_filename(search)
        upload_to_gcs(combined_df, bucket_name, dated_folder, parquet_filename)

        # Delete the superseded Parquet files only after the combined file has
        # been uploaded, so a failed upload cannot lose the existing data.
        delete_parquet_files_from_gcs(bucket_name, dated_folder, parquet_files)
finally:
    # Clean up: always close the browser, even when scraping fails midway
    driver.quit()


INFO:ApptweakScraper:Selected 'Albania' from dropdown 'country'.
INFO:ApptweakScraper:Processing: Store='App Store', Country='Albania', Language='English (UK)'
INFO:ApptweakScraper:Selected 'App Store' from dropdown 'store'.
INFO:ApptweakScraper:Selected 'Albania' from dropdown 'country'.
INFO:ApptweakScraper:Selected 'English (UK)' from dropdown 'language'.
INFO:ApptweakScraper:Retrive data by clicking 'Suggest' button for search 'b'.
INFO:ApptweakScraper:Retrive data by clicking 'Suggest' button for search 'b'.
INFO:ApptweakScraper:Retrive data by clicking 'Suggest' button for search 'b'.
INFO:ApptweakScraper:Selected 'Armenia' from dropdown 'country'.
INFO:ApptweakScraper:Processing: Store='App Store', Country='Armenia', Language='English (UK)'
INFO:ApptweakScraper:Selected 'App Store' from dropdown 'store'.
INFO:ApptweakScraper:Selected 'Armenia' from dropdown 'country'.
INFO:ApptweakScraper:Selected 'English (UK)' from dropdown 'language'.
INFO:ApptweakScraper:Retrive data by clic

### Load Data

In [None]:
from google.cloud import bigquery, storage

# Initialize clients (Cloud Storage for listing files, BigQuery for loading)
gcs_client = storage.Client()
bq_client = bigquery.Client()

# Define your parameters
# NOTE(review): project_id is assigned but not referenced by the code visible
# in this cell; the BigQuery client's own default project is what gets used —
# confirm they match.
project_id = "web-scraping-2024"
dataset_id = "autosuggest"
bucket_name = "apptweak_scraper"
# Object-name prefix of the dated batch folder to load
apptweak_dir_prefix = "first_batch_of_searches/2024_09_23/"

def list_parquet_files(bucket_name, prefix):
    """Return the names of all blobs under *prefix* in *bucket_name* that end in '.parquet'."""
    parquet_names = []
    for blob in gcs_client.bucket(bucket_name).list_blobs(prefix=prefix):
        if blob.name.endswith('.parquet'):
            parquet_names.append(blob.name)
    return parquet_names

# Load every Parquet file under the batch prefix into the single table
# 'apptweak_first_batch'
apptweak_details_files = list_parquet_files(bucket_name, apptweak_dir_prefix)

if not apptweak_details_files:
    # Nothing to load: skip the job instead of submitting one with no
    # source URIs.
    print("No files found.")
else:
    apptweak_table_ref = bq_client.dataset(dataset_id).table("apptweak_first_batch")
    # NOTE(review): no write disposition is set, so the library default
    # applies — confirm whether re-running this cell should append to or
    # replace the existing rows.
    apptweak_job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.PARQUET,  # Parquet file format
        autodetect=True
    )

    # Start one load job covering all listed files
    load_job = bq_client.load_table_from_uri(
        [f"gs://{bucket_name}/{file}" for file in apptweak_details_files],
        apptweak_table_ref,
        job_config=apptweak_job_config
    )

    # Wait for the job to complete
    load_job.result()

    # Check the result: num_rows is the table's total row count after the load
    apptweak_details_table = bq_client.get_table(apptweak_table_ref)
    print(f"Loaded {apptweak_details_table.num_rows} rows into {dataset_id}:apptweak_first_batch.")


Loaded 81514 rows into autosuggest:apptweak_first_batch.


In [None]:
from google.cloud import bigquery, storage

# Initialize clients (Cloud Storage for listing files, BigQuery for loading)
gcs_client = storage.Client()
bq_client = bigquery.Client()

# Define your parameters
# NOTE(review): project_id is assigned but not referenced by the code visible
# in this cell — confirm it matches the BigQuery client's default project.
project_id = "web-scraping-2024"
dataset_id = "autosuggest"
bucket_name = "apptweak_scraper"
# Object-name prefix of the dated batch folder to search
apptweak_dir_prefix = "first_batch_of_searches/2024_09_23/"
# Name of the single file to load from that folder
specific_file_name = "2024_09_23__1727111422__b.parquet"

def list_parquet_files(bucket_name, prefix, specific_file):
    """Return the '.parquet' blob names under *prefix* that contain *specific_file*.

    The match on *specific_file* is a substring test against the full blob name.
    """
    matching_names = []
    for blob in gcs_client.bucket(bucket_name).list_blobs(prefix=prefix):
        if blob.name.endswith('.parquet') and specific_file in blob.name:
            matching_names.append(blob.name)
    return matching_names

# Look up the requested file in the batch folder
apptweak_details_files = list_parquet_files(bucket_name, apptweak_dir_prefix, specific_file_name)

if apptweak_details_files:
    apptweak_table_ref = bq_client.dataset(dataset_id).table("apptweak_first_batch")
    apptweak_job_config = bigquery.LoadJobConfig(
        autodetect=True,
        source_format=bigquery.SourceFormat.PARQUET,  # Parquet file format
    )

    # Only the first matching file is loaded; block until the job finishes
    load_job = bq_client.load_table_from_uri(
        f"gs://{bucket_name}/{apptweak_details_files[0]}",
        apptweak_table_ref,
        job_config=apptweak_job_config,
    )
    load_job.result()

    # Report the table's total row count after the load
    apptweak_details_table = bq_client.get_table(apptweak_table_ref)
    print(f"Loaded {apptweak_details_table.num_rows} rows into {dataset_id}:apptweak_first_batch.")
else:
    print("No files found.")


Loaded 112480 rows into autosuggest:apptweak_first_batch.
