# What is this

Allows you to interact with google search and extract top K links.

In [4]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import random
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import re


In [24]:
def initialize_driver():
    options = Options()
    # options.add_argument('--headless=new') #comment this out if you wanna debug visually
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    return webdriver.Chrome(options=options)

def get_search_query_url(query):
    base_url = 'https://www.google.com/search?q='
    query_string = query.replace(" ", "%2B")
    return f"{base_url}{query_string}#ip=1"

def scroll_to_bottom(driver):
    last_scroll_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_scroll_height = driver.execute_script("return document.body.scrollHeight")
        if new_scroll_height == last_scroll_height:
            break
        last_scroll_height = new_scroll_height

def extract_links_from_page(driver, search_term):
    page_source = driver.page_source
    hrefs = driver.find_elements(By.PARTIAL_LINK_TEXT, search_term)
    links = [element.get_attribute("href") for element in hrefs]
    
    soup = BeautifulSoup(page_source, 'html.parser')
    pattern = re.compile(r'<a\s+jsname="UWckNb"\s+href="([^"]+)"')
    matches = re.findall(pattern, page_source)
    links.extend(matches)
    
    return links
    
def fetch_links_for_query(query, top_n, search_term = ''):
    all_links = []
    driver = initialize_driver()

    current_url = ""
    attempt = 0

    while len(all_links) < top_n:
        print(f"Attempt {attempt + 1}")
        attempt += 1

        if attempt == 1:
            query_url = get_search_query_url(query)
            driver.get(query_url)
        else:
            try:
                next_button = driver.find_element(By.XPATH, '//a[@id="pnnext"]')
                next_button.click()
                time.sleep(2)

                # Wait for the page to load and ensure it is different from the previous one
                while driver.current_url == current_url:
                    time.sleep(1)
            except Exception:
                print("No more pages found.")
                break

        current_url = driver.current_url
        print(f"Fetching from: {current_url}")

        scroll_to_bottom(driver)
        links = extract_links_from_page(driver, search_term)
        all_links.extend(links)

        if len(all_links) >= top_n:
            break

    driver.quit()
    return all_links[:top_n]  # Return only the top N links


# Prompt user for query and number of URLs
query = input("Enter the query: ")
top_n = int(input("Enter the number of top URLs to fetch: "))
## not super effective. yer better off getting all the urls and then searching through them via regex/etc.
# search_term = input("Enter text you want contained in url (optional): ") 
links = fetch_links_for_query(query, top_n)

print(f"Found {len(links)} links for '{query}'")

Enter the query:  alaibvaiuerbgancaperngf
Enter the number of top URLs to fetch:  1000


Attempt 1
Fetching from: https://www.google.com/search?q=alaibvaiuerbgancaperngf#ip=1
Attempt 2
No more pages found.
Found 43 links for 'alaibvaiuerbgancaperngf'


In [25]:
print(f"Found {len(links)} links for '{query}'")

Found 43 links for 'alaibvaiuerbgancaperngf'


## observations/ extensions:

1. If no subfilter, then it links to a bunch of interesting things, like related google searchers, google maps, google images, flights (?), etc. Also some "None"s are returned.
2. I could look for "tail links". Make a scrappy google indexer via a bunch of random queries, try and categorize them and group by occurence (soft grouping)...
    - probably need to do this. maybe not at the model level, but def at the eda level.
  
## but what does this do rn:
1. given a query, it searches through all pages and gets links in a list. good enough for basic thing.