In [1]:
import re
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
from dotenv import dotenv_values
from IPython.display import Image, display
from googlesearch import search


In [2]:
# Load key/value settings from the local ".env" file (kept out of the notebook itself).
config = dotenv_values(".env")
# Shared OpenAI client used by icelandic_to_english(); a missing "APIKEY" entry raises KeyError here.
client = OpenAI(api_key=config["APIKEY"])

def icelandic_to_english(product_name, model="gpt-4o", temperature=0.0):
    """Translate an Icelandic product name to English via the OpenAI chat API.

    Args:
        product_name: Product name in Icelandic.
        model: Chat model to query.
        temperature: Sampling temperature. Defaults to 0.0 so that repeated
            runs produce the same translation and therefore the same
            downstream search query.

    Returns:
        The translated name with surrounding whitespace removed, or "" when
        the input is blank or the model returns no text.
    """
    if not product_name or not product_name.strip():
        # Nothing to translate; skip the API round trip.
        return ""

    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role":"system","content":"You are a translator who translates the Product name from Icelandic to English and just provide the translated product name and nothing else."},
            {"role":"user","content":product_name}
        ]
    )
    # message.content is optional in the SDK response; guard before .strip().
    translated = response.choices[0].message.content
    return translated.strip() if translated else ""


In [15]:
def search_product_links_google(product_name, brand, sku, max_links=5):
    """Google a product and keep result pages that look like product pages.

    A result is kept when (1) its URL matches a product-page pattern,
    (2) it answers HTTP 200, and (3) its body mentions the SKU, the brand,
    or the first word of the product name (case-insensitive).

    Args:
        product_name: Product title used in the query.
        brand: Brand name used in the query and the page check.
        sku: SKU used in the query and the page check.
        max_links: Maximum number of links to return.

    Returns:
        Up to ``max_links`` URLs in search-result order; [] if the search
        itself fails.
    """
    print(f"\n🔍 Searching for: {product_name} | {brand} | {sku}")
    query = f"{product_name} {brand} {sku} site:*.com"
    print("🔎 Query:", query)

    raw_links = []
    try:
        for result in search(query, num_results=15):
            if result.startswith("http") and "/search?" not in result:
                raw_links.append(result)
    except Exception as e:
        print("❌ Google search failed:", e)
        return []

    print("🌐 Fetched Links:")
    for link in raw_links:
        print("  -", link)

    # ✅ Keep only typical product detail pages
    allowed_patterns = re.compile(r"(product|products|detail|dp|sku|/p/|/item/|/shop/)", re.IGNORECASE)

    # Tokens that mark a page as relevant. Blank tokens are dropped: "" is a
    # substring of every page and would let everything through, and a blank
    # product name has no first word to index.
    name_words = product_name.split()
    first_word = name_words[0] if name_words else ""
    needles = [tok.strip().lower() for tok in (sku, brand, first_word) if tok and tok.strip()]

    filtered_links = []
    for url in raw_links:
        # Checked before fetching so no request is made once the quota is met.
        if len(filtered_links) >= max_links:
            break
        if not allowed_patterns.search(url):
            continue
        try:
            r = requests.get(url, timeout=8)
            if r.status_code != 200:
                continue
            page_text = r.text.lower()  # lower-case once, not once per token
        except Exception as e:
            # Best effort: one unreachable page must not abort the whole run.
            print(f"⚠️ Skipping {url}: {e}")
            continue
        # With no usable tokens there is nothing to verify against, so the
        # HTTP 200 alone decides.
        if not needles or any(needle in page_text for needle in needles):
            filtered_links.append(url)

    return filtered_links



In [16]:
def extract_images_from_urls(links, global_limit=20):
    """Collect product-image URLs from the <img> tags of the given pages.

    For each page that answers HTTP 200, every <img> tag's ``src``
    (falling back to ``data-src`` / ``data-original``) is resolved to an
    absolute URL and kept when its path ends in a known image extension
    and it contains none of the blocked terms (logos, icons, trackers...).

    Args:
        links: Page URLs to scan, in priority order.
        global_limit: Maximum number of image URLs to return across all pages.

    Returns:
        A de-duplicated list of absolute image URLs in discovery order.
    """
    # Imported here because the notebook's import cell does not provide them.
    from urllib.parse import urljoin, urlparse

    print("\n🖼️ Extracting images from source URLs...")
    image_urls = []
    seen = set()
    allowed_ext = (".jpg", ".jpeg", ".png", ".webp")
    blocked_terms = ["sprite", "icon", "logo", "tracking", "facebook", "pixel", "blank"]

    for url in links:
        # Checked before fetching so no request is made once the quota is met.
        if len(image_urls) >= global_limit:
            break
        try:
            res = requests.get(url, timeout=10)
            if res.status_code != 200:
                continue

            soup = BeautifulSoup(res.text, 'html.parser')

            for tag in soup.find_all('img'):
                src = tag.get('src') or tag.get('data-src') or tag.get('data-original')
                if not src:
                    continue
                src = src.strip()
                if not src:
                    continue
                try:
                    if src.startswith('//'):
                        # Protocol-relative URL: pin to https.
                        src = 'https:' + src
                    else:
                        # Resolve relative paths against the page URL;
                        # absolute URLs pass through unchanged.
                        src = urljoin(url, src)
                    path = urlparse(src).path.lower()
                except ValueError:
                    # Malformed URL in one tag must not discard the whole page.
                    continue
                if not src.startswith('http'):
                    continue  # data: URIs and other non-web schemes
                # Test the extension on the path only, so "a.jpg?v=3" qualifies.
                if not path.endswith(allowed_ext):
                    continue
                lowered = src.lower()
                if any(blocked in lowered for blocked in blocked_terms):
                    continue
                if src in seen:
                    continue

                seen.add(src)
                image_urls.append(src)
                if len(image_urls) >= global_limit:
                    break
        except Exception as e:
            # Best effort: one broken page must not abort the whole run.
            print(f"⚠️ Skipping {url}: {e}")
            continue

    return image_urls


In [17]:
# Demo inputs: an Icelandic product title plus the brand and SKU used as extra search terms.
product_name = "Samsung Örbylgjuofn 23 lítra"
brand = "Samsung"
sku = "SAM-MS23K3513ASEE"

# Translate the Icelandic title to English before searching (calls the OpenAI API)
product_name_en = icelandic_to_english(product_name)

# Search Google and keep at most 5 result links that pass the function's page checks
valid_links = search_product_links_google(product_name_en, brand, sku, max_links=5)

# Collect at most 10 image URLs from those pages
image_urls = extract_images_from_urls(valid_links, global_limit=10)

# Print one URL per line; nothing is printed below the header when the list is empty.
print("\n✅ Final Image URLs:")
for url in image_urls:
    print(url)



🔍 Searching for: Samsung Microwave 23 Liter | Samsung | SAM-MS23K3513ASEE
🔎 Query: Samsung Microwave 23 Liter Samsung SAM-MS23K3513ASEE site:*.com
🌐 Fetched Links:

🖼️ Extracting images from source URLs...

✅ Final Image URLs:


In [14]:
# Render every collected image inline with width=250; image_urls must already exist in the kernel.
print("\n🖼️ Displaying Images:")
for url in image_urls:
    try:
        # NOTE(review): Image(url=...) presumably emits an <img> tag rather than downloading
        # the file, so a dead link may never reach the except branch — confirm in the IPython docs.
        display(Image(url=url, width=250))
    except Exception:
        print(f"⚠️ Failed to load image: {url}")



🖼️ Displaying Images:


In [26]:
from googlesearch import search
import re
import requests

def search_product_links_google(product_name, brand, sku, max_links=5, require_product_url=False):
    """Google a product and keep result pages whose content mentions it.

    A result is kept when it answers HTTP 200 and its body mentions the
    SKU, the brand, or the first word of the product name
    (case-insensitive).

    Args:
        product_name: Product title used in the query.
        brand: Brand name used in the query and the page check.
        sku: SKU used in the query and the page check.
        max_links: Maximum number of links to return.
        require_product_url: When True, additionally require the URL to
            match a product-page pattern. Off by default, matching the
            previous behaviour where that check was commented out.

    Returns:
        Up to ``max_links`` URLs in search-result order; [] if the search
        itself fails.
    """
    print(f"\n🔍 Searching for: {product_name}, {sku}")
    query = f"{product_name} {brand} {sku} site:*.com"
    print("🔎 Query:", query)

    raw_links = []
    try:
        for result in search(query, num_results=15):
            if result.startswith("http") and "/search?" not in result:
                raw_links.append(result)
    except Exception as e:
        print("❌ Google search failed:", e)
        return []

    print("🌐 Fetched Links:")
    for link in raw_links:
        print("  -", link)

    # ✅ Typical product detail page URL fragments (only applied on request)
    allowed_patterns = re.compile(r"(product|products|detail|dp|sku|/p/|/item/|/shop/)", re.IGNORECASE)

    # Tokens that mark a page as relevant. Blank tokens are dropped: "" is a
    # substring of every page and would let everything through, and a blank
    # product name has no first word to index.
    name_words = product_name.split()
    first_word = name_words[0] if name_words else ""
    needles = [tok.strip().lower() for tok in (sku, brand, first_word) if tok and tok.strip()]

    filtered_links = []
    for url in raw_links:
        # Checked before fetching so no request is made once the quota is met.
        if len(filtered_links) >= max_links:
            break
        if require_product_url and not allowed_patterns.search(url):
            continue
        try:
            r = requests.get(url, timeout=8)
            if r.status_code != 200:
                continue
            page_text = r.text.lower()  # lower-case once, not once per token
        except Exception as e:
            # Best effort: one unreachable page must not abort the whole run.
            print(f"⚠️ Skipping {url}: {e}")
            continue
        # With no usable tokens there is nothing to verify against, so the
        # HTTP 200 alone decides.
        if not needles or any(needle in page_text for needle in needles):
            filtered_links.append(url)

    return filtered_links


In [27]:
# Demo inputs: Icelandic product title, brand and SKU.
product_name = "Samsung Örbylgjuofn 23 lítra"
brand = "Samsung"
sku = "SAM-MS23K3513ASEE"

# Translate the title to English first (OpenAI call).
product_name_en = icelandic_to_english(product_name)  # optional: pass product_name directly to skip translation

# Runs whichever search_product_links_google definition was executed last in the kernel.
valid_links = search_product_links_google(product_name_en, brand, sku, max_links=5)

# Print one link per line; nothing is printed below the header when the list is empty.
print("\n✅ Final Valid Product Links:")
for link in valid_links:
    print(link)



🔍 Searching for: Samsung Microwave Oven 23 Liters, SAM-MS23K3513ASEE
🔎 Query: Samsung Microwave Oven 23 Liters Samsung SAM-MS23K3513ASEE site:*.com
🌐 Fetched Links:

✅ Final Valid Product Links:


In [72]:
from googlesearch import search

def search_product_links_google(product_name, brand, sku, max_links=5):
    """Return the first Google result links for a product, without page checks.

    Args:
        product_name: Product title used in the query.
        brand: Brand name; only echoed in the log line, not added to the query.
        sku: SKU appended to the query.
        max_links: Maximum number of links to return.

    Returns:
        Up to ``max_links`` result URLs in search order; [] if the search
        fails (for example on an HTTP 429 from Google).
    """
    query = f"{product_name} {sku} site:*.com"
    # Log the real product name (previously a leftover "hello" placeholder).
    print(f"\n🔍 Searching for: {product_name}, {brand}, {sku}")
    print("🌐 Query:", query)

    raw_links = []
    try:
        for result in search(query, num_results=15):
            if result.startswith("http") and "/search?" not in result:
                raw_links.append(result)
    except Exception as e:
        print("❌ Google search failed:", e)
        return []

    print("\n🌐 Fetched Links:")
    for link in raw_links:
        print("  -", link)

    # Clamp so a negative max_links yields [] instead of slicing from the end.
    return raw_links[:max(max_links, 0)]


In [75]:
# English product title supplied directly, so no translation call is made in this cell.
product_name = "Samsung Microwave Oven 23 Liters"
brand = "Samsung"
sku = "SAM-MS23K3513ASEE"

# Runs whichever search_product_links_google definition was executed last, with its default max_links.
links = search_product_links_google(product_name, brand, sku)



🔍 Searching for: hello, Samsung, SAM-MS23K3513ASEE
🌐 Query: Samsung Microwave Oven 23 Liters SAM-MS23K3513ASEE site:*.com
❌ Google search failed: 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3DSamsung%2BMicrowave%2BOven%2B23%2BLiters%2BSAM-MS23K3513ASEE%2Bsite%253A%252A.com%26num%3D17%26hl%3Den%26start%3D0%26safe%3Dactive&hl=en&q=EgQ88xJ7GMyFxcIGIjDaXqGb9eKf7gvjIPk2-f0G1Uaq0w3gL4hGTr4GXunmA0nMAbAFQj_RuxPxmvMCq2QyAnJSWgFD


In [36]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

In [52]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

def fetch_google_links(product_name, sku, num_results=3):
    """Scrape the first organic links from a Google results page.

    The results page is fetched with a desktop browser User-Agent and every
    ``/url?q=<target>`` redirect anchor is unpacked into its target URL.

    Args:
        product_name: Product title used in the query.
        sku: SKU appended to the query.
        num_results: Maximum number of links to return.

    Returns:
        Up to ``num_results`` decoded target URLs; [] on a non-200 response
        (e.g. HTTP 429 rate limiting) or any exception.
    """
    # Imported here because the cell's import line only provides `quote`.
    from urllib.parse import parse_qs, urlsplit

    query = f"{product_name} {sku}"
    encoded_query = quote(query)
    url = f"https://www.google.com/search?q={encoded_query}"

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/114.0.0.0 Safari/537.36"
        )
    }

    try:
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"❌ Failed to fetch: HTTP {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        links = []

        for a in soup.select("a"):
            if len(links) >= num_results:
                break
            href = a.get("href", "")
            if not href.startswith("/url?q=") or "webcache" in href:
                continue
            # parse_qs splits on "&" and percent-decodes, so the target comes
            # back as a usable URL instead of "...%3Fid%3D1".
            targets = parse_qs(urlsplit(href).query).get("q")
            if targets:
                links.append(targets[0])

        return links
    except Exception as e:
        print("❌ Exception occurred:", e)
        return []


In [53]:
# Interactive inputs: this cell blocks until both values are typed in.
product = input("Enter Product Name: ")
sku = input("Enter Product SKU: ")

# Scrape Google over plain HTTP; fetch_google_links returns [] on a non-200 response or an exception.
top_links = fetch_google_links(product, sku)

# Print the results as a 1-based numbered list.
print("\n🔗 Top Google Search Results:")
for i, link in enumerate(top_links, 1):
    print(f"{i}. {link}")


Enter Product Name: Samsung Microwave Oven 23 Liters
Enter Product SKU: SAM-MS23K3513ASEE
❌ Failed to fetch: HTTP 429

🔗 Top Google Search Results:


In [56]:
pip install selenium webdriver-manager

Note: you may need to restart the kernel to use updated packages.


In [69]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import quote
import time

def google_search_with_browser(product_name, sku, num_results=3):
    """Fetch the first Google result links using a headless Chrome session.

    Args:
        product_name: Product title used in the query.
        sku: SKU appended to the query.
        num_results: Maximum number of links to return.

    Returns:
        Up to ``num_results`` result URLs; [] if loading or parsing the
        page raises. The browser is always closed before returning.
    """
    query = f"{product_name} {sku}"
    search_url = f"https://www.google.com/search?q={quote(query)}"

    chrome_options = Options()
    chrome_options.add_argument("--headless=new")  # Modern headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    # Chrome expects "width,height"; the "1920x1080" spelling is not the documented format.
    chrome_options.add_argument("--window-size=1920,1080")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(search_url)
        time.sleep(2)  # Allow JS to load content

        print("🔎 Page Title:", driver.title)

        # NOTE(review): "div.MjjYud" is presumably Google's per-result container
        # class; it is not a stable API — confirm it still matches.
        containers = driver.find_elements(By.CSS_SELECTOR, "div.MjjYud")
        top_links = []

        for container in containers:
            if len(top_links) >= num_results:
                break
            try:
                link = container.find_element(By.TAG_NAME, "a").get_attribute("href")
            except Exception:
                # Container without a usable anchor: skip it. `Exception`
                # (not a bare except) lets KeyboardInterrupt through.
                continue
            if link and "google.com/search" not in link:
                top_links.append(link)

        return top_links

    except Exception as e:
        print("❌ Error:", e)
        return []
    finally:
        # Always release the browser process, even on failure.
        driver.quit()


In [70]:
# Interactive inputs: this cell blocks until both values are typed in.
product = input("Enter Product Name: ")
sku = input("Enter Product SKU: ")

# Drive headless Chrome; google_search_with_browser returns [] when the page load or parsing raises.
top_links = google_search_with_browser(product, sku)

# Print the results as a 1-based numbered list.
print("\n🔗 Top Google Search Results:")
for i, link in enumerate(top_links, 1):
    print(f"{i}. {link}")


Enter Product Name: LG 100" QNED UHD Smart TV
Enter Product SKU: LG-100QNED86A6
🔎 Page Title: https://www.google.com/search?q=LG%20100%22%20QNED%20UHD%20Smart%20TV%20LG-100QNED86A6&sei=JEJRaLygLoGTseMP7JzEwAo

🔗 Top Google Search Results:


In [71]:
import requests
from bs4 import BeautifulSoup
import urllib.parse

def google_search(query, num_results=3):
    """Scrape the first non-Google result links for a free-text query.

    Args:
        query: Search text; URL-encoded before the request is sent.
        num_results: Maximum number of links to return.

    Returns:
        Up to ``num_results`` decoded target URLs, excluding links that
        point back to google.com hosts; [] on a non-200 response.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/114.0.0.0 Safari/537.36"
    }

    # Construct Google Search URL
    query_encoded = urllib.parse.quote_plus(query)
    url = f"https://www.google.com/search?q={query_encoded}&hl=en"

    # Send GET request; a timeout keeps a stalled connection from hanging the cell.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        # e.g. HTTP 429 when Google rate-limits: report it instead of
        # silently parsing the block page into an empty list.
        print(f"❌ Failed to fetch: HTTP {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all search result links
    links = []
    for a_tag in soup.find_all("a", href=True):
        if len(links) >= num_results:
            break
        href = a_tag["href"]
        if not href.startswith("/url?q="):
            continue
        try:
            # parse_qs splits on "&" and percent-decodes the redirect target.
            targets = urllib.parse.parse_qs(urllib.parse.urlsplit(href).query).get("q")
            if not targets:
                continue
            actual_url = targets[0]
            host = (urllib.parse.urlsplit(actual_url).hostname or "").lower()
        except ValueError:
            continue  # malformed URL in one anchor; keep scanning
        # Compare the host, not the whole URL, so pages that merely mention
        # "google.com" in their path or query are not thrown away.
        if host == "google.com" or host.endswith(".google.com"):
            continue
        links.append(actual_url)

    return links

# Example usage
# Fixed sample query, unrelated to the product search above; uses google_search's default num_results.
query = "best Python IDEs 2025"
top_links = google_search(query)

# Print the results as a 1-based numbered list; only the header appears when no links were parsed.
print("Top Google Results:")
for i, link in enumerate(top_links, 1):
    print(f"{i}. {link}")


Top Google Results:
