In [2]:
from dotenv import load_dotenv
import os
import psycopg
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import time
import requests
import bs4
import pandas as pd
import time
import re
from sqlalchemy import create_engine, Column, String, Integer, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
import psycopg2

In [3]:
# Set the base URL for Search Engine Land
base_url = 'https://searchengineland.com/'

# Create headers to simulate a browser (prevents being blocked)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

# Get the main page first
response = requests.get(base_url, headers=headers)
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# Print the title of the page to confirm it worked
print(soup.title.text)

# Create lists to store our data
article_titles = []
article_urls = []

# Function to check if a URL is a valid article URL
def is_article_url(url):
    # Skip author pages
    if '/author/' in url:
        return False
        
    # Skip category/library pages
    if '/category/' in url or '/library/' in url:
        return False
        
    # Skip tag pages
    if '/tag/' in url:
        return False
        
    # Skip about/contact/advertise pages
    if any(x in url for x in ['/about', '/contact', '/advertise', '/jobs']):
        return False
        
    # Skip archive pages
    if re.search(r'/\d{4}/\d{2}/', url):
        return False
        
    # Skip URLs with just numbers at the end (likely pagination)
    if re.search(r'/\d+/?$', url):
        return False
        
    # Accept URLs that end with a numeric ID (likely an article)
    # Example: https://searchengineland.com/google-drops-ai-while-browsing-feature-453671
    if re.search(r'-\d+/?$', url):
        return True
        
    return False

# Function to extract article headlines and links from a page
def extract_articles(url):
    # Get the page content
    response = requests.get(url, headers=headers)
    
    # Check if request was successful
    if response.status_code != 200:
        print(f"Failed to access {url}, status code: {response.status_code}")
        return
    
    # Parse the HTML
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    
    # Find all links
    links = soup.find_all('a', href=True)
    
    initial_count = len(article_urls)
    
    # Process each link
    for link in links:
        href = link['href']
        
        # Check if it's a Search Engine Land URL
        if 'searchengineland.com' in href:
            # Check if it matches our article URL pattern
            if is_article_url(href):
                title = link.get_text(strip=True)
                
                # Skip empty or very short titles
                if not title or len(title) < 10:
                    continue
                    
                # Only add if not already in our list
                if href not in article_urls:
                    article_titles.append(title)
                    article_urls.append(href)
    
    print(f"Found {len(article_urls) - initial_count} new articles on {url}")

# Start with the homepage
print("Starting to scrape Search Engine Land for article headlines...")
urls_to_visit = [base_url]
already_visited = set()

# Set limit for number of pages to scrape
max_pages = 10
pages_scraped = 0

# Scrape pages until we reach our limit
while urls_to_visit and pages_scraped < max_pages:
    # Get the next URL
    current_url = urls_to_visit.pop(0)
    
    # Skip if already visited
    if current_url in already_visited:
        continue
        
    # Mark as visited
    already_visited.add(current_url)
    pages_scraped += 1
    
    print(f"Scraping page {pages_scraped}/{max_pages}: {current_url}")
    
    # Add delay to be respectful to the server
    time.sleep(2)
    
    # Extract articles from the page
    extract_articles(current_url)
    
    # Get the page content to find more URLs
    response = requests.get(current_url, headers=headers)
    if response.status_code == 200:
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        
        # Find links to category pages and pagination pages
        for link in soup.find_all('a', href=True):
            href = link['href']
            
            # Add category pages but not author or tag pages
            if ('searchengineland.com' in href and 
                (('/category/' in href) or ('/page/' in href)) and
                href not in already_visited and 
                href not in urls_to_visit):
                urls_to_visit.append(href)
                
        # Also try to find pagination links by pattern matching
        # Look for /page/2, /page/3, etc.
        if '/page/' not in current_url:  # Only do this if we're not already on a paginated page
            base_path = current_url.rstrip('/')
            for page_num in range(2, 6):  # Check pages 2 through 5
                pagination_url = f"{base_path}/page/{page_num}/"
                if pagination_url not in already_visited and pagination_url not in urls_to_visit:
                    urls_to_visit.append(pagination_url)

# Create a DataFrame with our data
articles_df = pd.DataFrame({
    'title': article_titles,
    'url': article_urls
})

# Remove duplicates
articles_df = articles_df.drop_duplicates(subset=['url'])

# Save to CSV (keeping this for backup)
articles_df.to_csv("search_engine_land_articles.csv", index=False)

print(f"Completed! Scraped {pages_scraped} pages and found {len(articles_df)} unique articles.")
print(f"Data saved to search_engine_land_articles.csv")

# Print a sample of the articles found
print("\nSample of articles found:")
for i, (title, url) in enumerate(zip(articles_df['title'].head(5), articles_df['url'].head(5))):
    print(f"{i+1}. {title}")
    print(f"   {url}")
    print()

Search Engine Land - News, Search Engine Optimization (SEO), Pay-Per-Click (PPC)
Starting to scrape Search Engine Land for article headlines...
Scraping page 1/10: https://searchengineland.com/
Found 42 new articles on https://searchengineland.com/
Scraping page 2/10: https://searchengineland.com/page/2
Found 36 new articles on https://searchengineland.com/page/2
Scraping page 3/10: https://searchengineland.com/page/3
Found 35 new articles on https://searchengineland.com/page/3
Scraping page 4/10: https://searchengineland.com/page/4
Found 37 new articles on https://searchengineland.com/page/4
Scraping page 5/10: https://searchengineland.com/page/5
Found 40 new articles on https://searchengineland.com/page/5
Scraping page 6/10: https://searchengineland.com/page/2/
Found 0 new articles on https://searchengineland.com/page/2/
Scraping page 7/10: https://searchengineland.com/page/3/
Found 0 new articles on https://searchengineland.com/page/3/
Scraping page 8/10: https://searchengineland.co