# SEO Tools 
## Scraping Metadata 

In [2]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from bs4 import BeautifulSoup

# Request headers that imitate a desktop Chrome browser. Per the author's note,
# requests sent without them kept coming back as HTTP 403.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Referer": "https://www.google.com/",
    "Accept-Language": "en-US,en;q=0.9",
}

# Get sitemap (XML)
sitemap_url = "https://www.schmittchevrolet.com/page-sitemap.xml"
# Always pass a timeout: without one, requests waits indefinitely on an
# unresponsive server and the cell never finishes. 10 seconds matches the
# timeout used for the per-page requests further down. On expiry requests
# raises requests.exceptions.Timeout instead of hanging.
response = requests.get(sitemap_url, headers=headers, timeout=10)

def _clean_text(value):
    """Return ``value`` stripped of surrounding whitespace, or None if it is missing or blank."""
    if value is None:
        return None
    stripped = value.strip()
    return stripped or None


def _meta_content(soup, name):
    """Return the cleaned ``content`` attribute of ``<meta name=...>``.

    Returns None when the tag is absent, has no ``content`` attribute, or the
    attribute is blank. ``.get`` is used instead of ``tag["content"]`` so a
    content-less tag does not raise KeyError and drop the page from the report.
    """
    tag = soup.find("meta", attrs={"name": name})
    if tag is None:
        return None
    return _clean_text(tag.get("content"))


# Scrape every URL listed in the sitemap for its SEO metadata (title, meta
# description, meta keywords) and write one row per successfully fetched page
# to a CSV. Pages that fail to download are reported on stdout and skipped.
if response.status_code == 200:
    # Parse the raw bytes so the XML parser honours the encoding declared in
    # the document rather than the charset requests guessed for `.text`.
    root = ET.fromstring(response.content)

    # Extract all URLs from sitemap, ignoring empty <loc> elements and
    # trimming the whitespace that pretty-printed sitemaps put around URLs.
    namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    urls = [
        loc.text.strip()
        for loc in root.findall(".//ns:loc", namespace)
        if loc.text and loc.text.strip()
    ]

    print(f"Found {len(urls)} URLs in the sitemap.\n")

    # Scrape each page for metadata
    data = []
    for url in urls:
        try:
            page_response = requests.get(url, headers=headers, timeout=10)
            if page_response.status_code != 200:
                print(f"Failed to scrape {url}: {page_response.status_code}")
                continue

            soup = BeautifulSoup(page_response.text, "html.parser")

            # Extract metadata; blank values are normalised to None so the
            # "missing" flags and the placeholders below always agree.
            title = _clean_text(soup.title.text) if soup.title else None
            description = _meta_content(soup, "description")
            keywords = _meta_content(soup, "keywords")

            # Determine if each metadata element is missing (1 = missing, 0 = present)
            missing_title = int(title is None)
            missing_description = int(description is None)
            missing_keywords = int(keywords is None)

            # Append data to list, replacing None with placeholders for readability
            data.append([
                url,
                title or "No Title",
                description or "No Description",
                keywords or "No Keywords",
                missing_title,
                missing_description,
                missing_keywords,
            ])

        except Exception as e:
            # Deliberately best-effort: one bad page must not abort the whole
            # crawl, so report the problem and move on to the next URL.
            print(f"Error scraping {url}: {e}")

    df = pd.DataFrame(data, columns=["URL", "Title", "Description", "Keywords", "Missing_Title", "Missing_Description", "Missing_Keywords"])

    # NOTE(review): the "weber_" prefix does not match the site being scraped;
    # looks like a leftover from another run. Kept unchanged in case anything
    # else reads this path - confirm before renaming.
    output_path = "weber_sitemap_metadata.csv"
    df.to_csv(output_path, index=False, encoding="utf-8")

    # Report the file that was actually written (the old message named a
    # different file than the one passed to to_csv).
    print(f"\n Data saved to {output_path}")

else:
    print(f"Error {response.status_code}: Unable to access sitemap.")


Found 195 URLs in the sitemap.

Failed to scrape https://www.schmittchevrolet.com/chevrolet-business-choice/: 404

 Data saved to sitemap_metadata.csv
