In [5]:
import requests
import pandas as pd
from urllib.parse import urljoin

# Sites whose robots.txt will be analysed. Every entry is the root URL
# "https://www.<domain>/", so only the domain part is spelled out here.
websites = [
    f"https://www.{domain}/"
    for domain in (
        "google.com",
        "facebook.com",
        "amazon.com",
        "twitter.com",
        "wikipedia.org",
        "reddit.com",
        "nytimes.com",
        "bbc.com",
    )
]

In [6]:
def fetch_robots_txt(url):
    """Download the robots.txt that belongs to ``url``.

    The robots.txt location is derived by resolving "/robots.txt" against
    ``url``. The request is made with a 5 second timeout.

    Returns the response body as text when the server answers with HTTP 200;
    returns None for any other status code or when ``requests`` raises a
    ``RequestException``.
    """
    target = urljoin(url, "/robots.txt")
    try:
        reply = requests.get(target, timeout=5)
        # Anything other than a plain 200 is treated as "no robots.txt".
        return reply.text if reply.status_code == 200 else None
    except requests.RequestException:
        return None

In [7]:
def parse_robots_txt(robots_txt):
    """Parse robots.txt content and collect its Allow/Disallow path rules.

    Parameters:
        robots_txt: The robots.txt body as a string, or None/"" when no file
            was available.

    Returns:
        A dict with the keys "Allow" and "Disallow", each mapping to the list
        of path patterns found for that directive, in file order. Rules from
        all user-agent groups are pooled together. Both lists are empty when
        ``robots_txt`` is None or empty.

    Parsing details:
        - Any line ending (LF, CRLF, CR) is accepted, so values never carry a
          stray carriage return.
        - Directive names are matched case-insensitively and whitespace
          around the colon is optional ("disallow:/x" is recognised).
        - Text from "#" to the end of a line is a comment and is dropped.
        - Directives with an empty value (e.g. a bare "Disallow:") are
          skipped, since they do not name a path.
    """
    rules = {
        "Allow": [],
        "Disallow": []
    }
    if not robots_txt:
        return rules

    # Maps the lower-cased directive name to the key used in the result.
    directive_names = {"allow": "Allow", "disallow": "Disallow"}

    for raw_line in robots_txt.splitlines():
        # Drop comments (whole-line or trailing) before looking for a rule.
        line = raw_line.split("#", 1)[0].strip()
        key, separator, value = line.partition(":")
        if not separator:
            continue  # Blank line or text without a "key: value" shape.
        directive = directive_names.get(key.strip().lower())
        value = value.strip()
        if directive and value:
            rules[directive].append(value)
    return rules


In [8]:
data = []

# Build one summary record per website: how many Allow/Disallow rules its
# robots.txt contains, plus a short sample of the disallowed paths.
for site in websites:
    robots_txt = fetch_robots_txt(site)
    rules = parse_robots_txt(robots_txt)

    record = {}
    record["Website"] = site
    record["Allowed Paths"] = len(rules["Allow"])
    record["Disallowed Paths"] = len(rules["Disallow"])
    # Only the first five disallowed paths are kept as an illustration.
    record["Example Disallowed"] = rules["Disallow"][:5]
    data.append(record)

# Tabulate the records, show them, and persist them next to the notebook.
df = pd.DataFrame(data)
print(df)
df.to_csv("robots_txt_analysis.csv", index=False)


                      Website  Allowed Paths  Disallowed Paths  \
0     https://www.google.com/             89               248   
1   https://www.facebook.com/             84               686   
2     https://www.amazon.com/             17               121   
3    https://www.twitter.com/              4                20   
4  https://www.wikipedia.org/              3               459   
5     https://www.reddit.com/              0                 1   
6    https://www.nytimes.com/             19               124   
7        https://www.bbc.com/              0                67   

                                  Example Disallowed  
0        [/search, /sdch, /groups, /index.html?, /?]  
1                                    [/, /, /, /, /]  
2  [/exec/obidos/account-access-login, /exec/obid...  
3  [/search/realtime, /search/users, /search/*/gr...  
4                                    [/, /, /, /, /]  
5                                                [/]  
6  [/ads/, /adx/bin/