<a href="https://colab.research.google.com/github/simodepth96/Robots.Txt-Competitor-Analysis/blob/main/Robots_txt_Competitor_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install advertools
!pip install plotly



In [3]:
#@title Fetch all Robots.txt files

import advertools as adv
import pandas as pd
import requests
import time

# List of robots.txt URLs
robotstxt_urls = [
'https://www.bbc.com/robots.txt',
'https://www.theguardian.com/robots.txt',
'https://www.thesun.co.uk/robots.txt',
'https://www.mirror.co.uk/robots.txt'
]

robots_dfs = []

# Loop through each URL, fetch with requests, and parse
for url in robotstxt_urls:
    try:
        print(f"Fetching: {url}")
        # Use advertools.robotstxt_to_df with the URL
        df = adv.robotstxt_to_df(robotstxt_url=url)
        df['robots_url'] = url
        robots_dfs.append(df)

    except Exception as e:
        print(f"Error processing {url}: {e}")

    time.sleep(5)  # Be polite to servers

robots_df = pd.concat(robots_dfs, ignore_index=True)
robots_df

INFO:root:Getting: https://www.bbc.com/robots.txt


Fetching: https://www.bbc.com/robots.txt


INFO:root:Getting: https://www.theguardian.com/robots.txt


Fetching: https://www.theguardian.com/robots.txt


INFO:root:Getting: https://www.thesun.co.uk/robots.txt


Fetching: https://www.thesun.co.uk/robots.txt


INFO:root:Getting: https://www.mirror.co.uk/robots.txt


Fetching: https://www.mirror.co.uk/robots.txt


Unnamed: 0,directive,content,robotstxt_url,download_date,robots_url,etag,robotstxt_last_modified
0,comment,version: c6c2d0415f0c1d97565a01952c7b2146e2abe17f,https://www.bbc.com/robots.txt,2025-07-19 15:55:19.768046+00:00,https://www.bbc.com/robots.txt,,NaT
1,comment,HTTPS www.bbc.com,https://www.bbc.com/robots.txt,2025-07-19 15:55:19.768046+00:00,https://www.bbc.com/robots.txt,,NaT
2,User-agent,*,https://www.bbc.com/robots.txt,2025-07-19 15:55:19.768046+00:00,https://www.bbc.com/robots.txt,,NaT
3,Sitemap,https://www.bbc.com/sitemaps/https-index-com-a...,https://www.bbc.com/robots.txt,2025-07-19 15:55:19.768046+00:00,https://www.bbc.com/robots.txt,,NaT
4,Sitemap,https://www.bbc.com/sitemaps/https-index-com-n...,https://www.bbc.com/robots.txt,2025-07-19 15:55:19.768046+00:00,https://www.bbc.com/robots.txt,,NaT
...,...,...,...,...,...,...,...
397,Crawl-delay,0,https://www.mirror.co.uk/robots.txt,2025-07-19 15:55:35.422996+00:00,https://www.mirror.co.uk/robots.txt,,2024-12-09 16:33:15
398,User-agent,bingbot,https://www.mirror.co.uk/robots.txt,2025-07-19 15:55:35.422996+00:00,https://www.mirror.co.uk/robots.txt,,2024-12-09 16:33:15
399,Crawl-delay,1,https://www.mirror.co.uk/robots.txt,2025-07-19 15:55:35.422996+00:00,https://www.mirror.co.uk/robots.txt,,2024-12-09 16:33:15
400,User-agent,Meta-ExternalAgent,https://www.mirror.co.uk/robots.txt,2025-07-19 15:55:35.422996+00:00,https://www.mirror.co.uk/robots.txt,,2024-12-09 16:33:15


In [4]:
#@title Data Pre-processing - remove unwanted headers & special characters from directives
import re
robots_df = robots_df.drop(columns=['etag', 'robotstxt_url', 'download_date', 'robotstxt_last_modified'], errors='ignore')

# Keep rows where 'directive' is 'User-agent' or 'Disallow'
robots_df = robots_df[robots_df['directive'].isin(['User-agent', 'Disallow'])]

# Function to clean content using regex
def clean_content(text):
    """
    Clean text by removing special characters and keeping only alphanumeric characters and spaces.

    Args:
        text (str): Input text to clean

    Returns:
        str: Cleaned text with only alphanumeric characters and spaces
    """
    if pd.isna(text):
        return text

    # Convert to string if not already
    text = str(text)

    # Use regex to keep only alphanumeric characters and spaces
    # This removes all special characters like *, =, **, etc.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Remove extra whitespace and strip
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text

# Apply the cleaning function to the 'content' column
robots_df['content'] = robots_df['content'].apply(clean_content)

robots_df

Unnamed: 0,directive,content,robots_url
2,User-agent,,https://www.bbc.com/robots.txt
35,Disallow,asset,https://www.bbc.com/robots.txt
36,Disallow,bitesize search,https://www.bbc.com/robots.txt
37,Disallow,bitesize search,https://www.bbc.com/robots.txt
38,Disallow,bitesize search,https://www.bbc.com/robots.txt
...,...,...,...
395,Disallow,,https://www.mirror.co.uk/robots.txt
396,User-agent,grapeshot,https://www.mirror.co.uk/robots.txt
398,User-agent,bingbot,https://www.mirror.co.uk/robots.txt
400,User-agent,Meta ExternalAgent,https://www.mirror.co.uk/robots.txt


#User-Agents Table

##Quick heads-up
Empty content header from the following `user-agent` table means a wildcard for all user-agents


In [5]:
user_agent = robots_df.copy()
user_agent = user_agent[user_agent['directive'] == 'User-agent']
#user_agent.to_excel('user_agent.xlsx',index=False)
user_agent

Unnamed: 0,directive,content,robots_url
2,User-agent,,https://www.bbc.com/robots.txt
82,User-agent,Amazonbot,https://www.bbc.com/robots.txt
84,User-agent,magpie crawler,https://www.bbc.com/robots.txt
86,User-agent,CCBot,https://www.bbc.com/robots.txt
92,User-agent,Claude Web,https://www.bbc.com/robots.txt
...,...,...,...
392,User-agent,anthropic ai,https://www.mirror.co.uk/robots.txt
394,User-agent,Claude Web,https://www.mirror.co.uk/robots.txt
396,User-agent,grapeshot,https://www.mirror.co.uk/robots.txt
398,User-agent,bingbot,https://www.mirror.co.uk/robots.txt


#CAVEAT
It might be necessary for you to download export the output from above and do a bit of manual clean up in Excel.

You can then import the XLSX cleaned file in the first line of code just below

In [None]:
# Upload a xlsx file
import pandas as pd
user_agent_cleaned = pd.read_excel('/content/directive.xlsx')

'''
From now on make sure you replace
user_agent
 with
user_agent_cleaned
'''

In [6]:
#@title User-Agent Table pre-processing & Rule-based classification

# Fix column name if there's a typo
if 'directive]' in user_agent.columns:
    user_agent = user_agent.rename(columns={'directive]': 'directive'})

# Drop irrelevant columns (optional, if they exist)
user_agent = user_agent.drop(
    columns=['etag', 'download_date', 'robotstxt_last_modified'],
    errors='ignore'
)

# Define the user agent classification function
def classify_user_agent(value):
    if pd.isna(value):
        return "Other/Unclassified"

    v = value.lower().strip()

    # Google/Alphabet
    if any(k in v for k in ["googlebot", "google extended", "google cloudvertexbot", "mediapartners google"]):
        return "Google"

    # OpenAI
    elif any(k in v for k in ["gptbot", "chatgpt user", "oai searchbot"]):
        return "OpenAI"

    # Anthropic
    elif any(k in v for k in ["claude web", "claudebot", "anthropic ai"]):
        return "Anthropic"

    # Meta
    elif any(k in v for k in ["facebookbot", "meta externalagent"]):
        return "Meta"

    # Microsoft
    elif "bingbot" in v:
        return "Bing"

    # Apple
    elif any(k in v for k in ["applebot"]):
        return "Apple"

    # Yandex
    elif any(k in v for k in ["yandex"]):
        return "Yandex"

    # ByteDance
    elif "bytespider" in v:
        return "ByteDance"

    # Huawei
    elif "petalbot" in v:
        return "Huawei"

    # Cohere
    elif "cohere ai" in v:
        return "Cohere"

    # Perplexity
    elif "perplexity" in v:
        return "Perplexity"

    # Baidu
    elif any(k in v for k in ["baiduspider","baidubaikebot"]):
        return "Baidu"

    #Amazon
    elif "amazonbot" in v:
      return "Amazon"

    # SEO/Marketing Tools
    elif any(k in v for k in ["ahrefsbot", "mj12bot", "awario", "sentione", "meltwater", "grapeshot", "semetrical"]):
        return "SEO/Marketing Tools"

    # Other Bots
    elif any(k in v for k in ["slurp", "ccbot", "scrapy", "magpie crawler", "coccocbot", "newsnow", "news please",
                             "rogerbot", "daumoa", "sosospider", "ia archiver", "omgili", "piplbot",
                             "imagesift", "jenkersbot", "scalepostai", "buck"]):
        return "General Crawlers/Scrapers"

# Apply classification to the 'content' column
user_agent['user_agent_bucket'] = user_agent['content'].apply(classify_user_agent)

# Preview result
user_agent.value_counts('user_agent_bucket')

Unnamed: 0_level_0,count
user_agent_bucket,Unnamed: 1_level_1
General Crawlers/Scrapers,24
Anthropic,12
SEO/Marketing Tools,8
Meta,6
Google,6
Apple,5
Perplexity,5
OpenAI,5
Yandex,4
ByteDance,3


In [9]:
#@title Plotting a Heatmap of Blocked User-Agents

import pandas as pd
import plotly.express as px

# Function to extract clean site names from robots.txt URLs
def get_site_name(url):
    """Extract clean site name from robots.txt URL"""
    url_mapping = {
        'https://www.bbc.com/robots.txt': 'BBC',
        'https://www.theguardian.com/robots.txt': 'The Guardian',
        'https://www.thesun.co.uk/robots.txt': 'The Sun',
        'https://www.mirror.co.uk/robots.txt': 'The Mirror'
    }

    # Return mapped name if exact match, otherwise extract from URL
    if url in url_mapping:
        return url_mapping[url]

    # Fallback: extract domain name
    if 'bbc.com' in url:
        return 'BBC'
    elif 'theguardian.com' in url:
        return 'The Guardian'
    elif 'thesun.co.uk' in url:
        return 'The Sun'
    elif 'mirror.co.uk' in url:
        return 'The Mirror'
    else:
        # Generic extraction from domain
        domain = url.replace('https://www.', '').replace('http://www.', '').replace('/robots.txt', '')
        return domain.replace('.com', '').replace('.co.uk', '').title()

# Create a cross tab of robots_url and user_agent_bucket
heatmap_data = pd.crosstab(user_agent['robots_url'], user_agent['user_agent_bucket'])

# Create mapping of site names
site_names = [get_site_name(url) for url in heatmap_data.index]

# Create the heatmap
fig = px.imshow(
    heatmap_data,
    labels=dict(x="User-Agents", y="Websites Robots.txt", color="Count"),
    x=heatmap_data.columns,
    y=site_names,  # Use clean site names instead of URLs
    color_continuous_scale="viridis",
    text_auto=True  # adds value annotations like sns.heatmap(annot=True)
)

fig.update_layout(
    title="Distribution of Blocked User-Agents",
    height=max(400, len(heatmap_data.index) * 60),  # Adjust height based on number of sites
    margin=dict(l=120, r=50, t=80, b=50)  # Increase left margin for site names
)

fig.show()

#Disallow Directive Table

In [10]:
directive = robots_df.copy()
directive = directive[directive['directive'] == 'Disallow']
directive.to_excel('directive.xlsx',index=False)
directive

Unnamed: 0,directive,content,robots_url
35,Disallow,asset,https://www.bbc.com/robots.txt
36,Disallow,bitesize search,https://www.bbc.com/robots.txt
37,Disallow,bitesize search,https://www.bbc.com/robots.txt
38,Disallow,bitesize search,https://www.bbc.com/robots.txt
39,Disallow,cbbc search,https://www.bbc.com/robots.txt
...,...,...,...
389,Disallow,,https://www.mirror.co.uk/robots.txt
391,Disallow,,https://www.mirror.co.uk/robots.txt
393,Disallow,,https://www.mirror.co.uk/robots.txt
395,Disallow,,https://www.mirror.co.uk/robots.txt


In [None]:
#@title Directive Table pre-processing & Rule-based classification

# Fix column name if there's a typo
if 'content]' in directive_cleaned.columns:
    directive_cleaned = directive_cleaned.rename(columns={'content]': 'content'})

# Drop irrelevant columns (optional, if they exist)
directive_cleaned = directive_cleaned.drop(
    columns=['etag', 'download_date', 'robotstxt_last_modified'],
    errors='ignore'
)

# Filter for relevant directives (if not already done)
directive_cleaned = directive_cleaned[
    directive_cleaned['directive'].isin(['User-agent', 'Disallow'])
]

# Clean 'content' column
import re
def clean_content(text):
    if pd.isna(text):
        return text
    text = str(text)
    text = re.sub(r'[^a-zA-Z0-9\s?._-]', ' ', text)  # allow query-style URLs
    text = re.sub(r'\s+', ' ', text).strip()
    return text

directive_cleaned['content'] = directive_cleaned['content'].apply(clean_content)

# Define the clustering bucket function

'''
The whole list of URL strings that follows in brackets is editable based on the URL strings the actual competition is blocking.
Feel free to adjust to your needs.
'''

def classify_bucket(value):
    if pd.isna(value):
        return "Other/Unclassified"
    v = value.lower()
    if any(k in v for k in ["accedi", "account", "login", "logout", "signup", "register", "passwordreset","live-chat","storedetails", "countrycheck","legalarea", "livechat", "customercare"]):
        return "User Account & Auth"
    elif any(k in v for k in ["cart", "checkout", "carrello","?ctart","ctart"]):
        return "Cart & Checkout"
    elif any(k in v for k in ["search", "?q","?cgid","s"]):
        return "Internal Search"
    elif any(k in v for k in ["q","page", "?sz", "srule", "sort", "prefn", "prefv", "pmax", "pmin", "filter", "viewmode","dwvar","cgid","sz","src","product", "dwvar_", "size"]):
        return "Product Filtering"
    elif any(k in v for k in ["outlet", "promo", "rebajas", "saldi", "rasprodazha", "sale", "soldes", "mothersday"]):
        return "Promotions & Sales"
    elif "wishlist" in v:
        return "Wishlist"
    elif any(k in v for k in ["coord", "ss21","hidden categories","mothers day gift guide"]):
        return "Collections"
    elif any(k in v for k in ["store details", "georedirect"]):
        return "Store Locator"
    elif any(k in v for k in ["on demandware store","home","ondemandware", "actionajax", "noname", "autlyet", "cid", "?src","row"]):
        return "System/Internal"
    else:
        return "International Subfolders"

# Apply bucket classification directly to the 'content' column
directive_cleaned['cluster_bucket'] = directive_cleaned['content'].apply(classify_bucket)

# Preview result
#directive_cleaned[['directive', 'content', 'cluster_bucket','robots_url']].head(20)
directive_cleaned.value_counts('cluster_bucket')

Unnamed: 0_level_0,count
cluster_bucket,Unnamed: 1_level_1
International Subfolders,156
Internal Search,126
Product Filtering,21
User Account & Auth,20
Cart & Checkout,16
System/Internal,6
Promotions & Sales,3
Other/Unclassified,1
Store Locator,1


#CAVEAT
It might be necessary for you to export the output from above and do a bit of manual clean up in Excel.

You can then import the XLSX cleaned file in the first line of code just below



In [None]:
# Upload a xlsx file
import pandas as pd
directive_cleaned = pd.read_excel('/content/directive.xlsx')

'''
From now on make sure you replace
directive
 with
directive_cleaned
'''

##Rule-based classification - Ask Claude for Help

If clustering hundreds and hundreds of folders results as a bit of an overkill, you can always ask Claude for some help.

1. Export the Directive Table in XLSX
2. Copy  the list of blocked directives ("content" header)
3. paste in Claude with the following prompt:



> Please, apply a rule-based classification to label the following list of terms.

> Once you've ascertained where these terms belong, please translate in python to do the clustering.

> Please, make sure the output is concatenated as an additional header to the existing "directive" dataframe

All you have to do now is to copy paste the python code in a new cell and follow along

In [13]:
#@title NEWS EXAMPLE FROM CLAUDE - Directive Table preprocessing & rule-based classification
import re

def classify_content_term(term):
    """
    Classify content terms into functional categories
    """
    if pd.isna(term):
        return "Other/Unclassified"

    # Convert to lowercase and clean
    t = str(term).lower().strip()

    # Search & Navigation
    if any(keyword in t for keyword in ['search', 'chwilio', 'websearch', 'find', 'query']):
        return "Search & Navigation"

    # Educational Content
    elif any(keyword in t for keyword in ['bitesize', 'education', 'newsround', 'learning', 'curriculum']):
        return "Educational Content"

    # Food & Lifestyle
    elif any(keyword in t for keyword in ['food', 'recipes', 'menus', 'shopping list', 'favourites', 'cooking']):
        return "Food & Lifestyle"

    # User Management & Authentication
    elif any(keyword in t for keyword in ['users', 'userinfo', 'login', 'sso', 'profile', 'auth', 'account', 'user']):
        return "User Management & Auth"

    # Media & Entertainment
    elif any(keyword in t for keyword in ['sounds', 'music', 'artist', 'album', 'radio', 'audio', 'player', 'tv']):
        return "Media & Entertainment"

    # Sports
    elif any(keyword in t for keyword in ['sport', 'olympics', 'horseracing', 'racecards', 'results', 'medals', 'events']):
        return "Sports"

    # Technical/System
    elif any(keyword in t for keyword in ['ajax', 'css', 'js', 'php', 'api', 'embed', 'wp', 'admin', 'apps', 'json', 'xml', 'asset']):
        return "Technical/System"

    # News & Articles
    elif any(keyword in t for keyword in ['news', 'articles', 'headline', 'stories', 'feedarticle', 'most read', 'breaking']):
        return "News & Articles"

    # User-Generated Content
    elif any(keyword in t for keyword in ['ugc', 'comment', 'discussion', 'report abuse', 'permalink', 'handlers']):
        return "User-Generated Content"

    # External Services & Integrations
    elif any(keyword in t for keyword in ['whsmiths', 'overture', 'brightcove', 'tealium', 'external', 'third party']):
        return "External Services"

    # Commerce & Shopping
    elif any(keyword in t for keyword in ['shop', 'buy', 'cart', 'checkout', 'payment', 'order']):
        return "Commerce & Shopping"

    # Help & Support
    elif any(keyword in t for keyword in ['help', 'support', 'contact', 'feedback', 'faq']):
        return "Help & Support"

    # Archive & Historical
    elif any(keyword in t for keyword in ['archive', 'historical', 'past', 'old']):
        return "Archive & Historical"

    # Entertainment & Lifestyle Content
    elif any(keyword in t for keyword in ['celeb', 'celebrity', 'weird', 'cartoons', 'lifestyle', 'entertainment']):
        return "Entertainment & Lifestyle"

    # Geographic/Location
    elif any(keyword in t for keyword in ['travel', 'location', 'seaside', 'uk', 'local']):
        return "Geographic/Location"

    else:
        return "Other/Unclassified"

# Apply the classification to your directive dataframe
# Assuming your dataframe is called 'directive' and has a 'content' column
directive['content_category'] = directive['content'].apply(classify_content_term)

# Display the distribution of categories
directive['content_category'].value_counts()

Unnamed: 0_level_0,count
content_category,Unnamed: 1_level_1
Other/Unclassified,130
Search & Navigation,28
Sports,12
User-Generated Content,11
Technical/System,9
User Management & Auth,9
News & Articles,8
Commerce & Shopping,7
Educational Content,7
Media & Entertainment,5


In [19]:
#@title Plotting a Heatmap of Disallowed Directives

import pandas as pd
import plotly.express as px

# Function to extract clean site names from robots.txt URLs
def get_site_name(url):
    """Extract clean site name from robots.txt URL"""
    url_mapping = {
        'https://www.bbc.com/robots.txt': 'BBC',
        'https://www.theguardian.com/robots.txt': 'The Guardian',
        'https://www.thesun.co.uk/robots.txt': 'The Sun',
        'https://www.mirror.co.uk/robots.txt': 'The Mirror'
    }

    # Return mapped name if exact match, otherwise extract from URL
    if url in url_mapping:
        return url_mapping[url]

    # Fallback: extract domain name
    if 'bbc.com' in url:
        return 'BBC'
    elif 'theguardian.com' in url:
        return 'The Guardian'
    elif 'thesun.co.uk' in url:
        return 'The Sun'
    elif 'mirror.co.uk' in url:
        return 'The Mirror'
    else:
        # Generic extraction from domain
        domain = url.replace('https://www.', '').replace('http://www.', '').replace('/robots.txt', '')
        return domain.replace('.com', '').replace('.co.uk', '').title()

# Create a cross tab of robots_url and directive_bucket
heatmap_data = pd.crosstab(directive['robots_url'], directive['content_category'])

# Create mapping of site names
site_names = [get_site_name(url) for url in heatmap_data.index]

# Create the heatmap
fig = px.imshow(
    heatmap_data,
    labels=dict(x="Disallowed Directives", y="Websites Robots.txt", color="Count"),
    x=heatmap_data.columns,
    y=site_names,  # Use clean site names instead of URLs
    color_continuous_scale="viridis",
    text_auto=True  # adds value annotations like sns.heatmap(annot=True)
)

fig.update_layout(
    title="Distribution of Disallowed Directives",
    height=max(400, len(heatmap_data.index) * 60),  # Adjust height based on number of sites
    margin=dict(l=120, r=50, t=80, b=50)  # Increase left margin for site names
)

fig.show()