<a href="https://colab.research.google.com/github/simodepth/Structured-data/blob/main/Scrape_%26_Benchmark_Structured_Data_in_bulk_with_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to extract and compare structured data against competitors in bulk

Structured data is a goldmine among your technical SEO assets: a detailed implementation of schemas may positively impact CTR and overall visibility by encouraging Google to enrich your web pages with eye-catching SERP features.

If you're looking for a quick way to retrieve not only your own website's structured data but also the markup used by your competitors, this script will save you a lot of time-consuming manual research.


In [None]:
#@title Install Modules
!pip install extruct
!pip install w3lib.htmml

Collecting extruct
  Downloading extruct-0.13.0-py2.py3-none-any.whl (25 kB)
Collecting pyrdfa3
  Downloading pyRdfa3-3.5.3-py3-none-any.whl (121 kB)
[K     |████████████████████████████████| 121 kB 5.8 MB/s 
Collecting jstyleson
  Downloading jstyleson-0.0.2.tar.gz (2.0 kB)
Collecting rdflib
  Downloading rdflib-6.1.1-py3-none-any.whl (482 kB)
[K     |████████████████████████████████| 482 kB 32.0 MB/s 
[?25hCollecting html-text>=0.5.1
  Downloading html_text-0.5.2-py2.py3-none-any.whl (7.5 kB)
Collecting mf2py
  Downloading mf2py-1.1.2.tar.gz (25 kB)
Collecting w3lib
  Downloading w3lib-1.22.0-py2.py3-none-any.whl (20 kB)
Collecting rdflib-jsonld
  Downloading rdflib_jsonld-0.6.2-py2.py3-none-any.whl (4.0 kB)
Collecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[K     |████████████████████████████████| 41 kB 527 kB/s 
Building wheels for collected packages: jstyleson, mf2py
  Building wheel for jstyleson (setup.py) ... [?25l[?25hdone
  Created wheel for js

In [None]:
#@title Import Modules
import pandas as pd
import extruct
import requests
from w3lib.html import get_base_url
from urllib.parse import urlparse

# List the competing URLs to scrape (add as many as you like)

In [None]:
# Pages to audit. Every entry is fetched once by each of the loops that
# build the comparison tables further down.
sites = [
    'https://www.liverpoolfc.com/',
    'https://www.liverpoolfc.com/news',
    'https://legacy.liverpoolfc.com/',
    'https://video.liverpoolfc.com/',
    'https://matchcentre.liverpoolfc.com/',
]

# Extract the metadata for one sample page

In [None]:
def extract_metadata(url, timeout=30):
    """Download a page and extract its structured data with extruct.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before `requests` gives up
            and raises a timeout error. Without a timeout a single
            unresponsive host would stall the whole scraping loop.

    Returns:
        The dict produced by `extruct.extract`, keyed by syntax name
        ('json-ld', 'microdata', 'opengraph', 'rdfa'); each value is the
        list of items found for that syntax (empty when none were found).
    """
    r = requests.get(url, timeout=timeout)

    # Resolve relative links against the page's effective base URL
    # (r.url is the final address after any redirects).
    base_url = get_base_url(r.text, r.url)

    metadata = extruct.extract(r.text,
                               base_url=base_url,
                               # Ask extruct for a uniform item layout across syntaxes.
                               uniform=True,
                               syntaxes=['json-ld',
                                         'microdata',
                                         'opengraph',
                                         'rdfa'])
    return metadata

In [None]:
# Fetch the homepage as a sample and display the raw extraction result.
# The value is kept in `metadata` for the checks in the following cells.
metadata = extract_metadata('https://www.liverpoolfc.com/')
metadata

{'json-ld': [{'@context': 'https://schema.org',
   '@type': 'WebPage',
   'breadcrumb': 'Home',
   'headline': 'Liverpool FC - Homepage',
   'publisher': {'@type': 'Organization',
    'address': '5 St Paul’s Square, Liverpool, L3 9SLiverpool Football Club and Athletic Grounds Limited Anfield Road, Liverpool, L4 0TH.',
    'description': "Liverpool FC, LFC, is one of the world's most successful Football Clubs, with 48 major Mens Team Honours. LFC play in the Premier League in England, the Club was formed in 1892. Their home stadium is Anfield, Liverpool.",
    'email': 'contactus@liverpoolfc.com',
    'logo': {'@type': 'ImageObject',
     'height': '109',
     'url': 'https://www.liverpoolfc.com/liverpoolfc_crest.png',
     'width': '80'},
    'name': 'Liverpool FC',
    'url': 'https://www.liverpoolfc.com'}}],
 'microdata': [],
 'opengraph': [{'@context': {'og': 'http://ogp.me/ns#'},
   '@type': 'website',
   'og:locale': 'en_GB',
   'og:title': 'Liverpool FC - Homepage',
   'og:url': 

# Investigate whether the URL is using a specific metadata type

In [None]:
def uses_metadata_type(metadata, metadata_type):
    """Tell whether `metadata` holds at least one item for `metadata_type`.

    Args:
        metadata: Mapping from syntax name to the collection of extracted items.
        metadata_type: Syntax name to look up, e.g. 'json-ld'.

    Returns:
        True when the key is present and its value is non-empty,
        False when the key is missing or its value has length zero.
    """
    # Guard clause: a syntax that was never extracted counts as unused.
    if metadata_type not in metadata:
        return False
    return len(metadata[metadata_type]) > 0

In [None]:
# Does the sample page expose any Open Graph items?
uses_metadata_type(metadata, 'opengraph')

True

In [None]:
# Does the sample page expose any RDFa items?
uses_metadata_type(metadata, 'rdfa')

True

In [None]:
# Does the sample page expose any JSON-LD items?
uses_metadata_type(metadata, 'json-ld')

True

In [None]:
# Does the sample page expose any microdata items?
uses_metadata_type(metadata, 'microdata')

False

In [None]:
#@title Extract metadata usage for each site
# Syntaxes to report on, in the column order of the resulting table.
metadata_types = ['microdata', 'json-ld', 'opengraph', 'rdfa']

# Collect one record per page and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copies the whole frame on every iteration.
rows = []
for url in sites:
    metadata = extract_metadata(url)
    urldata = urlparse(url)

    # Only the host is recorded, so several pages from the same host
    # produce rows with the same 'url' label.
    row = {'url': urldata.netloc}
    for metadata_type in metadata_types:
        row[metadata_type] = uses_metadata_type(metadata, metadata_type)
    rows.append(row)

# Passing `columns` keeps the expected layout even when `sites` is empty.
df = pd.DataFrame(rows, columns=['url'] + metadata_types)

df.head(10).sort_values(by='microdata', ascending=False)

Unnamed: 0,url,microdata,json-ld,opengraph,rdfa
0,www.liverpoolfc.com,False,True,True,True
1,www.liverpoolfc.com,False,True,True,True
2,legacy.liverpoolfc.com,False,False,True,True
3,video.liverpoolfc.com,False,False,False,True
4,matchcentre.liverpoolfc.com,False,False,True,True


# Examine the specific metadata used

In [None]:
def key_exists(dict, key):
    """Tell whether any top-level item declares `key` as its '@type'.

    Args:
        dict: Iterable of extracted items (despite the name, a list of
            mappings such as metadata['json-ld']). The parameter name is
            kept for backward compatibility; it shadows the builtin
            inside this function.
        key: Schema type to look for, e.g. 'Organization'.

    Returns:
        True if at least one item has '@type' equal to `key`, or a list or
        tuple '@type' that contains `key`; False otherwise. Only the
        top-level items are inspected, nested objects are not searched.
    """
    for item in dict:
        # Items without an '@type' (e.g. an '@graph' wrapper) used to raise
        # KeyError. hasattr is used instead of isinstance(item, dict)
        # because `dict` here is the parameter, not the builtin.
        declared = item.get('@type') if hasattr(item, 'get') else None
        if declared == key:
            return True
        # '@type' may list several types for a single item.
        if isinstance(declared, (list, tuple)) and key in declared:
            return True
    return False

## Scrape specific metadata usage per site
---
We loop over the URLs, scrape each page's HTML, extract the metadata, and then check whether each schema type (Organization, Product, Offer, Review, AggregateRating, BreadcrumbList) is implemented in the page's JSON-LD or microdata markup.

In [None]:
# Re-fetch the homepage metadata and display it again. The loop over
# `sites` reuses the name `metadata`, so after that loop it holds the
# result for the last site rather than the homepage.
metadata = extract_metadata('https://www.liverpoolfc.com/')
metadata

{'json-ld': [{'@context': 'https://schema.org',
   '@type': 'WebPage',
   'breadcrumb': 'Home',
   'headline': 'Liverpool FC - Homepage',
   'publisher': {'@type': 'Organization',
    'address': '5 St Paul’s Square, Liverpool, L3 9SLiverpool Football Club and Athletic Grounds Limited Anfield Road, Liverpool, L4 0TH.',
    'description': "Liverpool FC, LFC, is one of the world's most successful Football Clubs, with 48 major Mens Team Honours. LFC play in the Premier League in England, the Club was formed in 1892. Their home stadium is Anfield, Liverpool.",
    'email': 'contactus@liverpoolfc.com',
    'logo': {'@type': 'ImageObject',
     'height': '109',
     'url': 'https://www.liverpoolfc.com/liverpoolfc_crest.png',
     'width': '80'},
    'name': 'Liverpool FC',
    'url': 'https://www.liverpoolfc.com'}}],
 'microdata': [],
 'opengraph': [{'@context': {'og': 'http://ogp.me/ns#'},
   '@type': 'website',
   'og:locale': 'en_GB',
   'og:title': 'Liverpool FC - Homepage',
   'og:url': 

In [None]:
# Schema types to look for and the syntaxes to look in. Each
# (type, syntax) pair becomes one column named '<type lowercased>-<syntax>',
# e.g. 'organization-json-ld'.
schema_types = ['Organization',
                'Product',
                'Offer',
                'Review',
                'AggregateRating',
                'BreadcrumbList']
schema_syntaxes = ['json-ld', 'microdata']

# Collect one record per page and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending row by row
# copies the whole frame on every iteration.
specific_rows = []
for url in sites:
    metadata = extract_metadata(url)
    urldata = urlparse(url)

    # Only the host is recorded, so several pages from the same host
    # produce rows with the same 'url' label.
    row = {'url': urldata.netloc}
    for schema_type in schema_types:
        for syntax in schema_syntaxes:
            column = f'{schema_type.lower()}-{syntax}'
            row[column] = key_exists(metadata[syntax], schema_type)
    specific_rows.append(row)

# Explicit column list: fixes the column order and keeps the layout
# intact even when `sites` is empty.
specific_columns = ['url'] + [f'{t.lower()}-{s}'
                              for t in schema_types
                              for s in schema_syntaxes]
df_specific = pd.DataFrame(specific_rows, columns=specific_columns)

df_specific.sort_values(by='url', ascending=False).head(3).T


Unnamed: 0,0,1,3
url,www.liverpoolfc.com,www.liverpoolfc.com,video.liverpoolfc.com
organization-json-ld,False,False,False
organization-microdata,False,False,False
product-json-ld,False,False,False
product-microdata,False,False,False
offer-json-ld,False,False,False
offer-microdata,False,False,False
review-json-ld,False,False,False
review-microdata,False,False,False
aggregaterating-json-ld,False,False,False
