## Web-Scrape Metadata of [Open Graph Protocol](https://ogp.me/)-Compliant Posts/Articles


In [1]:
from datetime import datetime, timedelta, timezone

# Record the notebook start time as a *naive* UTC timestamp; the last cell
# subtracts it from the end time to report the total elapsed time.
# `datetime.utcnow()` is deprecated since Python 3.12, so take an aware UTC
# reading and strip the tzinfo: the value and its printed format are the same
# as before, and it stays subtractable from other naive UTC datetimes.
nb_st = datetime.now(timezone.utc).replace(tzinfo=None)
print(f"\nNotebook START time: {nb_st} UTC\n")


Notebook START time: 2022-08-14 07:11:34.039013 UTC



In [2]:
%%HTML
<!-- Presentation helper: hides the notebook's code-input cells on load and
     adds a button that toggles their visibility. Relies on jQuery ($) being
     provided by the hosting page. -->
<style>
/* On narrow screens let output areas use the full available width. */
@media (max-width: 540px) {
  .output .output_subarea {
    max-width: 100%;
  }
}
</style>
<script>
  // Whether the code-input cells are currently visible. Declared explicitly
  // instead of being created as an implicit global on first assignment.
  var code_shown = false;

  // Show or hide every `div.input` cell and update the button label to match.
  // Durations are passed as numbers (milliseconds): a string such as '500' is
  // not one of jQuery's named speeds, so it would not be read as 500 ms.
  // Labels use ASCII-only escapes so a mis-decoded file cannot garble them:
  //   \uD83D\uDD0E = U+1F50E (magnifying glass), \u2326 = U+2326 (erase-right).
  function code_toggle() {
    if (code_shown) {
      $('div.input').hide(500);
      $('#toggleButton').val('\uD83D\uDD0E Show Python Code');
    } else {
      $('div.input').show(500);
      $('#toggleButton').val('\u2326 Hide Python Code');
    }
    code_shown = !code_shown;
  }

  $(document).ready(function () {
    code_shown = false;
    $('div.input').hide();
    // Drop the "input" class from the %%HTML and %%capture cells after hiding
    // them, so the `div.input` selector used by the toggle no longer matches
    // them.
    $('div.input:contains("%%HTML")').removeClass("input");
    $('div.input:contains("%%capture")').removeClass("input");
  });
</script>
<form action="javascript:code_toggle()">
  <!-- &#x1F50E; is the same magnifying-glass glyph used by code_toggle(). -->
  <input type="submit" id="toggleButton" value="&#x1F50E; Show Python Code"
         class="btn btn-default btn-lg">
</form>

In [3]:
import pandas as pd
from pathlib import Path
import warnings
import requests
from bs4 import BeautifulSoup
import json

In [4]:
warnings.filterwarnings(action='error')

In [5]:
# --- Notebook configuration -------------------------------------------------
# Root directory for the generated files: a `docs` folder under the current
# working directory.
OUTPUT_DIR = Path.cwd() / 'docs'
# Language whose output is written directly into OUTPUT_DIR; every other
# language gets its own sub-directory.
BASE_LANGUAGE = 'en'
# Set to True to print progress details while the notebook runs.
VERBOSE = False

In [6]:
# Load the table of shared posts from a path relative to the working
# directory. The scraping cell reads its 'Language' and 'URL' columns.
df = pd.read_csv(filepath_or_buffer="data/shares.csv")

# In verbose mode, render the loaded table for inspection.
if VERBOSE: display(df)

In [7]:
# Seconds to wait for a server before giving up. Without a timeout,
# `requests.get` can block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 30

# Meta-tag properties to extract, in output order, each paired with a flag
# telling whether the property may occur several times (collected as a list).
# The output key is the property name with ':' replaced by '_'.
OGP_PROPERTIES = (
    ("og:url", False),
    ("og:type", False),
    ("og:title", False),
    ("og:description", False),
    ("og:image", False),
    ("og:image:alt", False),
    ("og:image:type", False),
    ("og:image:width", False),
    ("og:image:height", False),
    ("article:published_time", False),
    ("article:modified_time", False),
    ("article:publisher", False),
    ("article:author", True),
    ("article:section", False),
    ("article:tag", True),
)


def _meta_content(soup, prop, url):
    """Return the `content` of the first `<meta property=prop>` tag in `soup`.

    Raises:
        ValueError: if the tag is absent or has no `content` attribute. The
            message names the property and `url` so the failing page can be
            identified, instead of an opaque AttributeError on None.
    """
    tag = soup.find("meta", attrs={"property": prop})
    if tag is None or "content" not in tag.attrs:
        raise ValueError(
            f'Missing <meta property="{prop}"> tag with a content attribute '
            f'at {url}')
    return tag.attrs["content"]


def _meta_contents(soup, prop):
    """Return the `content` of every `<meta property=prop>` tag in `soup`.

    The list is empty when the page has no such tag.
    """
    return [
        tag.attrs["content"]
        for tag in soup.find_all("meta", attrs={"property": prop})
    ]


def _extract_ogp(soup, url):
    """Build one output record (dict) with all OGP_PROPERTIES of a page."""
    record = {}
    for prop, multiple in OGP_PROPERTIES:
        key = prop.replace(":", "_")
        if multiple:
            record[key] = _meta_contents(soup, prop)
        else:
            record[key] = _meta_content(soup, prop, url)
    return record


# Scraped records grouped by language: {language: [record, ...]}.
ogp_data = {}

for idx, row in df.iterrows():
    language, url = row['Language'], row['URL']

    if VERBOSE:
        print(f"*** Processing: ({idx}) [{language}] {url}")

    if language not in ogp_data:
        ogp_data[language] = []

        if VERBOSE:
            print(" ** New Language: ", language)

    response = requests.get(url, timeout=REQUEST_TIMEOUT)

    # Verbose mode always reports the status; otherwise only non-2xx responses
    # are reported, together with the row they belong to.
    if VERBOSE:
        print("  * Response status code:", response.status_code)
    elif not 200 <= response.status_code < 300:
        print(f"*** Processing: ({idx}) [{language}] {url}")
        print("  * Response status code:", response.status_code)

    # Name the parser explicitly: otherwise Beautiful Soup picks whichever
    # parser happens to be installed, so results can differ between machines.
    soup = BeautifulSoup(response.text, "html.parser")

    ogp_data[language].append(_extract_ogp(soup, url))

In [8]:
# Write one `ogp.json` file per language found while scraping.
if VERBOSE:
    print("*** Languages encountered:", ", ".join(ogp_data))

for lang_code, records in ogp_data.items():

    # The base language is written directly into OUTPUT_DIR; any other
    # language goes into a sub-directory named after it.
    target_dir = OUTPUT_DIR if lang_code == BASE_LANGUAGE else OUTPUT_DIR / lang_code
    target_dir.mkdir(parents=True, exist_ok=True)
    target_file = target_dir / "ogp.json"

    if VERBOSE:
        print(f"  * Writing [{lang_code}]:", target_file.relative_to(Path.cwd()))

    # Serialize first, then write the tab-indented JSON text in one go.
    serialized = json.dumps(records, indent="\t")
    with open(target_file, 'w') as out:
        out.write(serialized)

In [9]:
from datetime import timezone  # local import keeps this cell self-contained

# Take ONE naive-UTC reading so the elapsed time and the END time printed
# below come from the same instant. `datetime.utcnow()` is deprecated since
# Python 3.12, and because an earlier cell turns every warning into an error
# it would raise here; `datetime.now(timezone.utc)` with tzinfo stripped gives
# the same naive value, which is subtractable from the naive `nb_st`.
nb_end = datetime.now(timezone.utc).replace(tzinfo=None)
print(f"\n ** Total Elapsed time: {nb_end - nb_st} ** \n")
print(f"Notebook END time: {nb_end} UTC\n")


 ** Total Elapsed time: 0:00:08.070057 ** 

Notebook END time: 2022-08-14 07:11:42.109550 UTC

