From c290fc83c640126f4d7fccaa3219e9c01459c8f4 Mon Sep 17 00:00:00 2001
From: ShivinDass <shivin16091@iiitd.ac.in>
Date: Wed, 27 May 2020 16:19:08 +0530
Subject: [PATCH 1/3] Add support for expanded Open Graph metadata

---
 extruct/rdfa.py                               |  41 ++++++-
 .../misc/expanded_OG_support_test.html        |  40 +++++++
 .../misc/expanded_OG_support_test.json        | 100 ++++++++++++++++++
 tests/test_rdfa.py                            |  11 ++
 4 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 tests/samples/misc/expanded_OG_support_test.html
 create mode 100644 tests/samples/misc/expanded_OG_support_test.json
diff --git a/extruct/rdfa.py b/extruct/rdfa.py
index e5ab06bd..4b4b8e23 100644
--- a/extruct/rdfa.py
+++ b/extruct/rdfa.py
@@ -26,6 +26,24 @@
     "fb": "http://ogp.me/ns/fb#"
 })
 
+_OG_NAMESPACES = {
+  'og': 'http://ogp.me/ns#',
+  'music': 'http://ogp.me/ns/music#',
+  'video': 'http://ogp.me/ns/video#',
+  'article': 'http://ogp.me/ns/article#',
+  'book': 'http://ogp.me/ns/book#',
+  'profile': 'http://ogp.me/ns/profile#'
+}
+
+_OG_NAMESPACES_TAGS = {
+  'og': 'xmlns:og',
+  'music': 'xmlns:music',
+  'video': 'xmlns:video',
+  'article': 'xmlns:article',
+  'book': 'xmlns:book',
+  'profile': 'xmlns:profile'
+}
+
 
 class RDFaExtractor(object):
 
@@ -43,7 +61,28 @@ def extract_items(self, document, base_url=None, expanded=True):
                           vocab_cache_report=False,
                           refresh_vocab_cache=False,
                           check_lite=False)
-
+        document = self.expandedOGSupport(document)
         g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph())
         jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8')
         return json.loads(jsonld_string)
+
+    def expandedOGSupport(self,document):
+      prefixDic = {}
+      for head in document.xpath('//head'):
+        for el in head.xpath('meta[@property and @content]'):
+          prop = el.attrib['property']
+          ns = prop.partition(':')[0]
+          if ns in _OG_NAMESPACES.keys():
+            prefixDic[_OG_NAMESPACES_TAGS[ns]] = _OG_NAMESPACES[ns]
+
+      html_element = None
+      for element in document.iter():
+        if element.tag == 'html':
+          html_element = element
+          break
+
+      if html_element is not None:
+        for k in prefixDic.keys():
+          if not (html_element.get(k)):
+            html_element.set(k,prefixDic[k])
+      return document
\ No newline at end of file
diff --git a/tests/samples/misc/expanded_OG_support_test.html b/tests/samples/misc/expanded_OG_support_test.html
new file mode 100644
index 00000000..0bebb5f0
--- /dev/null
+++ b/tests/samples/misc/expanded_OG_support_test.html
@@ -0,0 +1,40 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="https://www.w3.org/1999/xhtml" xmlns:og="https://ogp.me/ns#" xmlns:fb="https://www.facebook.com/2008/fbml">
+	<head>
+		<title>Himanshu's Open Graph Protocol</title>
+		<meta http-equiv="Content-Type" content="text/html;charset=WINDOWS-1252" />
+	   	<meta http-equiv="Content-Language" content="en-us" />
+	   	<link rel="stylesheet" type="text/css" href="event-education.css" />
+	   	<meta name="verify-v1" content="so4y/3aLT7/7bUUB9f6iVXN0tv8upRwaccek7JKB1gs=" >
+	   	<meta property="og:image" content="https://www.eventeducation.com/images/982336_wedding_dayandouan_th.jpg"/>
+   		<meta property="fb:admins" content="himanshu160"/>
+   		<meta property="og:site_name" content="Event Education"/>
+
+   		<meta property="og:url" content="http://www.nytimes.com/2016/12/15/arts/music/from-steet-theater-to-wagner-on-the-opera-stage.html" />
+		<meta property="og:type" content="article" />
+		<meta property="og:title" content="From Street Theater to Wagner on the Opera Stage" />
+		<meta property="og:description" content="which he set in Bangladesh instead of Norway. The production opens in Madrid on Saturday." />
+		<meta property="article:published" itemprop="datePublished" content="2016-12-15T05:55:55-05:00" />
+		<meta property="article:modified" itemprop="dateModified" content="2016-12-15T06:19:30-05:00" />
+		<meta property="article:section" itemprop="articleSection" content="Music" />
+		<meta property="article:section-taxonomy-id" itemprop="articleSection" content="C5BFA7D5-359C-427B-90E6-6B7245A6CDD8" />
+		<meta property="article:section_url" content="http://www.nytimes.com/section/arts" />
+		<meta property="article:top-level-section" content="arts" />
+		<meta property="fb:app_id" content="9869919170" />
+		<meta property="music:duration" content="60" />
+		<meta property="video:tag" content="Exhilerating" />
+		<meta property="book:release_date" content="2016-12-15T06:19:30-05:00" />
+		<meta property="profile:first_name" content="John" />
+		<meta property="profile:last_name" content="Lennon" />
+  	</head>
+  	<body>
+   		<div id="fb-root"></div>
+   			<script>(function(d, s, id) {
+               var js, fjs = d.getElementsByTagName(s)[0];
+               if (d.getElementById(id)) return;
+                  js = d.createElement(s); js.id = id;
+                  js.src = "//connect.facebook.net/en_US/all.js#xfbml=1&appId=501839739845103";
+                  fjs.parentNode.insertBefore(js, fjs);
+                  }(document, 'script', 'facebook-jssdk'));</script>
+	</body>
+</html>
\ No newline at end of file
diff --git a/tests/samples/misc/expanded_OG_support_test.json b/tests/samples/misc/expanded_OG_support_test.json
new file mode 100644
index 00000000..03ae6c5d
--- /dev/null
+++ b/tests/samples/misc/expanded_OG_support_test.json
@@ -0,0 +1,100 @@
+[
+  {
+    "https://ogp.me/ns#url": [
+      {
+        "@value": "http://www.nytimes.com/2016/12/15/arts/music/from-steet-theater-to-wagner-on-the-opera-stage.html"
+      }
+    ], 
+    "http://ogp.me/ns/profile#first_name": [
+      {
+        "@value": "John"
+      }
+    ], 
+    "https://ogp.me/ns#type": [
+      {
+        "@value": "article"
+      }
+    ], 
+    "http://ogp.me/ns/article#section": [
+      {
+        "@value": "Music"
+      }
+    ], 
+    "http://ogp.me/ns/music#duration": [
+      {
+        "@value": "60"
+      }
+    ], 
+    "http://ogp.me/ns/article#modified": [
+      {
+        "@value": "2016-12-15T06:19:30-05:00"
+      }
+    ], 
+    "http://ogp.me/ns/video#tag": [
+      {
+        "@value": "Exhilerating"
+      }
+    ], 
+    "https://ogp.me/ns#site_name": [
+      {
+        "@value": "Event Education"
+      }
+    ], 
+    "http://ogp.me/ns/profile#last_name": [
+      {
+        "@value": "Lennon"
+      }
+    ], 
+    "https://www.facebook.com/2008/fbmladmins": [
+      {
+        "@value": "himanshu160"
+      }
+    ], 
+    "http://ogp.me/ns/article#section_url": [
+      {
+        "@value": "http://www.nytimes.com/section/arts"
+      }
+    ], 
+    "https://ogp.me/ns#title": [
+      {
+        "@value": "From Street Theater to Wagner on the Opera Stage"
+      }
+    ], 
+    "https://www.facebook.com/2008/fbmlapp_id": [
+      {
+        "@value": "9869919170"
+      }
+    ], 
+    "https://ogp.me/ns#image": [
+      {
+        "@value": "https://www.eventeducation.com/images/982336_wedding_dayandouan_th.jpg"
+      }
+    ], 
+    "http://ogp.me/ns/book#release_date": [
+      {
+        "@value": "2016-12-15T06:19:30-05:00"
+      }
+    ], 
+    "http://ogp.me/ns/article#section-taxonomy-id": [
+      {
+        "@value": "C5BFA7D5-359C-427B-90E6-6B7245A6CDD8"
+      }
+    ], 
+    "http://ogp.me/ns/article#published": [
+      {
+        "@value": "2016-12-15T05:55:55-05:00"
+      }
+    ], 
+    "https://ogp.me/ns#description": [
+      {
+        "@value": "which he set in Bangladesh instead of Norway. The production opens in Madrid on Saturday."
+      }
+    ], 
+    "@id": "http://www.example.com/index.html", 
+    "http://ogp.me/ns/article#top-level-section": [
+      {
+        "@value": "arts"
+      }
+    ]
+  }
+]
\ No newline at end of file
diff --git a/tests/test_rdfa.py b/tests/test_rdfa.py
index 98da33d2..cf3757a5 100644
--- a/tests/test_rdfa.py
+++ b/tests/test_rdfa.py
@@ -100,3 +100,14 @@ def test_wikipedia_xhtml_rdfa_no_prefix(self):
         data = rdfae.extract(body, base_url='http://nielslubberman.nl/drupal/')
 
         self.assertJsonLDEqual(data, expected)
+
+    def test_expanded_opengraph_support(self):
+        body = get_testdata('misc','expanded_OG_support_test.html')
+        expected = json.loads(
+                   get_testdata('misc','expanded_OG_support_test.json'
+                   ).decode('UTF-8'))
+
+        rdfae = RDFaExtractor()
+        data = rdfae.extract(body, base_url='http://www.example.com/index.html')
+
+        self.assertJsonLDEqual(data,expected)
\ No newline at end of file

From 58fd76b8695e685110eb75d29683a081309b13f6 Mon Sep 17 00:00:00 2001
From: Shivin Dass <shivin16091@iiitd.ac.in>
Date: Tue, 2 Jun 2020 16:29:42 +0530
Subject: [PATCH 2/3] Register Open Graph prefixes in the RDFa initial context

---
 extruct/rdfa.py | 51 ++++++++-----------------------------------------
 1 file changed, 8 insertions(+), 43 deletions(-)

diff --git a/extruct/rdfa.py b/extruct/rdfa.py
index 4b4b8e23..f71df1a3 100644
--- a/extruct/rdfa.py
+++ b/extruct/rdfa.py
@@ -23,28 +23,15 @@
 
 initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].ns.update({
     "twitter": "https://dev.twitter.com/cards#",
-    "fb": "http://ogp.me/ns/fb#"
+    "fb": "http://ogp.me/ns/fb#",
+    'og': 'http://ogp.me/ns#',
+	'music': 'http://ogp.me/ns/music#',
+	'video': 'http://ogp.me/ns/video#',
+	'article': 'http://ogp.me/ns/article#',
+	'book': 'http://ogp.me/ns/book#',
+	'profile': 'http://ogp.me/ns/profile#'
 })
 
-_OG_NAMESPACES = {
-  'og': 'http://ogp.me/ns#',
-  'music': 'http://ogp.me/ns/music#',
-  'video': 'http://ogp.me/ns/video#',
-  'article': 'http://ogp.me/ns/article#',
-  'book': 'http://ogp.me/ns/book#',
-  'profile': 'http://ogp.me/ns/profile#'
-}
-
-_OG_NAMESPACES_TAGS = {
-  'og': 'xmlns:og',
-  'music': 'xmlns:music',
-  'video': 'xmlns:video',
-  'article': 'xmlns:article',
-  'book': 'xmlns:book',
-  'profile': 'xmlns:profile'
-}
-
-
 class RDFaExtractor(object):
 
     def extract(self, htmlstring, base_url=None, encoding="UTF-8",
@@ -61,28 +48,6 @@ def extract_items(self, document, base_url=None, expanded=True):
                           vocab_cache_report=False,
                           refresh_vocab_cache=False,
                           check_lite=False)
-        document = self.expandedOGSupport(document)
         g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph())
         jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8')
-        return json.loads(jsonld_string)
-
-    def expandedOGSupport(self,document):
-      prefixDic = {}
-      for head in document.xpath('//head'):
-        for el in head.xpath('meta[@property and @content]'):
-          prop = el.attrib['property']
-          ns = prop.partition(':')[0]
-          if ns in _OG_NAMESPACES.keys():
-            prefixDic[_OG_NAMESPACES_TAGS[ns]] = _OG_NAMESPACES[ns]
-
-      html_element = None
-      for element in document.iter():
-        if element.tag == 'html':
-          html_element = element
-          break
-
-      if html_element is not None:
-        for k in prefixDic.keys():
-          if not (html_element.get(k)):
-            html_element.set(k,prefixDic[k])
-      return document
\ No newline at end of file
+        return json.loads(jsonld_string)
\ No newline at end of file

From d9f7c8e1f3b267c182ae116ee6faa53c3cddc8ef Mon Sep 17 00:00:00 2001
From: Shivin Dass <shivin16091@iiitd.ac.in>
Date: Wed, 3 Jun 2020 02:20:51 +0530
Subject: [PATCH 3/3] Use consistent quotes and indentation in namespace map

---
 extruct/rdfa.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/extruct/rdfa.py b/extruct/rdfa.py
index f71df1a3..edc53642 100644
--- a/extruct/rdfa.py
+++ b/extruct/rdfa.py
@@ -24,14 +24,15 @@
 initial_context["http://www.w3.org/2011/rdfa-context/rdfa-1.1"].ns.update({
     "twitter": "https://dev.twitter.com/cards#",
     "fb": "http://ogp.me/ns/fb#",
-    'og': 'http://ogp.me/ns#',
-	'music': 'http://ogp.me/ns/music#',
-	'video': 'http://ogp.me/ns/video#',
-	'article': 'http://ogp.me/ns/article#',
-	'book': 'http://ogp.me/ns/book#',
-	'profile': 'http://ogp.me/ns/profile#'
+    "og": "http://ogp.me/ns#",
+    "music": "http://ogp.me/ns/music#",
+    "video": "http://ogp.me/ns/video#",
+    "article": "http://ogp.me/ns/article#",
+    "book": "http://ogp.me/ns/book#",
+    "profile": "http://ogp.me/ns/profile#"
 })
 
+
 class RDFaExtractor(object):
 
     def extract(self, htmlstring, base_url=None, encoding="UTF-8",