Skip to content

Commit

Permalink
Merge pull request #74 from scrapinghub/flatten-microformat-children
Browse files Browse the repository at this point in the history
Flatten microformat children
  • Loading branch information
kmike committed Apr 25, 2018
2 parents 9439fa4 + 3174cb3 commit 20738c0
Show file tree
Hide file tree
Showing 5 changed files with 242 additions and 86 deletions.
33 changes: 22 additions & 11 deletions extruct/uniform.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,21 @@ def _umicrodata_microformat(extracted, schema_context):
res.append(flatten_dict(obj, schema_context, True))
elif isinstance(extracted, dict):
res.append(flatten_dict(extracted, schema_context, False))

return res


def _flatten(element, schema_context):
    """Flatten one extracted value using ``flatten_dict``.

    * A dict is returned as ``flatten_dict(element, schema_context, False)``.
    * A list yields a new list in which every dict item has been flattened
      the same way; non-dict items are carried over unchanged.  Only the
      top level of the list is inspected.
    * Any other value is returned as-is.

    ``schema_context`` is forwarded untouched to ``flatten_dict``.
    """
    if isinstance(element, dict):
        return flatten_dict(element, schema_context, False)
    if isinstance(element, list):
        return [
            flatten_dict(o, schema_context, False)
            if isinstance(o, dict) else o
            for o in element
        ]
    # Scalars (strings, numbers, None, ...) need no flattening.
    return element


def flatten_dict(d, schema_context, add_context):
out = dict(d)
typ = out.pop('type', None)
Expand All @@ -36,21 +47,21 @@ def flatten_dict(d, schema_context, add_context):
else:
context, typ = infer_context(typ, schema_context)
out['@type'] = typ

if add_context:
out['@context'] = context

props = out.pop('properties', {})
for field, value in props.items():
if isinstance(value, dict):
value = flatten_dict(value, schema_context, False)
elif isinstance(value, list):
value = [
flatten_dict(o, schema_context, False)
if isinstance(o, dict) else o
for o in value
]
value = _flatten(value, schema_context)
out[field] = value

children = out.pop('children', [])
if children:
out['children'] = []
for child in children:
child = _flatten(child, schema_context)
out['children'].append(child)
return out


Expand All @@ -64,4 +75,4 @@ def infer_context(typ, context='http://schema.org'):
elif parsed_context.path:
context = base
typ = parsed_context.path.strip('/')
return context, typ
return context, typ
99 changes: 72 additions & 27 deletions tests/samples/misc/microformat_flat_test.json
Original file line number Diff line number Diff line change
@@ -1,33 +1,78 @@
[
{
"@type": ["h-entry"],
"@context": "http://microformats.org/wiki/",
{
"@type": [
"h-hidden-tablet",
"h-hidden-phone"
],
"@context": "http://microformats.org/wiki/",
"name": [
""
]
},
{
"@type": [
"h-hidden-phone"
],
"@context": "http://microformats.org/wiki/",
"name": [
""
],
"children": [
{
"@type": [
"h-hidden-tablet",
"h-hidden-phone"
],
"name": [
"Microformats are amazing"
""
]
},
{
"@type": [
"h-hidden-phone"
],
"author": [
{
"@type": ["h-card"],
"name": [
"W. Developer"
],
"url": [
"http://example.com"
],
"value": "W. Developer"
}
"name": [
"aJ Styles FastLane 2018 15 x 17 Framed Plaque w/ Ring Canvas"
],
"published": [
"2013-06-13 12:00:00"
"photo": [
"/on/demandware.static/-/Sites-main/default/dwa3227ee6/images/small/CN1148.jpg"
]
}
]
},
{
"@type": [
"h-entry"
],
"@context": "http://microformats.org/wiki/",
"name": [
"Microformats are amazing"
],
"author": [
{
"value": "W. Developer",
"@type": [
"h-card"
],
"summary": [
"In which I extoll the virtues of using microformats."
"name": [
"W. Developer"
],
"content": [
{
"html": "\n<p>Blah blah blah</p>\n",
"value": "\nBlah blah blah\n"
}
]
}
]
"url": [
"http://example.com"
]
}
],
"published": [
"2013-06-13 12:00:00"
],
"summary": [
"In which I extoll the virtues of using microformats."
],
"content": [
{
"html": "\n<p>Blah blah blah</p>\n",
"value": "\nBlah blah blah\n"
}
]
}
]
11 changes: 11 additions & 0 deletions tests/samples/misc/microformat_test.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@
<link rel="stylesheet" type="text/css" href="event-education.css" />
<meta name="verify-v1" content="so4y/3aLT7/7bUUB9f6iVXN0tv8upRwaccek7JKB1gs=" >
<meta property="og:title" content="Himanshu's Open Graph Protocol"/>
<div class="h-hidden-tablet h-hidden-phone"></div>
<div class="js-sticky-order b-header-pdp_sticky h-hidden-phone">
<div class="g-grid-container">
<div class="g-grid-row">
<div class="g-grid-col-1 h-hidden-tablet h-hidden-phone"></div>
<div class="g-grid-col-1 g-grid-col-2-tablet h-hidden-phone">
<img src="/on/demandware.static/-/Sites-main/default/dwa3227ee6/images/small/CN1148.jpg" alt="aJ Styles FastLane 2018 15 x 17 Framed Plaque w/ Ring Canvas" />
</div>
</div>
</div>
</div>
<article class="h-entry">
<h1 class="p-name">Microformats are amazing</h1>
<p>Published by <a class="p-author h-card" href="http://example.com">W. Developer</a>
Expand Down
109 changes: 78 additions & 31 deletions tests/samples/misc/microformat_test.json
Original file line number Diff line number Diff line change
@@ -1,40 +1,87 @@
[
{
{
"type": [
"h-hidden-tablet",
"h-hidden-phone"
],
"properties": {
"name": [
""
]
}
},
{
"type": [
"h-hidden-phone"
],
"properties": {
"name": [
""
]
},
"children": [
{
"type": [
"h-hidden-tablet",
"h-hidden-phone"
],
"properties": {
"name": [
""
]
}
},
{
"type": [
"h-entry"
"h-hidden-phone"
],
"properties": {
"name": [
"aJ Styles FastLane 2018 15 x 17 Framed Plaque w/ Ring Canvas"
],
"photo": [
"/on/demandware.static/-/Sites-main/default/dwa3227ee6/images/small/CN1148.jpg"
]
}
}
]
},
{
"type": [
"h-entry"
],
"properties": {
"name": [
"Microformats are amazing"
],
"author": [
{
"type": [
"h-card"
],
"properties": {
"name": [
"Microformats are amazing"
],
"author": [
{
"type": [
"h-card"
],
"properties": {
"name": [
"W. Developer"
],
"url": [
"http://example.com"
]
},
"value": "W. Developer"
}
],
"published": [
"2013-06-13 12:00:00"
"W. Developer"
],
"summary": [
"In which I extoll the virtues of using microformats."
],
"content": [
{
"html": "\n<p>Blah blah blah</p>\n",
"value": "\nBlah blah blah\n"
}
"url": [
"http://example.com"
]
},
"value": "W. Developer"
}
],
"published": [
"2013-06-13 12:00:00"
],
"summary": [
"In which I extoll the virtues of using microformats."
],
"content": [
{
"html": "\n<p>Blah blah blah</p>\n",
"value": "\nBlah blah blah\n"
}
]
}
]
}
]

0 comments on commit 20738c0

Please sign in to comment.