Skip to content

Commit

Permalink
fix issue #154
Browse files Browse the repository at this point in the history
  • Loading branch information
andrew2net committed Oct 20, 2023
1 parent ac20439 commit 9423075
Show file tree
Hide file tree
Showing 41 changed files with 671 additions and 660 deletions.
101 changes: 56 additions & 45 deletions lib/relaton_iso/scrapper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -270,56 +270,59 @@ def stage_code(doc)
# @param doc [Nokogiri::HTML::Document]
# @return [Hash]
def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
# Builds an editorial group holding one technical-committee work group, taken
# from the committee link found on the page.
# NOTE(review): this span appears to interleave the pre- and post-change
# versions of the method (rendered diff without +/- markers): the lookup,
# the guard, the split and the WorkGroup construction each occur twice.
# Confirm against the repository before treating it as runnable code.

# Lookup via the "clearfix" container and its "span/a" link.
wg = doc.at("//div[@class='clearfix']")
wg_link = wg.at "span/a"
return unless wg_link
# Lookup via the link that follows a div whose text contains 'Technical Committe'.
wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
return unless wg

# The link text is split on "/"; only the second segment is inspected below.
workgroup = wg_link.text.split "/"
workgroup = wg.text.split "/"
# Committee type is the leading run of capitals in the second segment, defaulting to "TC".
type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
# {
# name: "International Organization for Standardization",
# abbreviation: "ISO",
# url: "www.iso.org",
# }
# Committee number is the first run of digits in the second segment.
# When the segment is missing the safe-navigation chain leaves this nil.
tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
# Committee name read from the "entry-title" span text.
tc_name = wg.at("span[@class='entry-title']").text
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg_link.text,
# Committee name read from the link's title attribute.
tc_name = wg[:title]
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
type: type, number: tc_numb)
RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
end

# rubocop:disable Metrics/MethodLength

# Fetch relations.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>]
def fetch_relations(doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity
def fetch_relations(doc)
# Collects relation hashes ({ type:, bibitem: }) from the life-cycle steps of
# the page, skipping steps whose resolved type is listed in `types`.
# NOTE(review): this span appears to interleave the pre- and post-change
# versions of the method (rendered diff without +/- markers): two `def`
# lines, an inline type/date computation, and a later delegation to
# relation_type / create_relations. Confirm against the repository.
types = ["Now", "Now under review"]
# Each step or sub-step is folded into the accumulator array `a`.
doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
# Inline variant: the heading text decides the relation type.
r_type = r.at("h4", "h5").text
date = []
type = case r_type.strip
when "Previously", "Will be replaced by" then "obsoletes"
when "Corrigenda / Amendments", "Revised by", "Now confirmed"
# The last stage date containing "-" is recorded as a "circulated" date.
on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
date << { type: "circulated", on: on.text } if on
"updates"
else r_type
end
if types.include?(type) then a
else
# One relation per link inside the step.
a + r.css("a").map do |id|
docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
docid: [docid], formattedref: fref, date: date,
)
{ type: type, bibitem: bibitem }
# Delegating variant: type and dates come from relation_type, relations from create_relations.
type, date = relation_type(r.at("h4", "h5").text.strip, doc)
next a if types.include?(type)

a + create_relations(r, type, date)
end
end

# Map a relation heading to a relation type and the dates attached to it.
#
# @param type [String] heading text of a relation step (e.g. "Revised by")
# @param doc [Nokogiri::HTML::Document] page the heading was taken from
# @return [Array] two elements: the relation type and an array of date hashes
def relation_type(type, doc)
  # Strip once so that matching and the fallback value agree; the fallback
  # is compared against exact strings such as "Now" by the caller.
  heading = type.strip
  case heading
  when "Previously", "Will be replaced by"
    ["obsoletes", []]
  when "Corrigenda / Amendments", "Revised by", "Now confirmed"
    # The last stage date containing "-" becomes the "circulated" date;
    # when the page has none, the date list stays empty.
    on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
    ["updates", on ? [{ type: "circulated", on: on.text }] : []]
  else
    # Unrecognised headings are passed through unchanged (apart from stripping).
    [heading, []]
  end
end

# Build one relation hash per link found inside a relation step.
#
# @param rel [Nokogiri::XML::Node] step element whose "a" links name the related documents
# @param type [String] relation type assigned to every produced relation
# @param date [Array<Hash>] dates passed to every produced bibliographic item
# @return [Array<Hash>] hashes with :type and :bibitem keys
def create_relations(rel, type, date)
  rel.css("a").map do |anchor|
    ref = anchor.text
    item = RelatonIsoBib::IsoBibliographicItem.new(
      docid: [RelatonBib::DocumentIdentifier.new(type: "ISO", id: ref, primary: true)],
      formattedref: RelatonBib::FormattedRef.new(content: ref, format: "text/plain"),
      date: date,
    )
    { type: type, bibitem: item }
  end
end
# rubocop:enable Metrics/MethodLength

# Fetch type.
# @param ref [String]
Expand All @@ -343,14 +346,20 @@ def fetch_type(ref)
# @param doc [Nokogiri::HTML::Document]
# @param lang [String]
# @return [Array<RelatonBib::TypedTitleString>]
def fetch_title(doc, lang)
content = doc.at(
"//nav[contains(@class,'heading-condensed')]/h2 | "\
"//nav[contains(@class,'heading-condensed')]/h3",
)&.text&.gsub(/\u2014/, "-")
return RelatonBib::TypedTitleStringCollection.new unless content

RelatonBib::TypedTitleString.from_string content, lang, script(lang)
def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
  # Builds the title collection from the h2/h3/h4 headings of the condensed
  # heading block, then appends a "main" title that joins all of them.
  title = RelatonBib::TypedTitleStringCollection.new
  head = doc.at "//nav[contains(@class,'heading-condensed')]"
  # Without the heading block there is nothing to parse; return the empty
  # collection instead of calling #xpath on nil.
  return title unless head

  types = { "h2" => "title-intro", "h3" => "title-main", "h4" => "title-part" }
  # Keyed by title type, so a repeated heading level keeps only its last text.
  parts = head.xpath("h2 | h3 | h4").each_with_object({}) do |node, acc|
    acc[types[node.name]] = node.text
  end
  parts.each do |type, content|
    title << RelatonBib::TypedTitleString.new(
      type: type, content: content, language: lang, script: script(lang),
    )
  end
  # The "main" title joins the parts in the order they were collected.
  main = title.map { |t| t.title.content }.join " - "
  title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
end

# Return ISO script code.
Expand All @@ -363,12 +372,11 @@ def script(lang)
end
end

# rubocop:disable Metrics/MethodLength
# Fetch dates
# @param doc [Nokogiri::HTML::Document]
# @param ref [String]
# @return [Array<Hash>]
def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity
def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
dates = []
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
Expand Down Expand Up @@ -400,25 +408,28 @@ def fetch_contributors(ref)
mem << { entity: publisher, role: [type: "publisher"] }
end
end
# rubocop:enable Metrics/MethodLength

# Fetch ICS.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>]
def fetch_ics(doc)
# Maps each ICS link on the page to a hash of its dot-separated code parts.
# NOTE(review): two alternative link selectors follow, apparently the pre-
# and post-change lines of a rendered diff without +/- markers. Confirm
# against the repository which one the real method uses.
doc.xpath("//dl[dt/strong[.='ICS']]/dd/span/a").map do |i|
doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
# The first run of digits and dots in the link text is split on ".";
# missing trailing parts end up as nil in the hash below.
code = i.text.match(/[\d.]+/).to_s.split "."
{ field: code[0], group: code[1], subgroup: code[2] }
end
end

#
# Fetch links.
# @param doc [Nokogiri::HTML::Document]
# @param url [String]
#
# @param doc [Nokogiri::HTML::Document] document to parse
# @param url [String] document url
#
# @return [Array<Hash>]
#
def fetch_link(doc, url)
links = [{ type: "src", content: url }]
obp = doc.at_css("a#obp-preview")
obp = doc.at("//h4[contains(@class, 'h5')]/a")
links << { type: "obp", content: obp[:href] } if obp
rss = doc.at("//a[contains(@href, 'rss')]")
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
Expand Down
2 changes: 1 addition & 1 deletion lib/relaton_iso/version.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# frozen_string_literal: true

# Top-level RelatonIso namespace; defines the VERSION constant.
module RelatonIso
# NOTE(review): two VERSION assignments appear here, apparently the pre- and
# post-bump lines of a rendered diff without +/- markers. Confirm that the
# real file keeps only one of them.
VERSION = "1.16.1"
VERSION = "1.16.2"
end
30 changes: 15 additions & 15 deletions spec/fixtures/hits.xml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<documents>
<bibitem id="ISO19115-2-2019" type="standard" schema-version="v1.2.4">
<fetched>2023-10-14</fetched>
<fetched>2023-10-19</fetched>
<title type="title-intro" format="text/plain" language="en" script="Latn">Geographic information</title>
<title type="title-main" format="text/plain" language="en" script="Latn">Metadata</title>
<title type="title-part" format="text/plain" language="en" script="Latn">Part 2: Extensions for acquisition and processing</title>
Expand Down Expand Up @@ -62,15 +62,15 @@
<place>Geneva</place>
</bibitem>
<bibitem id="ISO19115-2-2019/Amd1-2022" type="standard" schema-version="v1.2.4">
<fetched>2023-10-14</fetched>
<fetched>2023-10-19</fetched>
<title type="title-intro" format="text/plain" language="en" script="Latn">Geographic information</title>
<title type="title-main" format="text/plain" language="en" script="Latn">Metadata</title>
<title type="title-part" format="text/plain" language="en" script="Latn">Part 2: Extensions for acquisition and processing -- Amendment 1</title>
<title type="main" format="text/plain" language="en" script="Latn">Geographic information - Metadata - Part 2: Extensions for acquisition and processing -- Amendment 1</title>
<title type="title-part" format="text/plain" language="en" script="Latn">Part 2: Extensions for acquisition and processing</title>
<title type="main" format="text/plain" language="en" script="Latn">Geographic information - Metadata - Part 2: Extensions for acquisition and processing</title>
<title type="title-intro" format="text/plain" language="fr" script="Latn">Information géographique</title>
<title type="title-main" format="text/plain" language="fr" script="Latn">Métadonnées</title>
<title type="title-part" format="text/plain" language="fr" script="Latn">Partie 2: Extensions pour l'acquisition et le traitement -- Amendement 1</title>
<title type="main" format="text/plain" language="fr" script="Latn">Information géographique - Métadonnées - Partie 2: Extensions pour l'acquisition et le traitement -- Amendement 1</title>
<title type="title-part" format="text/plain" language="fr" script="Latn">Partie 2: Extensions pour l'acquisition et le traitement</title>
<title type="main" format="text/plain" language="fr" script="Latn">Information géographique - Métadonnées - Partie 2: Extensions pour l'acquisition et le traitement</title>
<uri type="src">https://www.iso.org/standard/78888.html</uri>
<uri type="obp">https://www.iso.org/obp/ui/en/#!iso:std:78888:en</uri>
<uri type="rss">https://www.iso.org/contents/data/standard/07/88/78888.detail.rss</uri>
Expand Down Expand Up @@ -111,7 +111,7 @@
<place>Geneva</place>
</bibitem>
<bibitem id="ISO19115-3-2023" type="standard" schema-version="v1.2.4">
<fetched>2023-10-14</fetched>
<fetched>2023-10-19</fetched>
<title type="title-intro" format="text/plain" language="en" script="Latn">Geographic information</title>
<title type="title-main" format="text/plain" language="en" script="Latn">Metadata</title>
<title type="title-part" format="text/plain" language="en" script="Latn">Part 3: XML schema implementation for fundamental concepts</title>
Expand Down Expand Up @@ -171,7 +171,7 @@ Le présent document décrit la procédure utilisée pour générer des schémas
<place>Geneva</place>
</bibitem>
<bibitem id="ISO/TS19157-2-2016" type="standard" schema-version="v1.2.4">
<fetched>2023-10-14</fetched>
<fetched>2023-10-19</fetched>
<title type="title-intro" format="text/plain" language="en" script="Latn">Geographic information </title>
<title type="title-main" format="text/plain" language="en" script="Latn">Data quality</title>
<title type="title-part" format="text/plain" language="en" script="Latn">Part 2: XML schema implementation</title>
Expand Down Expand Up @@ -219,7 +219,7 @@ Le présent document décrit la procédure utilisée pour générer des schémas
<place>Geneva</place>
</bibitem>
<bibitem id="ISO19115-2-2009" type="standard" schema-version="v1.2.4">
<fetched>2023-10-14</fetched>
<fetched>2023-10-19</fetched>
<title type="title-intro" format="text/plain" language="en" script="Latn">Geographic information</title>
<title type="title-main" format="text/plain" language="en" script="Latn">Metadata</title>
<title type="title-part" format="text/plain" language="en" script="Latn">Part 2: Extensions for imagery and gridded data</title>
Expand Down Expand Up @@ -275,7 +275,7 @@ Le présent document décrit la procédure utilisée pour générer des schémas
<place>Geneva</place>
</bibitem>
<bibitem id="ISO/TS19115-3-2016" type="standard" schema-version="v1.2.4">
<fetched>2023-10-14</fetched>
<fetched>2023-10-19</fetched>
<title type="title-intro" format="text/plain" language="en" script="Latn">Geographic information</title>
<title type="title-main" format="text/plain" language="en" script="Latn"> Metadata</title>
<title type="title-part" format="text/plain" language="en" script="Latn">Part 3: XML schema implementation for fundamental concepts</title>
Expand Down Expand Up @@ -341,15 +341,15 @@ This implementation model does not alter the semantics of the target conceptual
<place>Geneva</place>
</bibitem>
<bibitem id="ISO/TS19139-2-2012" type="standard" schema-version="v1.2.4">
<fetched>2023-10-14</fetched>
<fetched>2023-10-19</fetched>
<title type="title-intro" format="text/plain" language="en" script="Latn">Geographic information</title>
<title type="title-main" format="text/plain" language="en" script="Latn">Metadata</title>
<title type="title-part" format="text/plain" language="en" script="Latn">XML schema implementation -- Part 2: Extensions for imagery and gridded data</title>
<title type="main" format="text/plain" language="en" script="Latn">Geographic information - Metadata - XML schema implementation -- Part 2: Extensions for imagery and gridded data</title>
<title type="title-part" format="text/plain" language="en" script="Latn">XML schema implementation</title>
<title type="main" format="text/plain" language="en" script="Latn">Geographic information - Metadata - XML schema implementation</title>
<title type="title-intro" format="text/plain" language="fr" script="Latn">Information géographique</title>
<title type="title-main" format="text/plain" language="fr" script="Latn">Métadonnées</title>
<title type="title-part" format="text/plain" language="fr" script="Latn">Mise en oeuvre par des schémas XML -- Partie 2: Extension pour l'imagerie et les données maillées</title>
<title type="main" format="text/plain" language="fr" script="Latn">Information géographique - Métadonnées - Mise en oeuvre par des schémas XML -- Partie 2: Extension pour l'imagerie et les données maillées</title>
<title type="title-part" format="text/plain" language="fr" script="Latn">Mise en oeuvre par des schémas XML</title>
<title type="main" format="text/plain" language="fr" script="Latn">Information géographique - Métadonnées - Mise en oeuvre par des schémas XML</title>
<uri type="src">https://www.iso.org/standard/57104.html</uri>
<uri type="rss">https://www.iso.org/contents/data/standard/05/71/57104.detail.rss</uri>
<docidentifier type="ISO" primary="true">ISO/TS 19139-2:2012</docidentifier>
Expand Down
14 changes: 7 additions & 7 deletions spec/vcr_cassettes/git_hub_not_found.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 9423075

Please sign in to comment.