This notebook processes the Pubs Warehouse links for publications that are already in the GeoKB. More work is planned here to make better sense of the content on the other side of those links.

In [62]:
import requests
import pandas as pd
from wbmaker import WikibaseConnection
from urllib.parse import urlparse
from joblib import Parallel, delayed
from tqdm import tqdm

In [6]:
# Connection object used throughout the notebook (geokb.wbi, geokb.datatypes,
# geokb.models, geokb.prop_lookup, geokb.action_if_exists).
# NOTE(review): 'GEOKB_CLOUD' presumably names a stored connection/config entry
# that wbmaker resolves -- confirm against the wbmaker documentation.
geokb = WikibaseConnection('GEOKB_CLOUD')

In [8]:
# Named SPARQL queries used by this notebook; look one up by key and pass the
# text to sparql_query().
sparql_queries = {
    # Every item carrying a P114 value (bound to ?indexId), with its label and,
    # when a P7 value is present, the year extracted from it.
    'pw_index_id': """
PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>

SELECT ?item ?itemLabel ?indexId (YEAR(?pub_year) AS ?pubyear)
WHERE {
  ?item wdt:P114 ?indexId .
  OPTIONAL {
    ?item wdt:P7 ?pub_year .
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
""",
    # Every item carrying a P74 value (bound to ?doi).
    'pw_doi': """
PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>

SELECT ?item ?doi
WHERE {
  ?item wdt:P74 ?doi .
}
""",
}

def wb_results_to_df(data):
    """Convert a SPARQL JSON results document into a pandas DataFrame.

    Parameters
    ----------
    data : dict
        Parsed SPARQL JSON response. Reads ``data['head']['vars']`` for the
        column names and ``data['results']['bindings']`` for the rows.

    Returns
    -------
    pandas.DataFrame
        One row per binding and one column per variable in ``head.vars``.
        Each cell holds the binding's ``'value'`` string, or None when the
        variable is unbound for that row (e.g. an OPTIONAL that did not match).
    """
    columns = data['head']['vars']
    rows = [
        {col: binding[col]['value'] if col in binding else None for col in columns}
        for binding in data['results']['bindings']
    ]
    # Pass the columns explicitly so a query with zero results still yields a
    # frame with the expected (empty) columns instead of a column-less frame.
    return pd.DataFrame(rows, columns=columns)

def sparql_query(query, endpoint="https://geokb.wikibase.cloud/query/sparql", timeout=60):
    """Run a SPARQL query over HTTP GET and return the results as a DataFrame.

    Parameters
    ----------
    query : str
        SPARQL query text.
    endpoint : str
        URL of the SPARQL endpoint to send the query to.
    timeout : float
        Seconds to wait for the endpoint before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The result bindings as converted by ``wb_results_to_df``.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    params = {
        "query": query,
        "format": "json"
    }
    response = requests.get(endpoint, params=params, timeout=timeout)
    # Fail loudly here: silently returning None only moves the failure to the
    # first line that tries to index the (missing) DataFrame.
    response.raise_for_status()
    return wb_results_to_df(response.json())

In [41]:
# Fetch every item that has an index id, derive the QID from the last path
# segment of the item URI, and drop the raw URI column.
pw_index_results = sparql_query(sparql_queries['pw_index_id'])
geokb_pw_ids = (
    pw_index_results
    .assign(qid=pw_index_results['item'].map(lambda uri: uri.split('/')[-1]))
    .drop(columns="item")
)

In [2]:
# Load the locally cached links table from a path relative to the notebook.
# Later cells read its 'indexId', 'link_type' and 'link_url' columns.
pw_links = pd.read_parquet('./data/pw_cache/pw_links.parquet')

In [11]:
# Keep only the links whose indexId matches an item returned by the GeoKB query.
in_geokb = pw_links['indexId'].isin(geokb_pw_ids['indexId'])
geokb_pw_links = pw_links[in_geokb]

In [13]:
# List the distinct link types present, to decide which ones to process below.
geokb_pw_links['link_type'].unique()

array(['Thumbnail', 'Document', 'Index Page', 'Cover', 'Plate',
       'Illustration', 'NGMDB Index Page', 'Project Site',
       'Additional Report Piece', 'Spatial Data', 'Table',
       'Application Site', 'Data Release', 'Digital Object Identifier',
       'Related Work', 'Metadata', 'Figure', 'Raw Data',
       'Companion Files', 'Version History', 'Appendix', 'Database',
       'Read Me', 'Sheet', 'Dataset', 'Image Folder', 'Publication XML',
       'HTML Document', 'Software Release', 'Errata', 'Abstract', 'Image',
       'Chapter', 'Referenced Work', 'Authors Website'], dtype=object)

In [49]:
# File extensions (matched exactly as written in the URL, hence both cases)
# that are treated as readable document content.
readable_link_ext = ['pdf','PDF','htm','HTM','html','HTML','txt','TXT']


def _url_file_ext(path):
    """Return the text after the last '.' in the final segment of a URL path, or None.

    Only the final segment is inspected so that a dotted directory name
    (e.g. '/v1.0/report') is not mistaken for a file extension.
    """
    filename = path.rsplit('/', 1)[-1]
    return filename.rsplit('.', 1)[-1] if '.' in filename else None


# 'Document' links only; reset_index gives a fresh frame with a clean 0..n-1 index.
document_links = geokb_pw_links[geokb_pw_links['link_type'] == 'Document'].reset_index(drop=True)
document_links['link_url'] = document_links['link_url'].str.strip()

# Parse each URL once and derive both columns from the same parse result.
parsed_urls = [urlparse(link) for link in document_links['link_url']]
document_links['domain'] = [parts.netloc for parts in parsed_urls]
document_links['file_ext'] = [_url_file_ext(parts.path) for parts in parsed_urls]

In [58]:
def url_status_code(url, timeout=30):
    """Issue an HTTP HEAD request for ``url`` and report the status code.

    Parameters
    ----------
    url : str
        The link to check.
    timeout : float
        Seconds to wait for the server before treating the link as unreachable.

    Returns
    -------
    dict
        ``{'link_url': url, 'status_code': <int or None>}``. ``status_code``
        is None when no HTTP response was obtained (connection error, timeout,
        malformed URL, ...).
    """
    try:
        status_code = requests.head(url, timeout=timeout).status_code
    except requests.RequestException:
        # This runs over many URLs in a parallel batch; one dead or hanging
        # host must not abort the whole run, so record the failure as None.
        status_code = None
    return {
        'link_url': url,
        'status_code': status_code
    }

# Document links whose extension is one we can read.
usable_doc_links = document_links[document_links['file_ext'].isin(readable_link_ext)]
# De-duplicate while keeping first-seen order: list(set(...)) yields a different
# ordering of strings from one kernel session to the next, which makes the
# link-check run and its results non-reproducible.
unique_doc_url = usable_doc_links['link_url'].drop_duplicates().tolist()

In [57]:
# Check every unique document URL using 8 worker threads; tqdm reports progress
# as the URLs are handed out to the workers.
status_jobs = (delayed(url_status_code)(doc_url) for doc_url in tqdm(unique_doc_url))
link_checks = Parallel(n_jobs=8, prefer="threads")(status_jobs)

['https://pubs.usgs.gov/gf/096/text.pdf',
 'https://pubs.usgs.gov/wdr/1996/ca-96/WRD-1996-vol2.pdf',
 'https://pubs.usgs.gov/fs/2002/0057/report.pdf',
 'https://pubs.usgs.gov/of/1979/1173/report.pdf',
 'https://pubs.usgs.gov/sim/3256/downloads/pdf/SIM3256_pamphlet.pdf',
 'https://pubs.usgs.gov/wri/2002/4267/report.pdf',
 'https://pubs.usgs.gov/fs/0148-96/report.pdf',
 'https://pubs.usgs.gov/of/1991/0276/report.pdf',
 'https://pubs.usgs.gov/of/1989/0487/report.pdf',
 'https://pubs.usgs.gov/of/1980/1229/report.pdf',
 'https://pubs.usgs.gov/of/1984/0327/report.pdf',
 'https://pubs.usgs.gov/of/1986/0381/report.pdf',
 'https://pubs.usgs.gov/wri/1986/4360/report.pdf',
 'https://pubs.usgs.gov/of/2020/1102/ofr20201102.pdf',
 'https://pubs.usgs.gov/of/1977/0750/report.pdf',
 'https://pubs.usgs.gov/of/1996/0231/report.pdf',
 'https://pubs.usgs.gov/wri/1993/4203/report.pdf',
 'https://pubs.usgs.gov/of/1991/0039/report.pdf',
 'https://pubs.usgs.gov/circ/1994/1104/report.pdf',
 'https://pubs.usgs.g

In [None]:
# Display the list of unique document URLs.
# NOTE(review): this dumps the entire list into the cell output; consider
# showing a slice (e.g. the first few entries) and the length instead.
unique_doc_url

In [46]:
# Collapse the readable document links to one row per indexId (URLs gathered
# into a list), then attach the matching QID with an inner join on indexId.
readable_doc_links = document_links[document_links['file_ext'].isin(readable_link_ext)]
links_per_pub = readable_doc_links.groupby('indexId', as_index=False)['link_url'].agg(list)
content_links_to_geokb = links_per_pub.merge(
    geokb_pw_ids[['qid','indexId']],
    how="inner",
    on="indexId"
)
content_links_to_geokb.head()

Unnamed: 0,indexId,link_url,qid
0,1008336,[http://www.fs.fed.us/fire/fmt/fmt_pdfs/FMT64-...,Q148524
1,1016376,[http://fresc.usgs.gov/products/papers/1389_Be...,Q144195
2,2000059,[http://www.glfc.org/pubs/TechReports/Tr58.pdf],Q151819
3,2000064,[http://nepis.epa.gov/Adobe/PDF/200077AN.PDF],Q150014
4,2000077,[http://www.fs.fed.us/psw/publications/documen...,Q152407


In [47]:
# Reference attached to every claim built below: the "data source" property
# pointing at item Q54915.
data_source_ref = geokb.datatypes.Item(
    prop_nr=geokb.prop_lookup["data source"],
    value="Q54915"
)
refs = geokb.models.References()
refs.add(data_source_ref)

# Trial run over the first two rows only; the item JSON is displayed for
# inspection and nothing is written back.
for row_idx, row in content_links_to_geokb.head(2).iterrows():
    item = geokb.wbi.item.get(row['qid'])

    # One P136 URL claim per link, each carrying the shared reference.
    content_link_claims = [
        geokb.datatypes.URL(
            prop_nr="P136",
            value=link,
            references=refs
        )
        for link in row['link_url']
    ]
    item.claims.add(
        content_link_claims,
        action_if_exists=geokb.action_if_exists.REPLACE_ALL
    )

    display(item.get_json())

    # Write step left disabled; uncomment to commit the claims.
    # response = item.write(
    #     summary="Added links to content for publication"
    # )
    # print(response.id)

{'labels': {'en': {'language': 'en',
   'value': "American Indian influence on fire regimes in Calfornia's coastal ranges"}},
 'descriptions': {'en': {'language': 'en',
   'value': 'a Article (Journal Article) published by U.S. Department of Agriculture, Forest Service as part of series - Fire Management Today'}},
 'aliases': {},
 'type': 'item',
 'claims': {'P1': [{'mainsnak': {'snaktype': 'value',
     'property': 'P1',
     'datatype': 'wikibase-item',
     'datavalue': {'value': {'entity-type': 'item',
       'numeric-id': 7,
       'id': 'Q7'},
      'type': 'wikibase-entityid'}},
    'type': 'statement',
    'id': 'Q148524$53B4975B-A8B6-4854-A38A-67698FA5656D',
    'rank': 'normal',
    'references': [{'snaks': {'P27': [{'snaktype': 'value',
         'property': 'P27',
         'datatype': 'wikibase-item',
         'datavalue': {'value': {'entity-type': 'item',
           'numeric-id': 54915,
           'id': 'Q54915'},
          'type': 'wikibase-entityid'}}]},
      'snaks-orde

{'labels': {'en': {'language': 'en',
   'value': 'Monitoring temporal change in riparian vegetation of Great Basin National Park'}},
 'descriptions': {'en': {'language': 'en',
   'value': 'a Article (Journal Article) published by Monte L. Bean Life Science Museum, Brigham Young University as part of series - Western North American Naturalist'}},
 'aliases': {},
 'type': 'item',
 'claims': {'P1': [{'mainsnak': {'snaktype': 'value',
     'property': 'P1',
     'datatype': 'wikibase-item',
     'datavalue': {'value': {'entity-type': 'item',
       'numeric-id': 7,
       'id': 'Q7'},
      'type': 'wikibase-entityid'}},
    'type': 'statement',
    'id': 'Q144195$9ABC8097-9297-4D48-BEEB-C8E3C58E24B1',
    'rank': 'normal',
    'references': [{'snaks': {'P27': [{'snaktype': 'value',
         'property': 'P27',
         'datatype': 'wikibase-item',
         'datavalue': {'value': {'entity-type': 'item',
           'numeric-id': 54915,
           'id': 'Q54915'},
          'type': 'wikibase-