We currently pull USGS Profiles every day through a web scraping process that runs as a serverless processing pipeline on CHS. The resulting cache can be processed to introduce a number of new entities and relationships into our model. This notebook handles the process of pulling that cache and generating graphable tables from it.

In [5]:
import requests
import json
import pandas as pd
import xmltodict
import os
import pickle
from copy import copy
import re
import validators
import datetime
import string
import click
from pylinkedcmd import utilities

import isaid_helpers

In [2]:
%%time
# Rebuild the local pickle of raw USGS Profiles from the CHS cache.
# The prompt guards against accidentally overwriting the existing local file.
if click.confirm('Do you really want to proceed with rebuilding the local USGS Profiles cache from source?', default=True):
    profile_cache = isaid_helpers.cache_chs_cache("usgs_profiles")

    # Use a context manager so the handle is flushed and closed before the
    # file's modification time is read for the status message below.
    with open(isaid_helpers.f_raw_profiles, "wb") as raw_profiles_out:
        pickle.dump(profile_cache, raw_profiles_out)

    print(
        isaid_helpers.f_raw_profiles,
        "CREATED",
        datetime.datetime.fromtimestamp(os.path.getmtime(isaid_helpers.f_raw_profiles))
    )

Do you really want to proceed with rebuilding the local USGS Profiles cache from source? [Y/n]: Y
data/process_usgs_profiles.p CREATED 2021-06-07 21:48:47.109775
CPU times: user 473 ms, sys: 143 ms, total: 616 ms
Wall time: 14 s


In [6]:
# USGS Profiles
def profile_entities(
    raw_profiles_file=None,
    emails_to_filter=None,
    return_raw_summary=False,
    return_format="list"
):
    """Load the pickled raw profiles and reduce them to summary records.

    Args:
        raw_profiles_file: Path to the pickle file of raw profile records.
            When None (the default), ``isaid_helpers.f_raw_profiles`` is used;
            it is looked up at call time rather than at definition time.
        emails_to_filter: Optional container of email values. When given, only
            records that have an "email" key whose value is in this container
            are kept. When None, every record that has an "email" or an
            "orcid" key is kept.
        return_raw_summary: When True, return the wider profile summary
            (including "title", "expertise" and "body_content_links") instead
            of the narrower entity summary.
        return_format: "list" for a list of dicts, "dataframe" for a
            ``pandas.DataFrame`` built from that list.

    Returns:
        A list of dicts or a DataFrame, depending on ``return_format``. Each
        record contains only those selected properties that were present on
        the raw record; missing keys are not filled in.

    Raises:
        ValueError: If ``return_format`` is not "list" or "dataframe", or if
            the raw profiles file does not exist.
    """
    # Fail fast instead of silently returning None for an unknown format.
    if return_format not in ("list", "dataframe"):
        raise ValueError(
            "return_format must be 'list' or 'dataframe', got {!r}".format(return_format)
        )

    if raw_profiles_file is None:
        raw_profiles_file = isaid_helpers.f_raw_profiles

    if not os.path.exists(raw_profiles_file):
        raise ValueError("The raw profiles file needs to exist. Run that process first.")

    # NOTE(review): unpickling can execute arbitrary code; this assumes the
    # file is the locally generated cache and not untrusted input.
    with open(raw_profiles_file, "rb") as raw_file:
        raw_profiles = pickle.load(raw_file)

    profile_summary_props = [
        "profile",
        "_date_cached",
        "profile_image_url",
        "email",
        "orcid",
        "description",
        "title",
        "expertise",
        "body_content_links"
    ]

    entity_summary_props = [
        "email",
        "orcid",
        "profile",
        "_date_cached",
        "profile_image_url",
        "description"
    ]

    if emails_to_filter is not None:
        selected = [
            record for record in raw_profiles
            if "email" in record and record["email"] in emails_to_filter
        ]
    else:
        # Without a filter, any record carrying at least one identifier is usable.
        selected = [
            record for record in raw_profiles
            if "email" in record or "orcid" in record
        ]

    usable_profiles = [
        {k: v for k, v in record.items() if k in profile_summary_props}
        for record in selected
    ]

    if return_raw_summary:
        records = usable_profiles
    else:
        records = [
            {k: v for k, v in record.items() if k in entity_summary_props}
            for record in usable_profiles
        ]

    if return_format == "dataframe":
        return pd.DataFrame(records)
    return records

def profile_relationships(
    rel_type="expertise",
    emails_to_filter=None,
    return_format="list"
):
    """Build relationship records linking profiles to expertise terms or links.

    Args:
        rel_type: "expertise" to emit one record per entry in a profile's
            "expertise" value, or "creativework" to emit one record per entry
            in a profile's "body_content_links" value.
        emails_to_filter: Passed through to ``profile_entities`` to restrict
            which profiles are considered.
        return_format: "list" for a list of dicts, "dataframe" for a
            ``pandas.DataFrame`` built from that list.

    Returns:
        A list of dicts or a DataFrame, depending on ``return_format``.
        "email" and "orcid" are None when the profile record lacks that key.

    Raises:
        ValueError: If ``rel_type`` or ``return_format`` is not one of the
            supported values.
    """
    # Reject unknown options up front; previously an unknown rel_type produced
    # an empty result and an unknown return_format produced None.
    if rel_type not in ("expertise", "creativework"):
        raise ValueError(
            "rel_type must be 'expertise' or 'creativework', got {!r}".format(rel_type)
        )
    if return_format not in ("list", "dataframe"):
        raise ValueError(
            "return_format must be 'list' or 'dataframe', got {!r}".format(return_format)
        )

    profiles = profile_entities(
        emails_to_filter=emails_to_filter,
        return_raw_summary=True,
        return_format="list"
    )

    profile_rels = list()

    if rel_type == "expertise":
        for profile in profiles:
            terms = profile.get("expertise")
            if not terms:
                continue
            for term in terms:
                profile_rels.append({
                    # profile_entities only guarantees one of email/orcid is
                    # present on a record, so neither can be indexed directly.
                    "email": profile.get("email"),
                    "orcid": profile.get("orcid"),
                    "expertise_term": term,
                    "date_qualifier": profile["_date_cached"],
                    "reference": profile["profile"]
                })
    else:
        for profile in profiles:
            links = profile.get("body_content_links")
            if not links:
                continue
            for link in links:
                profile_rels.append({
                    "email": profile.get("email"),
                    "orcid": profile.get("orcid"),
                    "url": link["link_href"],
                    "doi": utilities.doi_from_string(link["link_href"]),
                    "title": link["link_text"],
                    "date_qualifier": profile["_date_cached"],
                    "reference": profile["profile"]
                })

    if return_format == "dataframe":
        return pd.DataFrame(profile_rels)
    return profile_rels


In [7]:
%%time
# Resolve the active-email filter once and reuse it for all three exports.
# NOTE(review): assumes active_usgs_emails() returns the same reusable
# container on every call — confirm against isaid_helpers.
active_emails = isaid_helpers.active_usgs_emails()

# (output path, table builder, extra keyword arguments for the builder)
graphable_exports = [
    (isaid_helpers.f_graphable_profiles, profile_entities, {}),
    (
        isaid_helpers.f_graphable_profile_creative_works,
        profile_relationships,
        {"rel_type": "creativework"},
    ),
    (
        isaid_helpers.f_graphable_profile_expertise,
        profile_relationships,
        {"rel_type": "expertise"},
    ),
]

for output_path, build_table, extra_kwargs in graphable_exports:
    build_table(
        emails_to_filter=active_emails,
        return_format="dataframe",
        **extra_kwargs
    ).to_csv(output_path, index=False)
    # Report the file's modification time as confirmation that it was written.
    print(
        output_path,
        "CREATED",
        datetime.datetime.fromtimestamp(os.path.getmtime(output_path))
    )


data/graphable_table_profile_entities.csv CREATED 2021-06-07 21:50:54.858115
data/graphable_table_profile_creative_works.csv CREATED 2021-06-07 21:50:56.344036
data/graphable_table_profile_expertise.csv CREATED 2021-06-07 21:50:57.725638
CPU times: user 4.01 s, sys: 129 ms, total: 4.14 s
Wall time: 4.36 s
