In [None]:
import bibliograph as bg
import pandas as pd

# CSV alias files, keyed by 'actor' and 'work'.
aliases_dict = dict(
    actor='bibliograph/test_data/aliases_actor.csv',
    work='bibliograph/test_data/aliases_work.csv',
)

# Keyword settings for the shorthand parser, gathered in one place so they
# can be read (and tweaked) separately from the input file paths.
shorthand_options = dict(
    syntax_case_sensitive=False,
    aliases_dict=aliases_dict,
    aliases_case_sensitive=False,
    item_separator='__',
    space_char='|',
    na_string_values='!',
    na_node_type='missing',
    default_entry_prefix='wrk',
    skiprows=2,
    comment_char='#',
)

# Positional arguments: shorthand input, entry syntax, link syntax.
tn = bg.slurp_shorthand(
    'bibliograph/test_data/shorthand_with_aliases.shnd',
    "bibliograph/resources/default_entry_syntax.csv",
    "bibliograph/resources/default_link_syntax.csv",
    **shorthand_options,
)

# Cell output: only the resolved assertions whose link type is "alias".
tn.resolve_assertions().query('link_type == "alias"')


In [None]:
import bibliograph as bg

# Input files: the BibTeX test data and the entry syntax used to parse it.
bibtex_fname = 'bibliograph/test_data/bibtex_test_data_short.bib'
entry_syntax_fname = "bibliograph/resources/default_bibtex_syntax.csv"

# Keyword settings for the BibTeX parser (no alias files for this run).
bibtex_options = dict(
    syntax_case_sensitive=True,
    allow_redundant_items=False,
    aliases_dict=None,
    aliases_case_sensitive=True,
    space_char='|',
    na_string_values='!',
    na_node_type='missing',
)

tn = bg.slurp_bibtex(bibtex_fname, entry_syntax_fname, **bibtex_options)

# Cell output: only the resolved assertions whose link type is "alias".
tn.resolve_assertions().query('link_type == "alias"')

### Aggregate works when building nodes.

A work is an entry node type, defined in the entry syntax. 
  - A work has an item of node type "identifier", linked to the parent entry by a link of type "doi"
  - A work has items of node type "work", linked to the parent entry by links of types "volume", "page", and "supertitle"/"title"
  - A work has an item of node type "date", linked to the parent entry by a link of type "published"

IF two assertions exist between different strings of node type "work" and the same string of node type "identifier",
  - THEN the work strings should map to the same node ID

IF two assertions of link type "doi" exist between different strings of node type "work" and different strings of node type "identifier", 
  - AND the identifier strings are identical after stripping any leading substring that ends in one of 'doi:', 'doi.org/', or 'doi/'
  - THEN the work strings should map to the same node ID
  - __to make this work, you build a set of aliases for the identifier nodes first and then check for strings in common__

IF three assertions of link type ("title" or "supertitle"), "volume", and "page" exist between different strings of node type "work" and the same strings,
  - THEN the work strings should map to the same node ID

IF three assertions of link type "published", "volume", and "page" exist between different strings of node type "work" and the same strings,
  - THEN the work strings should PROBABLY map to the same node ID

IF two strings of the same node type are identical after applying a specified transformation, THEN the strings should map to the same node ID
  - __to make this work, you build a set of aliases using the transformation first and then build nodes__

In [None]:
import pandas as pd

def apply_alias_generator(string_series, func):
    """Pair every string with the alias that `func` produces for it.

    Parameters
    ----------
    string_series : pandas.Series
        The strings to generate aliases for.
    func : callable
        Called once per element via `Series.map`; its return value is the
        alias for that element.

    Returns
    -------
    pandas.DataFrame
        Columns 'string' and 'alias', indexed like `string_series`. Rows
        in which either value is missing are dropped.
    """
    generated = string_series.map(func)
    table = pd.DataFrame({'string': string_series, 'alias': generated})

    # Strings with no alias (or missing strings) carry no information.
    return table.dropna()

def western_surname_alias_generator_serial(
    name,
    drop_nouns=['ms', 'mr', 'dr'],
    generationals=['jr', 'sr'],
    partial_surnames=['st', 'de', 'le', 'van', 'von']
):
    """Build a compact alias (surname letters + given-name initials).

    The name is casefolded and split on commas; the first field is taken
    as the surname and the second as the given names, e.g.
    'Loon, H. van' -> 'vanloonh' and 'Martin Luther King, jr.' -> 'kingmlj'.

    Parameters
    ----------
    name : str
        A single name. Anything that is not a string containing a comma
        yields `pd.NA`.
    drop_nouns : list of str
        Honorifics (lower case, without a trailing period) that are removed
        when they appear as the first or last whole word of a field. The
        dotted form ('dr.') is recognised as well.
    generationals : list of str
        Generational suffixes (lower case, no trailing period). When the
        second field is exactly one of these, the first field is read as
        'Given Names Surname' and the suffix is appended to the given
        names.
    partial_surnames : list of str
        Surname particles (lower case, no trailing period). A particle that
        ends the given-names field is moved to the front of the surname.

    Returns
    -------
    str or pd.NA
        The alias, or `pd.NA` when the input cannot be interpreted: no
        comma, not a string, an honorific/generational in the second field
        followed by further fields, or only a single word before such a
        field.
    """

    # Only comma-delimited strings are treated as 'Surname, Given' names.
    if not isinstance(name, str) or ',' not in name:
        return pd.NA

    def with_dotted(tokens):
        # Accept every token both bare ('dr') and abbreviated ('dr.').
        tokens = list(tokens)
        return tokens + [token + '.' for token in tokens]

    honorifics = with_dotted(drop_nouns)
    generation_marks = with_dotted(generationals)
    particles = with_dotted(partial_surnames)

    def strip_honorific(part):
        # Remove an honorific only when it is a whole leading/trailing word,
        # so surnames such as 'adams' or 'drummond' are left intact.
        words = part.split(' ')
        if words and words[0] in honorifics:
            words = words[1:]
        if words and words[-1] in honorifics:
            words = words[:-1]
        return ' '.join(words).strip()

    parts = [field.strip() for field in name.casefold().split(',')]

    # 'Joanne Gerould, Ms.': the honorific sits where the given names
    # belong, so the real name is the first field in natural order.
    if parts[1] in honorifics:
        if len(parts) != 2:
            return pd.NA
        pieces = parts[0].rsplit(' ', maxsplit=1)
        if len(pieces) != 2:
            # A single word cannot be split into given names and surname.
            return pd.NA
        parts = [pieces[1], pieces[0]]

    # 'Martin Luther King, jr.': same idea, but the suffix is kept with the
    # given names so that it contributes an initial.
    if parts[1] in generation_marks:
        if len(parts) != 2:
            return pd.NA
        suffix = parts[1]
        pieces = parts[0].rsplit(' ', maxsplit=1)
        if len(pieces) != 2:
            return pd.NA
        parts = [pieces[1], pieces[0] + ' ' + suffix]

    surname = strip_honorific(parts[0])
    given = strip_honorific(parts[1])

    # 'Loon, H. van' -> surname 'van loon', given names 'h. '
    for particle in particles:
        if given.endswith(' ' + particle):
            surname = particle + ' ' + surname
            given = given[:-len(particle)]

    surname_letters = ''.join(c for c in surname if c.isalpha())

    # One initial per space- or hyphen-separated piece of the given names.
    initials = [
        piece.strip()[0]
        for word in given.split(' ')
        for piece in word.split('-')
        if piece.strip()
    ]

    return surname_letters + ''.join(initials)

def western_surname_alias_generator_vector(
    name_series,
    drop_nouns=['ms', 'mrs', 'mr', 'dr', 'sir', 'dame'],
    generationals=['jr', 'sr'],
    partial_surnames=['st', 'de', 'le', 'van', 'von']
):
    """Vectorized alias builder: surname letters + given-name initials.

    Each name containing a comma is casefolded and split on commas; the
    first field is taken as the surname and the second as the given names,
    e.g. 'Loon, H. van' -> 'vanloonh', 'Martin Luther King, jr.' ->
    'kingmlj'.

    Parameters
    ----------
    name_series : pandas.Series
        Names to process. Entries without a comma (or missing entries) get
        a missing alias.
    drop_nouns : list of str
        Honorifics (lower case, no trailing period) removed when they are
        the first or last whole word of a field; the dotted form is
        recognised as well.
    generationals : list of str
        Generational suffixes (lower case, no trailing period). When the
        second field is exactly one of these, the first field is read as
        'Given Names Surname' and the suffix is appended to the given
        names.
    partial_surnames : list of str
        Surname particles (lower case, no trailing period). A particle that
        ends the given-names field is moved to the front of the surname.

    Returns
    -------
    pandas.Series
        One alias per label of `name_series`, sorted by index. The alias is
        missing where the name could not be interpreted.
    """
    # Local import: compiled patterns make pandas use Python's `re` engine,
    # which the look-around patterns below require.
    import re

    def with_dotted(tokens):
        # Accept every token both bare ('dr') and abbreviated ('dr.').
        tokens = list(tokens)
        return tokens + [token + '.' for token in tokens]

    has_comma = name_series.str.contains(',', regex=False, na=False)
    names = name_series.loc[has_comma].str.casefold()

    if names.empty:
        # Nothing looks like 'Surname, Given': every alias is missing.
        empty_result = pd.Series(
            pd.NA,
            index=name_series.index,
            dtype='object'
        )
        return empty_result.sort_index()

    names = names.str.split(',', expand=True)
    names = names.apply(lambda column: column.str.strip())

    # Rows with a third comma-separated field are ambiguous when the second
    # field is an honorific or generational suffix.
    if len(names.columns) > 2:
        more_fields = names[2].notna()
    else:
        more_fields = pd.Series(False, index=names.index)

    # Work on an independent two-column frame (0: surname, 1: given names)
    # and always assign through `names.loc[rows, column]`, so the writes
    # reach the frame regardless of pandas' copy-on-write mode.
    names = names[[0, 1]].copy()

    # 'Joanne Gerould, Ms.': the honorific sits where the given names
    # belong, so the real name is the first field in natural order.
    honorifics = with_dotted(drop_nouns)
    is_honorific = names[1].isin(honorifics)
    swap = is_honorific & ~more_fields

    if swap.any():
        pieces = names.loc[swap, 0].str.rsplit(' ', n=1, expand=True)
        # Guarantee both columns exist; a single-word field leaves the
        # surname missing, which makes the alias missing.
        pieces = pieces.reindex(columns=[0, 1])
        names.loc[swap, 0] = pieces[1]
        names.loc[swap, 1] = pieces[0]

    names.loc[is_honorific & more_fields, 1] = pd.NA

    # 'Martin Luther King, jr.': same idea, but the suffix is kept with the
    # given names so that it contributes an initial.
    is_generational = names[1].isin(with_dotted(generationals))
    move = is_generational & ~more_fields

    if move.any():
        suffixes = names.loc[move, 1]
        pieces = names.loc[move, 0].str.rsplit(' ', n=1, expand=True)
        pieces = pieces.reindex(columns=[0, 1])
        names.loc[move, 0] = pieces[1]
        names.loc[move, 1] = pieces[0] + ' ' + suffixes

    names.loc[is_generational & more_fields, 1] = pd.NA

    # Keep the string accessor usable even if a column became all-missing.
    names = names.astype(object)

    # Remove honorifics only when they are a whole leading/trailing word,
    # so surnames such as 'adams' or 'drummond' are left intact.
    alternation = '|'.join(
        re.escape(token)
        for token in sorted(honorifics, key=len, reverse=True)
    )
    honorific_at_edge = re.compile(
        rf'^(?:{alternation})(?=\s|$)|(?<=\s)(?:{alternation})$'
    )
    names = names.apply(
        lambda column: column.str.replace(
            honorific_at_edge, '', regex=True
        ).str.strip()
    )

    # 'Loon, H. van' -> surname 'van loon', given names 'h. '
    for particle in with_dotted(partial_surnames):

        ends_with_particle = names[1].str.endswith(' ' + particle, na=False)

        if not ends_with_particle.any():
            continue

        names.loc[ends_with_particle, 0] = (
            particle + ' ' + names.loc[ends_with_particle, 0]
        )
        names.loc[ends_with_particle, 1] = (
            names.loc[ends_with_particle, 1].str.slice(stop=-len(particle))
        )

    # Surname: drop everything that is not a letter.
    surname = names[0].str.replace(
        re.compile(r'[^\w]|[\d_]'), '', regex=True
    )
    # Given names: keep only the first character of every word.
    initials = names[1].str.replace(
        re.compile(r'(?!\b)\w*|\W*?'), '', regex=True
    )

    aliases = (surname + initials).str.casefold()

    # Re-attach the labels that had no comma, with a missing alias.
    missing = name_series.index.difference(aliases.index)

    if len(missing) > 0:
        aliases = pd.concat([
            aliases,
            pd.Series(pd.NA, index=missing, dtype='object')
        ])

    return aliases.sort_index()

# Sample inputs covering particles, honorifics, generational suffixes,
# compound names, and strings that are not personal names at all.
names = pd.Series([
    'Loon, H. van',
    'van Loon, h.',
    'van Loon, Harry',
    'VAN LOON, H',
    'Van loon, ',
    'some other person',
    'Rodríguez-Silva, Ileana',
    'nasa',
    'Martin Luther King, jr.',
    'King, Martin Luther jr.',
    'Mr. Martin Luther King, jr.',
    'St. Whatever, Given Name',
    'Whatever, Given Name St.',
    'University of Washington, Seattle',
    'University of Chicago',
    'Ms. Gerould, Joanne',
    'Gerould, Ms. Joanne',
    'Gerould, Joanne, Ms.',
    'Joanne Gerould, Ms.',
    'Surname, Compound Given-Name',
    'Monde, Alice le',
    'le Monde, Alice'
])

serial = names.map(western_surname_alias_generator_serial)
vector = western_surname_alias_generator_vector(names)

# The two implementations must agree on every sample: either the same alias
# or both missing. Assert it so a disagreement cannot go unnoticed (a bare
# expression in the middle of a cell is never displayed).
outputs_agree = (serial == vector) | (serial.isna() & vector.isna())
assert outputs_agree.all(), names[~outputs_agree].tolist()

# Cell output: each name next to its alias.
pd.concat([names, vector], axis=1)

In [None]:
import pandas as pd

def doi_alias_generator(doi_series, delimiters=['doi:', 'doi.org/', 'doi/']):
    """Strip a leading DOI prefix from each identifier string.

    For every string that contains one of `delimiters`, the alias is the
    text between the first occurrence of that delimiter and the next one
    (or the end of the string), with surrounding whitespace removed, e.g.
    'https://doi.org/zzz' -> 'zzz'.

    Parameters
    ----------
    doi_series : pandas.Series
        Identifier strings.
    delimiters : list of str
        Literal substrings (not regular expressions) that mark the end of
        the prefix. When a string contains several of them, the one listed
        last wins.

    Returns
    -------
    pandas.Series
        Unnamed series with the same index as `doi_series`; entries that
        contain no delimiter are `pd.NA`.
    """
    aliases = pd.Series(pd.NA, index=doi_series.index, dtype='object')

    # Later delimiters overwrite earlier ones, so the last match wins.
    for delimiter in delimiters:

        has_delimiter = doi_series.str.contains(
            delimiter,
            regex=False,
            na=False
        )

        if not has_delimiter.any():
            continue

        stripped = doi_series.loc[has_delimiter].map(
            lambda s: s.split(delimiter)[1].strip()
        )

        # Assign by position so duplicate index labels cannot break
        # alignment.
        aliases.loc[has_delimiter] = stripped.to_numpy()

    return aliases

# Sample identifiers: three bare strings, followed by strings that carry one
# of the default DOI prefixes in front of 'yyy' or 'zzz'.
identifiers = pd.Series([
    'xxx',
    'yyy',
    'zzz',
    'doi:yyy',
    'https://doi.org/zzz',
    'doi/yyy'
])

# Cell output: the alias generated for each identifier.
doi_alias_generator(identifiers)

In [None]:
import bibliograph as bg

# CSV alias files, keyed by 'actor' and 'work'.
aliases_dict = {
    'actor': 'bibliograph/test_data/aliases_actor.csv',
    'work': 'bibliograph/test_data/aliases_work.csv'
}

tn = bg.slurp_shorthand(
    'bibliograph/test_data/shorthand_with_aliases.shnd',
    "bibliograph/resources/default_entry_syntax.csv",
    "bibliograph/resources/default_link_syntax.csv",
    syntax_case_sensitive=False,
    aliases_dict=aliases_dict,
    aliases_case_sensitive=False,
    item_separator='__',
    space_char='|',
    na_string_values='!',
    na_node_type='missing',
    default_entry_prefix='wrk',
    skiprows=2,
    comment_char='#',
)

# Every spelling that is expected to resolve to node 2, in the order the
# strings are stored.
node_2_aliases = [
    'NASA',
    'National Aeronautics and Space Administration',
    'nasa',
    'national aeronautics and space administration'
]

strings_with_node_2 = tn.strings.loc[tn.strings['node_id'] == 2, 'string']

# Compare as plain lists: a missing or extra alias then fails the assertion
# with a readable message instead of a length-mismatch ValueError.
assert strings_with_node_2.tolist() == node_2_aliases, (
    'unexpected strings for node 2: {}'.format(strings_with_node_2.tolist())
)

# Cell output: all resolved strings, ordered by the node they map to.
tn.resolve_strings().sort_values(by='node_id')


In [None]:
# Run the tests in bibliograph/tests.py through the shell; the path is
# relative to the notebook's working directory.
!pytest bibliograph/tests.py

In [None]:
import bibliograph as bg
import pandas as pd

# CSV alias files, keyed by 'actor' and 'work'.
aliases_dict = {
    'actor': 'bibliograph/test_data/aliases_actor.csv',
    'work': 'bibliograph/test_data/aliases_work.csv'
}

# Same input as the earlier shorthand cells, but with automatic aliasing
# switched on.
tn = bg.slurp_shorthand(
    'bibliograph/test_data/shorthand_with_aliases.shnd',
    "bibliograph/resources/default_entry_syntax.csv",
    "bibliograph/resources/default_link_syntax.csv",
    syntax_case_sensitive=False,
    aliases_dict=aliases_dict,
    aliases_case_sensitive=False,
    automatic_aliasing=True,
    item_separator='__',
    space_char='|',
    na_string_values='!',
    na_node_type='missing',
    default_entry_prefix='wrk',
    skiprows=2,
    comment_char='#',
)

# Only the last expression of a cell is rendered, so show both tables
# explicitly (`display` is provided by the IPython/Jupyter kernel).
display(tn.resolve_assertions().query('link_type == "alias"'))
display(tn.resolve_strings().sort_values(by='node_id'))