In [1]:
import pandas as pd
import shorthand as shnd

# Build a parser from the default entry and link syntax files found under
# shorthand/resources.
s = shnd.Shorthand(
    entry_syntax="shorthand/resources/default_entry_syntax.csv",
    link_syntax="shorthand/resources/default_link_syntax.csv",
    syntax_case_sensitive=False
)

# Alternative input, disabled. It used to be parked in a bare triple-quoted
# string, which Python evaluates as an expression statement rather than
# treating as a comment, so it is written as real comments here.
# parsed = s.parse_text(
#     'shorthand/tests/manual_annotation.shnd',
#     skiprows=2,
#     comment_char='#'
# )

# Parse the single-column test file, spelling out the separator, prefix,
# space and NA placeholder settings explicitly.
parsed = s.parse_text(
    'shorthand/tests/single_column.shnd',
    item_separator='__',
    default_entry_prefix='wrk',
    space_char='|',
    na_string_values='!',
    na_node_type='missing',
    skiprows=0,
    comment_char='#',
    drop_na=False
)

links = parsed.links

# Only the last expression of a cell is displayed, so the exploratory results
# in this section are bound to names instead of being computed and discarded.
links_tail = links.iloc[60:]

# Memory-usage report, disabled. It used to be parked in a bare triple-quoted
# string (an evaluated expression, not a comment).
# print('       strings', parsed.strings.memory_usage(deep=True).sum()/1000, 'kb')
# print('         links', parsed.links.memory_usage(deep=True).sum()/1000, 'kb')
# print('resolved links', parsed.resolve_links().memory_usage(deep=True).sum()/1000, 'kb')

# Target string ids of every link whose type id matches the 'entry' link type.
entry_type = parsed.id_lookup('link_types', 'entry')
entry_string_ids = parsed.links.loc[
    parsed.links['link_type_id'] == entry_type,
    'tgt_string_id'
]

# Resolved links whose source string contains "nasa", excluding rows where
# either end has node type "shorthand_text".
nasa_links = (
    parsed.resolve_links()
    .query('src_string.str.contains("nasa")')
    .query('src_node_type != "shorthand_text"')
    .query('tgt_node_type != "shorthand_text"')
)

# Resolved "cited" links joined index-to-index with the raw links table.
cited_links = (
    parsed.resolve_links()
    .query('link_type == "cited"')
    .merge(parsed.links, left_index=True, right_index=True)
)

# Keep the synthesized entries under their own name: rebinding `s` here would
# overwrite the Shorthand parser created earlier in this cell with a
# different kind of object.
synthesized = parsed.synthesize_shorthand_entries('wrk', fill_spaces=True)

# Expected shorthand strings for the 'wrk' entries, in order.
check = pd.Series([
    'asmith_bwu__1999__s_bams__101__803__xxx',
    'asmith_bwu__1998__s_bams__100__42__yyy',
    'bjones__1975__s_jats__90__1__!',
    'bwu__1989__t_long|title__!__80__!',
    'Some|Author__1989__t_A|Title|With|\\#__!__!__!',
    'asmith_bwu__2008__s_bams__110__1__zzz'
])

# Compute the comparison once, fail loudly on a mismatch (as the BibTeX check
# in the next cell does), then show the result as the cell output.
all_match = (check == synthesized).all()
assert all_match, 'synthesized shorthand entries differ from the expected strings'

all_match

True

In [2]:
import pandas as pd
import shorthand as shnd
from bibtexparser.bparser import BibTexParser

# Read the short BibTeX test file and load its entries into a DataFrame.
bibtex_parser = BibTexParser(common_strings=True)

with open(
    "shorthand/tests/bibtex_test_data_short.bib",
    encoding='utf8'
) as bib_file:
    bibdatabase = bibtex_parser.parse_file(bib_file)

data = pd.DataFrame(bibdatabase.entries)

# Parser configured from the BibTeX entry syntax file only; no link syntax
# file is passed here.
s = shnd.Shorthand(
    entry_syntax="shorthand/resources/default_bibtex_syntax.csv",
    syntax_case_sensitive=False
)

# Placeholder settings for the item parser, collected in one place.
parse_options = {
    'space_char': '|',
    'na_string_values': '!',
    'na_node_type': 'missing',
}

parsed = s.parse_items(data, **parse_options)

# NOTE(review): 8 is a hard-coded node type id. Judging by the expected values
# below it presumably selects the identifier (DOI) strings - confirm, and
# consider looking the id up by name instead of hard-coding it.
parsed_identifiers = parsed.strings.query('node_type_id == 8')

# Expected identifier strings, in order.
check = pd.Series([
    '10.1038/194638b0',
    '10.1175/1520-0493(1962)090<0311:OTOKEB>2.0.CO;2',
    '10.3402/tellusa.v14i3.9551',
    '10.1175/1520-0477-43.9.451',
    '10.3402/tellusa.v14i4.9569',
    '10.1007/BF02317953',
    '10.1007/BF02247180',
    '10.1029/JZ068i011p03345',
    '10.1029/JZ068i009p02375',
])

# Compare against the column's underlying array so values are matched by
# position. Evaluate the comparison once, assert on it, then display it.
identifiers_match = (check == parsed_identifiers['string'].array).all()
assert identifiers_match, 'parsed identifier strings differ from the expected values'

identifiers_match

True

In [None]:
from bibtexparser.bibdatabase import BibDatabase as _bibtex_db
from bibtexparser import dumps as _dump_bibtex_string

def btwriter(entry_series):
    """Serialize one entry to a BibTeX string.

    Missing fields are dropped and every remaining value is converted with
    ``str`` before the entry is placed in a fresh database object and dumped.

    Parameters
    ----------
    entry_series : pandas.Series
        A single entry, indexed by field name.

    Returns
    -------
    The value returned by the bibtexparser ``dumps`` function for a database
    containing only this entry.
    """
    present_fields = entry_series.dropna()
    fields_as_text = present_fields.map(str)

    database = _bibtex_db()
    database.entries = [dict(fields_as_text)]
    return _dump_bibtex_string(database)

# Serialize only the row labelled 9 rather than applying btwriter to every
# row of the frame and then discarding all results but one.
print(btwriter(data.loc[9]))