In [None]:
import bibliograph as bg

# Candidate BibTeX inputs; exactly one assignment is left active.
# NOTE(review): these are absolute, machine-specific Windows paths, so this
# cell only runs where that directory layout exists.
#bibtex_file = 'C:\\Users\\short\\Dropbox\\PhD\\bibliograph\\test data\\bibtex_test_data_short.bib'
bibtex_file = 'C:\\Users\\short\\Dropbox\\PhD\\bibliograph\\test data\\bibtex_test_data_2.bib'
#bibtex_file = 'C:\\Users\\short\\Dropbox\\PhD\\dissertation\\NCARpapers1992.bib'

# Load the selected file through bibliograph with case_sensitive=False.
tn = bg.load_bibtex(bibtex_file, case_sensitive=False)
# Resolve edges for 'name' with drop=True and select five columns of the
# result. The value is neither assigned nor the last expression of the cell,
# so the notebook discards it.
tn.resolve_edges('name', drop=True)[['src_string', 'link_type', 'tgt_string', 'src_node_type', 'tgt_node_type']]
# Last expression of the cell: its return value is what the notebook shows.
tn.resolve_assertions()

In [None]:
import random 
import string
import pandas as pd

# cleanup missing data from a test dataframe

# Configuration for the random test-data generator below.

# number of rows sampled into the final DataFrame
total_docs = 10
# size of the per-document pools (years, volumes, pages, dois)
num_unique_docs = 600
num_unique_author_lists = 540 # ~10% of author lists are repeats
# candidate lengths (in characters) for a random author-list string
author_list_lengths = range(5,35)
# candidate publication years; the upper bound is exclusive
year_range = range(1960,2000)
# total size of the publication-title pool, missing entries included
num_publications = 80
# missing-value counts are expressed as a percentage of num_unique_docs
num_NaN_pubs = int((5/100)*num_unique_docs)
# candidate lengths (in characters) for a random publication title
publication_title_lengths = range(5,15)
volume_range = range(1,100)
num_NaN_vols = int((10/100)*num_unique_docs)
page_range = range(1,15000)
num_NaN_pages = int((15/100)*num_unique_docs)
# length (in characters) of a DOI string
avg_doi_length = 20
num_NaN_dois = int((50/100)*num_unique_docs)


# alphabet for the random strings: every ASCII letter, lowercase first
chars = string.ascii_lowercase + string.ascii_uppercase
def get_random_string(size):
    """Return a string of ``size`` characters picked at random from ``chars``.

    One ``random.choice`` call is made per character, so the result is
    reproducible for a given state of the ``random`` module. A ``size`` of
    zero yields the empty string.
    """
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return ''.join(picked)

# Build one pool of candidate values per field, then sample total_docs rows
# (with replacement) from those pools. Each pool that may contain missing
# values is padded with pd.NA so that its total size matches the configured
# count.

# random strings for abbreviated author lists
authors = [get_random_string(random.choice(author_list_lengths))
           for i in range(num_unique_author_lists)]

# random year per document
years = [str(y) for y in year_range]
years = [random.choice(years) for i in range(num_unique_docs)]

# random publication titles; num_NaN_pubs of the num_publications entries
# are missing
pubs = [get_random_string(random.choice(publication_title_lengths))
            for i in range(num_publications - num_NaN_pubs)]
pubs = pubs + [pd.NA]*num_NaN_pubs

# random volume number per document; num_NaN_vols of the num_unique_docs
# entries are missing. The non-missing count must subtract num_NaN_vols
# (not num_NaN_pubs), otherwise the pool is larger than num_unique_docs and
# the missing fraction is lower than configured.
volumes = [str(y) for y in volume_range]
volumes = [random.choice(volumes)
           for i in range(num_unique_docs - num_NaN_vols)]
volumes = volumes + [pd.NA]*num_NaN_vols

# random page number per document; num_NaN_pages entries are missing
pages = [str(y) for y in page_range]
pages = [random.choice(pages) for i in range(num_unique_docs - num_NaN_pages)]
pages = pages + [pd.NA]*num_NaN_pages

# random doi per document; num_NaN_dois entries are missing. The string
# length comes from the configuration constant instead of a literal.
dois = [get_random_string(avg_doi_length)
        for i in range(num_unique_docs - num_NaN_dois)]
dois = dois + [pd.NA]*num_NaN_dois

# every column is sampled independently of the others
data = pd.DataFrame({'author':[random.choice(authors) for i in range(total_docs)],
                     'year':[random.choice(years) for i in range(total_docs)],
                     'pub':[random.choice(pubs) for i in range(total_docs)],
                     'vol':[random.choice(volumes) for i in range(total_docs)],
                     'page':[random.choice(pages) for i in range(total_docs)],
                     'doi':[random.choice(dois) for i in range(total_docs)]})

In [281]:
import random 
import string
import numpy as np
import pandas as pd

## make test data
# Eight hand-written "true" records; pd.NA marks values that are missing
# even in the base data.
columns = [ 'authors', 'year',  'title', 'volume', 'page', 'doi']
data = [[    'asmith',   2005,   'bams',      100,   3231,     2],
        [       'bwu',   2000, 'nature',     2575,    801,     3],
        [    'asmith',   2010,   'bams',      105,    457,     5],
        [       'bwu',   2011,   'jats',       90,     35,    11],
        ['alicesmith',   2011,   'jats',       90,    711,    13],
        [     'bobwu',   2012,    pd.NA,    pd.NA,  pd.NA, pd.NA],
        ['asmith_bwu',   2015, 'report',    pd.NA,     40, pd.NA],
        [       pd.NA,   1975,   'long',    pd.NA,  pd.NA, pd.NA]]
data = pd.DataFrame(data, columns=columns)

# duplicate the test data a bunch of times to make a big dataset: each pass
# doubles the frame, so 14 passes give 8 * 2**14 rows. pd.concat is used
# because DataFrame.append no longer exists in pandas 2.x.
for i in range(14):
    data = pd.concat([data, data], ignore_index=True)

# Blank out a random subset of cells in the expanded dataset to simulate
# records assembled from multiple sources, each of which may lack values
# that are present in the "true" data.
# NOTE: no random seed is set, so the blanked cells differ between runs.
fraction_missing = 0.5
sample = random.sample(range(data.size), int(data.size*fraction_missing))
idx = np.unravel_index(sample, data.shape)
is_blanked = np.zeros(data.shape, dtype=bool)
is_blanked[idx] = True
# mask() returns a new frame with NaN wherever is_blanked is True and
# upcasts columns as needed (the integer 'year' column becomes float)
data = data.mask(is_blanked)

# collapse a column to its single non-missing value, or to pd.NA when every
# entry is missing; more than one distinct value is an error
def get_unique(series):
    """Return the only distinct non-missing value found in ``series``.

    Missing values are ignored (``value_counts`` drops them). Returns
    ``pd.NA`` when nothing is left, and raises ``ValueError`` naming the
    series when more than one distinct value is present.
    """
    counts = series.value_counts()
    n_distinct = len(counts)
    if n_distinct == 0:
        return pd.NA
    if n_distinct == 1:
        return counts.index[0]
    raise ValueError('inconsistent values in column "{}"'
                     .format(series.name))

# recover the base dataset from the simulated patchy data

# It's reasonable to assume that if the author, year, and page of two
# entries are the same then the entries represent the same document,
# even if they have no doi. The same holds for title, volume, and page.

# pick out rows that are complete in the key columns but patchy elsewhere
def get_match_candidates(df, notna_cols, how='index'):
    '''
    Select the rows of ``df`` that have a value in every column listed in
    ``notna_cols`` and are missing a value in at least one of the
    remaining columns. When ``notna_cols`` covers every column, only the
    first condition applies.

    ``how='index'`` returns the index labels of those rows, ``how='rows'``
    returns the rows themselves; any other value raises ``ValueError``.
    '''

    keep = ~df[notna_cols].isna().any(axis='columns')

    remaining = [col for col in df.columns if col not in notna_cols]
    if remaining:
        has_gap = df[remaining].isna().any(axis='columns')
        keep = keep & has_gap

    if how not in ('index', 'rows'):
        raise ValueError('how parameter must be "index" or "rows", not {}'
                         .format(how))

    selected = df.loc[keep, :]
    return selected.index if how == 'index' else selected

def reduce_rows(df, columns, **kwargs):
    """Collapse each group of rows sharing values in ``columns`` to one row.

    ``df`` is grouped by ``columns`` (extra keyword arguments are forwarded
    to ``groupby``) and ``get_unique`` is applied to every column of every
    group, so a group whose rows disagree in some column raises the
    ``ValueError`` produced by ``get_unique``.
    """
    def _collapse(group):
        # one value per column: the single distinct value or pd.NA
        return group.apply(get_unique)

    grouped = df.groupby(by=columns, **kwargs)
    return grouped.apply(_collapse)

# exact duplicate rows add nothing, drop them before matching
data = data.drop_duplicates()

# column sets that are treated as identifying a document
matcher_parameters = {'ayp':['authors', 'year', 'page'],
                      'tvp':['title', 'volume', 'page'],
                      'doi':['doi']}

# index labels of the candidate rows for each matcher
match_cnddte_indexes = {name: get_match_candidates(data, cols)
                        for name, cols in matcher_parameters.items()}

# report how many candidate rows each matcher found
for name, cols in matcher_parameters.items():
    n_found = len(match_cnddte_indexes[name])
    print('found {} rows to reduce based on matching values for columns {}'
          .format(n_found, cols))

# collapse the candidate rows of each matcher to one row per key
reductions = {name: reduce_rows(data.loc[match_cnddte_indexes[name]], cols)
              for name, cols in matcher_parameters.items()}

reductions['doi']



found 37 rows to reduce based on matching values for columns ['authors', 'year', 'page']
found 35 rows to reduce based on matching values for columns ['title', 'volume', 'page']
found 155 rows to reduce based on matching values for columns ['doi']


Unnamed: 0_level_0,authors,year,title,volume,page,doi
doi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,asmith,2005.0,bams,100,3231,2
3,bwu,2000.0,nature,2575,801,3
5,asmith,2010.0,bams,105,457,5
11,bwu,2011.0,jats,90,35,11
13,alicesmith,2011.0,jats,90,711,13


In [73]:
def _check_for_prefix(s):
    parts = s.split('_', maxsplit=1)
    if (len(parts) > 1) and (len(parts[0]) == 1):
        return True
    else:
        return False


def _insert_default_prefix(column, default_codes, has_prefix):
    prefix = default_codes[column.name] + '_'
    no_prefix =  ~has_prefix[column.name]
    return column.mask(no_prefix, lambda x: prefix + x)


def _expand_manual_entries(df, coded_cols=None):
    
    if type(coded_cols) == dict:
        default_codes = coded_cols
        coded_cols = default_codes.keys()
    elif bg.iterable_not_string(coded_cols):
        default_codes = None
    else:
        coded_cols = [coded_cols]
        default_codes = None

    # combine columns to get a Series of all strings representing
    # documents
    entries = df.stack() \
                .reset_index(drop=True) \
                .drop_duplicates()
    entries.name = 'transcribed_value'

    # split on double underbars to get a dataframe of single-underbar
    # delimited strings, and join those columns with the original 
    # column of input values
    entries = pd.concat([entries, entries.str.split('__', expand=True)],
                        axis=1)
    
    if coded_cols is not None:
        # check for values in coded columns with missing prefix codes
        has_prefix = entries[coded_cols].applymap(_check_for_prefix)
        has_prefix = has_prefix.loc[:, has_prefix.any()]
        some_prefixes_missing = ~has_prefix.all()

        # if there are values with missing prefix codes, insert defaults
        if some_prefixes_missing.any():
            if default_codes is None:
                raise ValueError('Some values in columns with prefix codes '
                                 'are missing prefixes. Use the default_codes '
                                 'keyword argument to assign default prefixes.')
            mixed_columns = has_prefix.loc[:, ~has_prefix.all()].columns
            with_prefix = entries[mixed_columns]
            with_prefix = with_prefix.apply(_insert_default_prefix,
                                            args=(default_codes, has_prefix))
            entries.loc[:, mixed_columns] = with_prefix
        
        disagged = entries[coded_cols].stack().str.split('_', expand=True)
        disagged.index = disagged.index.get_level_values(0)
        disagged = disagged.pivot(columns=0, values=1)
        
        uncoded_cols = [c for c in entries.columns if c not in coded_cols]
        
        entries = pd.concat([entries[uncoded_cols], disagged], axis='columns')

    return entries

In [8]:
import bibliograph as bg
import pandas as pd

# Two-column frame of '__'-delimited strings: column 0 repeats the same
# entry in every row, column 1 holds one different entry per row.
data = pd.DataFrame([['asmith__1999__bams__101__803__xxx','asmith_bwu__1998__bams__100__42__yyy'],
                     ['asmith__1999__bams__101__803__xxx','bjones__1975__jats__90__1'],
                     ['asmith__1999__bams__101__803__xxx','bwu__1989__t_long|title__x__80'],
                     ['asmith__1999__bams__101__803__xxx','f__nasa__NASA|grant|12345-6789'],
                     ['asmith__1999__bams__101__803__xxx','k__bethany|wu'],
                     ['asmith__1999__bams__101__803__xxx','o__nasa'],
                     ['asmith__1999__bams__101__803__xxx','q__super great quote, right?'],
                     ['asmith__1999__bams__101__803__xxx','n__this is an article I made up for testing']])

# Keep only the rows whose column-1 string does NOT have '__' at character
# positions 1-2, i.e. drop the entries that start with a single character
# followed by '__' (the f__/k__/o__/q__/n__ rows above).
data = data.loc[data[1].str.slice(start=1, stop=3) != '__']
#coded_cols = [2]
# column 2 (third '__'-separated part) gets the default code 's'
default_codes = {2:'s'}
# Run bibliograph's single-space parser over every cell.
# NOTE(review): in the output below 'long|title' appears as 'long title', so
# the parser presumably turns '|' into a space -- confirm in bibliograph.
data = data.applymap(bg.data_file_input._manual_single_space_parser)
#bg.data_file_input._expand_manual_entries(data, default_codes)
# Last expression of the cell: the expanded table is what gets displayed.
bg.data_file_input._expand_manual_works(data, default_codes)

Unnamed: 0,transcribed_value,0,1,3,4,5,s,t
0,asmith__1999__bams__101__803__xxx,asmith,1999,101,803,xxx,bams,
1,asmith_bwu__1998__bams__100__42__yyy,asmith_bwu,1998,100,42,yyy,bams,
3,bjones__1975__jats__90__1,bjones,1975,90,1,,jats,
5,bwu__1989__t_long title__x__80,bwu,1989,x,80,,,long title


ND__nd_lt_rep__nd_lt_rep__...__nd_lt_rep tag tag, ND__nd_lt_rep__nd_lt_rep__...__nd_lt_rep tag tag, LT tag tag

actor_actor_actor__date__title__volume__page__doi

work__actor_author_rep__date_published_rep__work_title_rep__work_volume_rep__work_page_rep__identifier_doi_rep

In [16]:
import re
import random
import string

# Build a large random string to benchmark the '|' handling expression.

# list of 10 million random ASCII letters and digits
chars = string.ascii_letters + string.digits
a = random.choices(chars, k=int(1e7))

# overwrite 34000 distinct random positions with a bare '|'
indexes = random.sample(range(len(a)), k=34000)

for i in indexes:
    a[i] = '|'

# at 500 distinct random positions insert an escaped pipe
indexes = random.sample(range(len(a)), k=500)

for i in indexes:
    # slice assignment from a 2-character string: the elements at i and i+1
    # are replaced by a backslash and a '|'
    a[i:i+2] = '\\|'

# collapse the character list into the single string used by the benchmark
a = ''.join(a)
#s = "ab|||cd|this|one:|\||doesn't|get|whitespaced"

#'|'.join((' '.join(re.split("(?<!\\\)[|]", s))).split("\|"))

In [17]:
%%timeit
'|'.join((' '.join(re.split("(?<!\\\)[|]", a))).split("\|"))


16.7 s ± 409 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
import re
# Experiment: split a string on a separator depending on whether the
# separator is preceded by a backslash.
sep='__'
a = 'this__string should __ not__be split here\\\\__but should be split __elsewhere'
# The patterns are raw strings (no invalid escape sequences) and the
# separator is passed through re.escape so that it is always matched
# literally. The separator sits in a capture group, so re.split returns it
# between the pieces; [::2] keeps only the pieces.

# negative lookbehind: split where the separator is NOT preceded by a
# backslash (not the last expression of the cell, so its value is discarded)
re.split(r"(?<!\\)({})".format(re.escape(sep)), a)[::2]
# positive lookbehind: split only where the separator IS preceded by a
# backslash (last expression of the cell, so this is what gets displayed)
re.split(r"(?<=\\)({})".format(re.escape(sep)), a)[::2]


['this__string should __ not__be split here\\\\',
 'but should be split __elsewhere']

In [2]:
import bibliograph as bg
import bibtexparser

# NOTE(review): absolute, machine-specific Windows path; the cell only runs
# where this file exists.
filename = "C:\\Users\\short\\Dropbox\\PhD\\bibliograph\\test data\\bibtex_test_data_short.bib"

# Disabled alternative: parse the file directly with bibtexparser into
# `data` instead of going through bibliograph.
#bibtex_parser = bibtexparser.bparser.BibTexParser(common_strings=True)
#with open(filename, encoding='utf8') as f:
#    data = bibtex_parser.parse_file(f)
#data.entries_dict
# Load the file through bibliograph with default options; the timing and
# size lines printed below this cell come from this call.
tn = bg.load_bibtex(filename)


processing bibtex file C:\Users\short\Dropbox\PhD\bibliograph\test data\bibtex_test_data_short.bib
	loaded 11 records from bibtex file in 0.1 seconds
	size of initial dataframe: 0.1 mb
	created TextNet in 0.4 seconds
	overall time 0.5 seconds


In [6]:
# Display the resolved assertions of `tn`, the object returned by
# bg.load_bibtex in the previous cell; drop=True is forwarded to bibliograph.
tn.resolve_assertions(drop=True)

Unnamed: 0,assertion_id,source_id,link_type,src_string,tgt_string
0,0,0,author,"Newkirk, Gordon A.","@article{newkirk_daytime_1962,\n author = {New..."
1,1,0,author,"Eddy, John A.","@article{newkirk_daytime_1962,\n author = {New..."
2,17,0,contains,Nature,"@article{newkirk_daytime_1962,\n author = {New..."
3,28,0,title,Daytime Sky Radiance from Forty to Eighty Thou...,"@article{newkirk_daytime_1962,\n author = {New..."
4,46,0,metadata,1962__194__4829__638__641__10.1038/194638b0,"@article{newkirk_daytime_1962,\n author = {New..."
5,2,0,author,"Wiin-Nielsen, A.","@article{wiin-nielsen_transformation_1962,\n a..."
6,18,0,contains,Monthly Weather Review,"@article{wiin-nielsen_transformation_1962,\n a..."
7,29,0,title,ON TRANSFORMATION OF KINETIC ENERGY BETWEEN TH...,"@article{wiin-nielsen_transformation_1962,\n a..."
8,48,0,metadata,1962__90__8__311__323__10.1175/1520-0493(1962)...,"@article{wiin-nielsen_transformation_1962,\n a..."
9,3,0,author,"Wiin-Nielsen, A.","@article{wiin-nielsen_truncation_1962,\n abstr..."


In [90]:
# Render one parsed record back to BibTeX text with bibtexparser's writer.
# NOTE(review): `data` must be a bibtexparser database (it is read through
# `.entries`); the only code in this notebook that creates one is commented
# out in the load cell, so this cell depends on hidden kernel state.
w = bibtexparser.bwriter.BibTexWriter()
# NOTE(review): _entry_to_bibtex is a private method of the writer.
w._entry_to_bibtex(data.entries[0])

'@article{newkirk_daytime_1962,\n author = {Newkirk, Gordon A. and Eddy, John A.},\n copyright = {1962 Nature Publishing Group},\n doi = {10.1038/194638b0},\n file = {Newkirk_Eddy_1962_Daytime Sky Radiance from Forty to Eighty Thousand Feet.pdf:C\\:\\\\Dropbox\\\\Zotero\\\\storage\\\\6S2N7WD8\\\\Newkirk_Eddy_1962_Daytime Sky Radiance from Forty to Eighty Thousand Feet.pdf:application/pdf},\n issn = {1476-4687},\n journal = {Nature},\n language = {en},\n month = {May},\n note = {Number: 4829\nPublisher: Nature Publishing Group},\n number = {4829},\n pages = {638--641},\n title = {Daytime {Sky} {Radiance} from {Forty} to {Eighty} {Thousand} {Feet}},\n url = {https://www.nature.com/articles/194638b0},\n urldate = {2020-08-19},\n volume = {194},\n year = {1962}\n}\n\n'