In [1]:
import pandas as pd
import shorthand as shnd

# Configure the shorthand parser. It is bound to `parser` (not `s`) so the
# Shorthand instance is not overwritten when `s` is assigned the synthesized
# entry strings near the end of this cell.
parser = shnd.Shorthand(
    entry_syntax="shorthand/resources/default_entry_syntax.csv",
    link_syntax="shorthand/resources/default_link_syntax.csv",
    item_separator='__',
    default_entry_prefix='wrk',
    space_char='|',
    na_string_values='!',
    na_node_type='missing',
    syntax_case_sensitive=False
)

# Disabled alternative input (manual annotation test file):
# parsed = parser.parse_text(
#     'shorthand/test_data/manual_annotation.shnd',
#     skiprows=2,
#     comment_char='#'
# )

# Parse the single-column test file, keeping NA items (drop_na=False)
parsed = parser.parse_text(
    'shorthand/test_data/single_column.shnd',
    skiprows=0,
    comment_char='#',
    drop_na=False
)

# NOTE: only the last expression of a cell is displayed, so the intermediate
# bare expressions below are evaluated and their results discarded.
links = parsed.links
links.iloc[60:]

# Disabled memory-usage report:
# print('       strings', parsed.strings.memory_usage(deep=True).sum()/1000, 'kb')
# print('         links', parsed.links.memory_usage(deep=True).sum()/1000, 'kb')
# print('resolved links', parsed.resolve_links().memory_usage(deep=True).sum()/1000, 'kb')

# Target string IDs of every link whose type is 'entry'
entry_type = parsed.id_lookup('link_types', 'entry')
entry_string_ids = parsed.links.loc[parsed.links['link_type_id'] == entry_type, 'tgt_string_id']

# Resolved links whose source string mentions "nasa", excluding links that
# touch shorthand_text nodes on either end
parsed.resolve_links().query('src_string.str.contains("nasa")').query('src_node_type != "shorthand_text"').query('tgt_node_type != "shorthand_text"')

# Resolved 'cited' links joined index-to-index with the raw links table
parsed.resolve_links().query('link_type == "cited"').merge(parsed.links, left_index=True, right_index=True)

# Rebuild entry strings for the 'wrk' prefix from the parsed data
s = parsed.synthesize_entries('wrk', fill_spaces=True)

# Expected synthesized entries, in order
check = pd.Series([
    'asmith_bwu__1999__s_bams__101__803__xxx',
    'asmith_bwu__1998__s_bams__100__42__yyy',
    'bjones__1975__s_jats__90__1__!',
    'bwu__1989__t_long|title__!__80__!',
    'Some|Author__1989__t_A|Title|With|\\#__!__!__!',
    'asmith_bwu__2008__s_bams__110__1__zzz'
])

# True when every synthesized entry matches its expected string
(check == s).all()

True

In [4]:
# Place a numeric Series and a string Series side by side. With
# ignore_index=True the resulting columns are relabelled 0..n-1.
pd.concat(
    [pd.Series([1, 2, 3]), pd.Series(['a', 'b', 'c'])],
    axis=1,
    ignore_index=True,
)

Unnamed: 0,0,1
0,1,a
1,2,b
2,3,c


In [12]:
# Bare attribute access: evaluates to the Series' .str.replace accessor method.
# NOTE(review): the output stored with this cell is the help text for the
# built-in str.replace (Type: method_descriptor), not pandas'
# Series.str.replace -- presumably left over from a `?` introspection. Confirm,
# then re-run or remove this cell.
pd.Series(['_____', '__ ', 'a']).str.replace

Signature: str.replace(self, old, new, count=-1, /)
Docstring:
Return a copy with all occurrences of substring old replaced by new.

  count
    Maximum number of occurrences to replace.
    -1 (the default value) means replace all occurrences.

If the optional argument count is given, only the first count occurrences are
replaced.
Type:      method_descriptor


In [9]:
# Load the single-column shorthand test file using pandas' default CSV parsing
pd.read_csv(
    filepath_or_buffer='shorthand/test_data/single_column.shnd'
)

Unnamed: 0,left_entry,right_entry,link_tags_or_override,reference
0,asmith_bwu__1999__bams__101__803__xxx,,,
1,asmith_bwu__1998__bams__100__42__yyy,,,
2,bjones__1975__jats__90__1__ bjones1975_is_tagg...,,,
3,bwu__1989__t_long|title__!__80,,,
4,Some Author__1989__t_A Title With \#__!__!,,,
5,asmith_bwu__2008__bams__110__1__zzz,,,


In [6]:
from io import StringIO
import pandas as pd

# Every line of the sample text ends with a trailing comma, so the parser
# sees a second (empty) field on each row, including the header.
with StringIO('left entry,\na,\nb,\nc,') as stream:
    a = pd.read_csv(filepath_or_buffer=stream)

# Show the parsed frame
a

Unnamed: 0,left entry,Unnamed: 1
0,a,
1,b,
2,c,


In [None]:
'''
Make a function that reads a csv file representing entries into
ParsedShorthand. Column labels are missing or numeric; entry syntax is
defined positionally as if each line were an entry string in the manual
scheme.
'''

import pandas as pd
import shorthand as shnd

# Placeholder inputs for the sketch below (presumably the eventual function's
# arguments -- confirm).
# NOTE(review): na_string_values is empty, so na_string_values[0] further down
# raises IndexError; supply at least one NA string before running this cell.
na_string_values = []
na_node_type = 'missing'
entry_node_type = 'entry_node_type'
# NOTE(review): entry_syntax is a str placeholder, but later code indexes it
# by column name (entry_syntax['item_node_type'], ['item_label'],
# ['item_link_type']) and uses .loc, which presumably requires a DataFrame --
# confirm and load the real entry syntax.
entry_syntax = ''
space_char = '|'
# Nullable integer dtypes used for row IDs and for small ID maps
big_id_dtype = pd.Int32Dtype()
small_id_dtype = pd.Int8Dtype()

# Placeholder for the CSV contents: two columns, no rows
data = pd.DataFrame(columns=['item', 'labels'])

# Join each row's values (converted with str) into one ', '-separated string.
# This runs before the NA fill below, so missing values are stringified as-is.
entries = data.apply(lambda x: ', '.join(map(str, x)), axis=1)
# One row per entry string, every row given the same entry node type
entries = pd.DataFrame({'string': entries.array, 'node_type': entry_node_type})
# Move the positional index into an explicit 'csv_row' column
entries = entries.reset_index().rename(columns={'index': 'csv_row'})

# Replace NA values and empty strings with the first string in
# na_string_values
data = data.fillna(na_string_values[0])
data = data.replace('', na_string_values[0])

# items with no node type in the entry syntax are prefixed to
# indicate which node type they correspond to
item_is_prefixed = entry_syntax['item_node_type'].isna()

# Only restructure the data when at least one syntax item is prefixed
if item_is_prefixed.any():

    # Syntax rows for the prefixed items, and the labels of those items
    prefixed_items = entry_syntax.loc[item_is_prefixed]
    labels_of_prefixed_items = prefixed_items['item_label'].array

    # stack the prefixed items into a series indexed by
    # (input index, item label)
    disagged = data[labels_of_prefixed_items].stack()

    # Split the prefixes off of the stacked items and expand into a
    # dataframe. Groups are formed per item label (index level 1).
    # NOTE(review): the return shape of _item_prefix_splitter is not visible
    # here; the pivot below assumes it yields a column labelled 0 -- confirm.
    disagged = disagged.groupby(level=1).apply(
        shnd.entry_parsing._item_prefix_splitter,
        prefixed_items
    )

    # drop the item labels from the multiindex so the disaggregated
    # items align with the index of the entry group
    disagged.index = disagged.index.droplevel(1)

    # pivot the disaggregated items to create a dataframe with
    # columns for each item prefix
    disagged = disagged.pivot(columns=0)
    # keep only the second level of the pivoted column labels
    disagged.columns = disagged.columns.get_level_values(1)

    # get labels of items that are not prefixed and present in this
    # dataset
    # NOTE(review): label.isdigit() assumes the syntax labels are strings, and
    # `label in data.columns` only matches if the data's column labels have
    # the same type -- confirm against how the CSV is read.
    unprefixed_item_labels = [
        label for label in entry_syntax['item_label']
        if label.isdigit()
        and label in data.columns
        and label not in labels_of_prefixed_items
    ]

    # select only the unprefixed item labels
    data = data[unprefixed_item_labels]
    # concatenate the unprefixed and prefixed items
    data = pd.concat([data, disagged], axis='columns')

# Replace any empty strings with null values
data = data.mask(data == '', pd.NA)

# Regular expressions to match bare and escaped space placeholders.
# regex_space_char is the placeholder escaped for use inside a pattern; the
# first pattern matches a placeholder NOT preceded by a backslash, the second
# matches a backslash followed by the placeholder.
regex_space_char = shnd.util.escape_regex_metachars(space_char)
space_plchldr_regex = r"(?<!\\)({})".format(regex_space_char)
escaped_space_plchldr_regex = fr"(\\{regex_space_char})"

# Replace space placeholders with spaces in the data items
data = data.replace(
    to_replace=space_plchldr_regex,
    value=' ',
    regex=True
)
# Replace escaped space placeholders with bare placeholders.
# The replacement must be the literal placeholder (space_char), not the
# regex-escaped form: in a regex replacement string an unknown escape such as
# "\|" is left untouched, so substituting regex_space_char would write the
# backslash straight back and leave the escaped placeholder unchanged.
data = data.replace(
    to_replace=escaped_space_plchldr_regex,
    value=space_char,
    regex=True
)

# Stack data. Stacking creates a series whose values are the string
# values of every item in the input and whose index levels are
#       input index, item label
data = data.stack()

# create a map from item labels to node types and link types
item_types = pd.DataFrame(
    {
        'node_type': entry_syntax['item_node_type'].array,
        'link_type': entry_syntax['item_link_type'].array
    },
    index=entry_syntax['item_label'].array
)
# Look up the types for each stacked item by its item label (index level 1),
# giving one row of types per stacked item
item_types = item_types.loc[data.index.get_level_values(1)]

# NOTE(review): at this point item_types is indexed by item label while data
# carries the two-level (input index, item label) index; confirm that this
# column-wise concat aligns rows as intended.
data = pd.concat([data.rename('string'), item_types], axis=1)
# data = _set_StringDtype(data)

# Turn the index levels into ordinary columns and give them meaningful names
data = data.reset_index()
data = data.rename(
    columns={
        'level_0': 'csv_row',
        'level_1': 'item_label'
    }
)

# Concatenate expanded items with the entries, then sort by index so rows
# from both frames that share an index value become adjacent
data = pd.concat([data, entries]).sort_index().fillna(pd.NA)

# For any strings that represent null values, overwrite the node
# type inferred from the syntax with the null node type
null_strings = data['string'].isin(na_string_values)
data.loc[null_strings, 'node_type'] = na_node_type

# Only csv_row is cast for now; the string columns stay 'object' (see below)
dtypes = {
    'csv_row': big_id_dtype,
    # 'item_label': pd.StringDtype(),
    # 'string': pd.StringDtype(),
    # 'node_type': pd.StringDtype(),
    # 'link_type': pd.StringDtype()
}
# can't use pd.StringDtype() throughout because it currently
# doesn't allow construction with null types other than pd.NA.
# This will likely change soon
# https://github.com/pandas-dev/pandas/pull/41412

data = data.astype(dtypes)
# Give the row index the same nullable integer dtype as csv_row
data.index = data.index.astype(big_id_dtype)

'''
data is currently a DataFrame with these columns:
    ['csv_row', 'item_label', 'string', 'node_type', 'link_type']
csv_row is integer-valued, others are 'object'
'''

# Map string-valued item labels to integer IDs
# NOTE(review): _create_id_map is a private helper of shnd.Shorthand whose
# definition is not visible here; the code below uses its result as a mapping
# from label to ID (passed to Series.map) -- confirm its return type.
item_label_id_map = shnd.Shorthand._create_id_map(
    data['item_label'],
    dtype=small_id_dtype
)
# Replace item labels in the mutable data with integer IDs
data['item_label'] = data['item_label'].map(
    item_label_id_map
)
data = data.rename(
    columns={'item_label': 'item_label_id'}
)

# These link types are required to complete linking operations
# later
link_types = pd.Series(['entry', 'tagged', 'requires'])

# Map string-valued link types to integer IDs. The required link types are
# placed ahead of the types found in the data.
link_types = shnd.Shorthand._create_id_map(
    pd.concat([link_types, data['link_type']]),
    dtype=small_id_dtype
)
# Replace link types in the mutable data with integer IDs
data['link_type'] = data['link_type'].map(
    link_types
)
data = data.rename(
    columns={'link_type': 'link_type_id'}
)
# Mutate link_types into a series whose index is integer IDs and
# whose values are string-valued link types
link_types = pd.Series(link_types.index, index=link_types)

'''
NEXT
    expand items that have list delimiters in the syntax
    (line 969 in Shorthand.py)

    get the list positions (lines 975-1006 in Shorthand.py)

    create node types and map them in data
    (lines 1036-1047 + line 1060 in Shorthand.py)

    make the strings table, convert 'string' to 'string_id'
    drop node_type_id from data

    make links (lines 1070-1130 + line 1354 in Shorthand.py)

    decide what to do with tags

    make an item_label_id_map

    return ParsedShorthand
'''


In [6]:
import pandas as pd

# First frame: two-level index whose second level holds real labels
mi = pd.MultiIndex.from_arrays([[1, 2], ['a', 'b']])
df = pd.DataFrame([[10, 100], [20, 200]], index=mi)

# Second frame: same first level, second level entirely missing
mi = pd.MultiIndex.from_arrays([[1, 2], [pd.NA] * 2])
df2 = pd.DataFrame([[30, 300], [40, 400]], index=mi)

# Stack the two frames row-wise to see how the missing level is carried over
pd.concat([df, df2])

Unnamed: 0,Unnamed: 1,0,1
1,a,10,100
2,b,20,200
1,,30,300
2,,40,400
