In [None]:
%run validate_partner_manifest_dev.ipynb

In [None]:
def validate_bioscan(fn, template_fn='../data/BIOSCAN_Manifest_V3_20240301.xlsx', 
                     samples_sheet='TAB 2 Metadata Entry', contrib_sheet='TAB 1 Contributors', 
                     verbose=False, write_sts=True, bold_input=False):
    '''
    BIOSCAN partner manifest validation.

    Checks are run in the order of the columns in the data entry sheet.
    Problems are reported through `logging`; the helper functions called
    here are expected to be in scope already (loaded from the partner
    manifest validation notebook).

    Parameters
    ----------
    fn : str
        Path to the partner manifest to validate.
    template_fn : str
        Path to the BIOSCAN template workbook. It supplies the expected
        columns and the validation dictionary, and, for BOLD input, the
        contributors sheet.
    samples_sheet : str
        Sheet of `fn` holding sample metadata (not used when `bold_input`
        is true - the 'BGE entry' sheet is parsed instead).
    contrib_sheet : str
        Name of the contributors sheet.
    verbose : bool
        Passed to `setup_logging`.
    write_sts : bool
        If true, `write_sts_manifest` is called at the end of validation.
    bold_input : bool
        If true, `fn` is parsed with `parse_bold`, and CONTRIBUTORS
        is overwritten with IDENTIFIED_BY in the output.

    Returns
    -------
    The validated samples dataframe. If the manifest has no non-blank
    samples, the dataframe is returned early, before the remaining checks
    and before the STS columns are added.
    '''

    setup_logging(verbose=verbose)

    logging.info(f'# partner manifest validation v.{VALIDATION_VERSION}')
    logging.info(f'# validating against BIOSCAN manifest v.{BIOSCAN_VERSION}')
    logging.info(f'# manifest "{fn}"')

    # read data
    template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')
    if bold_input:
        df = parse_bold(fn, template_df, sheet='BGE entry')
    else:
        df = get_data(fn, sheet=samples_sheet)
    df = fix_date_formats(df)
    # check manifest version
    v = infer_bioscan_version(df)
    # check series, exclude non-numeric
    df = validate_series(df)
    # clean up data
    df = remove_trailing_spaces(df, title='sample')
    
    # read NCBI taxonomy
    ncbi = ete3.NCBITaxa()
    
    # prepare for validation
    check_columns(df, template_df, bioscan_version=v)
    valid_dict = get_valid_dict(template_fn, validation_sheet='TAB 4 DO NOT EDIT - Data Valida')
    # update validation dictionary from v3 to v2
    if v == 'v2':
        valid_dict['SORTING_SOLUTION_USED'] += ['N']
    # BOLD manifests take the contributors sheet from the template workbook
    if bold_input:
        contrib_df = validate_contributors(template_fn, contrib_sheet=contrib_sheet)
    else:
        contrib_df = validate_contributors(fn, contrib_sheet=contrib_sheet)

    # orange cols
    validate_regex('CATCH_LOT', df, na_values=[])
    
    df, gal, partner_code = validate_plates_wells(
        df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True, bioscan_version=v)
    # check and exclude blanks
    df, is_blank = check_blanks(df, bioscan=True)
    if df[~is_blank].shape[0] == 0:
        logging.error('no non-blank samples to validate, terminating')
        return df
    # hope to get WING into STS ORGANISM_PART values later, fixing on our side for now
    if df['ORGANISM_PART'].str.contains('WING').any():
        logging.warning('replacing ORGANISM_PART "WING" entries with “**OTHER_SOMATIC_ANIMAL_TISSUE**”')
        df['ORGANISM_PART'] = df['ORGANISM_PART'] \
                .str.replace('WING', '**OTHER_SOMATIC_ANIMAL_TISSUE**', regex=False)
    validate_values('ORGANISM_PART', df, valid_dict, sep='|')
#     df = strip_asterisks('ORGANISM_PART', df)
    validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)
#     df = strip_asterisks('PRESERVATIVE_SOLUTION', df)
    # columns below validated for non-blank samples only
    if v == 'v3':
        validate_values('CATCH_SOLUTION', df[~is_blank], valid_dict)
        df = strip_asterisks('CATCH_SOLUTION', df)
    # non-blank samples split by collection method: malaise trap vs everything else
    is_malaise = (~is_blank & (df['COLLECTION_METHOD'] == 'MALAISE_TRAP'))
    is_other = (~is_blank & (df['COLLECTION_METHOD'] != 'MALAISE_TRAP'))
    validate_values('BOTTLE_DIRECTION', df[is_malaise], valid_dict, 
                    extra_msg=' for malaise trap samples')
    # allow only empty for non-malaise trap - weird capture of bottle direction filled and collection method empty
    validate_values('BOTTLE_DIRECTION', df[is_other], {'BOTTLE_DIRECTION':['']}, 
                    extra_msg=' for non-malaise trap samples')
    # no missing dates expected for malaise
    validate_regex('DATE_OF_COLLECTION', df[is_malaise], 
                   extra_msg=' for malaise trap samples')
    validate_regex('DATE_OF_COLLECTION', df[is_other], na_values=['NOT_COLLECTED'],
                   extra_msg=' for non-malaise trap samples')
    check_catch_lot_dates(df[~is_blank])
    validate_regex('DECIMAL_LATITUDE', df[~is_blank], na_values=[])
    validate_regex('DECIMAL_LONGITUDE', df[~is_blank], na_values=[])
    # COUNTRY_OF_COLLECTION, DECIMAL_LATITUDE, DECIMAL_LONGITUDE
    validate_country_and_coordinates(df[~is_blank], fn, na_values=[''], bioscan=True)
    # COLLECTION_LOCATION not checked
    
    # purple cols - validated for non-blank samples
    validate_regex('WHAT_3_WORDS', df[~is_blank], na_values=[''])
    # no missing times expected for malaise
    validate_regex('TIME_OF_COLLECTION', df[is_malaise],
                   extra_msg=' for malaise trap samples') 
    validate_regex('TIME_OF_COLLECTION', df[is_other], na_values=['NOT_COLLECTED',''],
                   extra_msg=' for non-malaise trap samples') 
    # no missing durations for malaise
    validate_regex('DURATION_OF_COLLECTION', df[is_malaise],
                   extra_msg=' for malaise trap samples')
    validate_regex('DURATION_OF_COLLECTION', df[is_other], na_values=['NOT_COLLECTED',''],
                   extra_msg=' for non-malaise trap samples')
    validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict, na_values=[''])
    df = strip_asterisks('COLLECTION_METHOD', df)
    # no missing plating dates for malaise
    validate_regex('DATE_OF_PLATING', df[is_malaise],
                   extra_msg=' for malaise trap samples')
    # relaxed check applies to non-malaise samples only, matching the message
    # and the other malaise / non-malaise pairs above (previously ran on all
    # non-blank samples, re-reporting malaise failures under the wrong label)
    validate_regex('DATE_OF_PLATING', df[is_other], na_values=['NOT_COLLECTED',''],
                   extra_msg=' for non-malaise trap samples')
    compare_dates_text('DATE_OF_COLLECTION', 'DATE_OF_PLATING', df[~is_blank])
    # taxonomy validation returns the dataframe (per the original note, with taxid columns added)
    df = validate_taxonomy(df, ncbi, anospp=False, na_values = [''])
    validate_values('SPECIMEN_IDENTITY_RISK', df[~is_blank], valid_dict, na_values=[''])
    validate_specimen_id_risk(df)
    validate_values('LIFESTAGE', df[~is_blank], valid_dict, na_values=[''])
    validate_values('SEX', df[~is_blank], valid_dict, na_values=[''])
    validate_values('SORTING_SOLUTION_USED', df[~is_blank], valid_dict, na_values=[''])
    validate_values('CATCH_BOTTLE_TEMPERATURE_STORAGE', df[~is_blank], valid_dict, na_values=[''])
    validate_values('PLATE_TEMPERATURE_STORAGE', df[~is_blank], valid_dict, na_values=[''])
    if v == 'v3':
        validate_values('AMOUNT_OF_CATCH_PLATED', df[~is_blank], valid_dict)
    # white cols - validated for all samples
    validate_freetext('MORPHOSPECIES_DESCRIPTION', df)
    validate_freetext('DESCRIPTION_OF_COLLECTION_METHOD', df)
    validate_freetext('HABITAT', df)
    validate_freetext('PRESERVATION_APPROACH', df)
    # TODO check if STS will need something here
    validate_freetext('COLLECTOR_SAMPLE_ID', df)
    validate_freetext('VOUCHER_ID', df)
    validate_regex('ELEVATION', df, na_values=[''])
    validate_freetext('OTHER_INFORMATION', df)
    # validate_freetext('MISC_METADATA', df)
    validate_identifier('IDENTIFIED_BY', df, contrib_df, na_values=[''])
    
    df = expand_plate_only(df)
    df = add_sts_cols(df, contrib_df, gal, bioscan=True, v=v)
    if bold_input:
        logging.info('Replacing CONTRIBUTORS with IDENTIFIED_BY for BOLD manifest')
        df['CONTRIBUTORS'] = df['IDENTIFIED_BY']
    
    
    # keep filename operations together
    validate_input_filename(fn, partner_code, v)
    if write_sts:
        write_sts_manifest(df, fn, VALIDATION_VERSION)
        
    logging.info('# ended validation of bioscan partner manifest')
    
    # list the plate IDs present in the output, one per line
    print('\n'.join(df.RACK_OR_PLATE_ID.unique()))

    return df

In [None]:
# self-test: run the validator on the template workbook itself,
# reading samples from its 'TAB 6 TEST' sheet
df = validate_bioscan(
    fn='../data/BIOSCAN_Manifest_V3_20240301.xlsx',
    template_fn='../data/BIOSCAN_Manifest_V3_20240301.xlsx',
    samples_sheet='TAB 6 TEST',
    write_sts=True,
    verbose=False,
)

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts once the
# setup cells above have finished; the validation cells below are then run by hand.
raise Exception('Setup complete')

## Validation

In [None]:
# validate the BGKU 2024 partner manifest
fn = '../results/20241120_bgku_2024_v3/BGKU_2024_BIOSCAN_Manifest_V2.0_patched(1).xlsx'
df = validate_bioscan(
    fn,
    contrib_sheet='TAB 1 Contributors',
    samples_sheet='TAB 2 Metadata Entry',
    verbose=False,
)

In [None]:
%run validate_partner_manifest_dev.ipynb
# validate the BGEG manifest supplied in BOLD format
fn = '../results/20241104_bgeg/TOL-BGEG-082-083_single_specimen_metadata&biobanking.xlsx'
df = validate_bioscan(
    fn,
    contrib_sheet='TAB 1 Contributors',
    samples_sheet='TAB 2 Metadata Entry',
    bold_input=True,
    verbose=False,
)