In [1]:
%%capture
%run validate_partner_manifest_dev.ipynb

In [2]:
def validate_bioscan(fn, template_fn='../data/BIOSCAN_Manifest_V2.0_20221017.xlsx',
                     verbose=False, version='2.0', samples_sheet='TAB 2 Metadata Entry'):
    '''
    Validate a BIOSCAN partner manifest against the manifest template.

    Checks are run in the same order as the columns of the data entry sheet.
    Problems are reported through ``logging`` by the individual validators.

    Arguments:
        fn: path of the manifest workbook. Samples are read from it via
            ``get_data``; it is also handed to
            ``validate_country_and_coordinates`` and ``validate_contributors``.
        template_fn: path of the template workbook. Its
            'TAB 2 Metadata Entry' sheet supplies the reference columns and
            its 'TAB 4 DO NOT EDIT - Data Valida' sheet the allowed values.
        verbose: forwarded to ``setup_logging``.
        version: manifest version, used only in the start/end log messages.
        samples_sheet: name of the sheet in ``fn`` holding the sample rows.

    Returns:
        The samples dataframe as returned by ``validate_taxonomy`` (which adds
        taxid columns to it).
    '''

    setup_logging(verbose=verbose)

    logging.info(f'# started validate_bioscan_partner_manifest_v.{version}')
    logging.warning(f'# manifest {fn}')

    # load the sample rows and strip trailing whitespace
    manifest = remove_trailing_spaces(get_data(fn, sheet=samples_sheet))

    # NCBI taxonomy handle used later by the taxonomy validation
    ncbi_taxa = ete3.NCBITaxa()

    # compare manifest columns with the template, then load allowed values
    check_columns(manifest, get_data(template_fn, sheet='TAB 2 Metadata Entry'))
    allowed = get_valid_dict(template_fn, validation_sheet='TAB 4 DO NOT EDIT - Data Valida')

    # ---- orange columns ----
    # rows with an empty SERIES are excluded
    manifest = validate_series(manifest)
    # TODO: CATCH_LOT is not checked - missing values should not be allowed
    manifest = validate_plates_wells(manifest, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True)
    # flag blank samples - TODO: yield all samples for STS
    blank_mask = check_blanks(manifest)
    validate_values('ORGANISM_PART', manifest, allowed, sep='|')
    validate_values('PRESERVATIVE_SOLUTION', manifest, allowed)
    # from here on, orange columns are validated for non-blank samples only
    validate_values('BOTTLE_DIRECTION', manifest[~blank_mask], allowed)
    collected_on = validate_date(
        'DATE_OF_COLLECTION', manifest[~blank_mask], na_values=['NOT_COLLECTED','']
    )
    # covers COUNTRY_OF_COLLECTION, DECIMAL_LATITUDE, DECIMAL_LONGITUDE
    validate_country_and_coordinates(manifest[~blank_mask], fn, na_values=[''], bioscan=True)
    # COLLECTION_LOCATION is not checked

    # ---- purple columns: validated for non-blank samples ----
    # WHAT_3_WORDS is not checked
    validate_time('TIME_OF_COLLECTION', manifest[~blank_mask])
    validate_time_period(
        'DURATION_OF_COLLECTION', manifest[~blank_mask], na_values=['NOT_COLLECTED']
    )
    validate_values('COLLECTION_METHOD', manifest[~blank_mask], allowed)
    plated_on = validate_date(
        'DATE_OF_PLATING', manifest[~blank_mask], na_values=['NOT_COLLECTED','']
    )
    compare_dates(before=collected_on, after=plated_on)
    # taxonomy validation runs on the full dataframe and adds taxid columns to it
    manifest = validate_taxonomy(manifest, ncbi_taxa, anospp=False, na_values=[''])
    # controlled-vocabulary columns that may be left empty
    # TODO: check that SPECIMEN_IDENTITY_RISK is only filled where a species ID exists
    for column in ('SPECIMEN_IDENTITY_RISK', 'LIFESTAGE', 'SEX'):
        validate_values(column, manifest[~blank_mask], allowed, na_values=[''])
    # controlled-vocabulary columns validated with the default missing-value handling
    for column in ('SORTING_SOLUTION_USED',
                   'CATCH_BOTTLE_TEMPERATURE_STORAGE',
                   'PLATE_TEMPERATURE_STORAGE'):
        validate_values(column, manifest[~blank_mask], allowed)

    # ---- white columns: validated for all samples ----
    # Not checked: MORPHOSPECIES_DESCRIPTION, DESCRIPTION_OF_COLLECTION_METHOD,
    #   HABITAT, VOUCHER_ID
    # Not checked: PRESERVATION_APPROACH - should match DATE_OF_PRESERVATION
    # Not checked: COLLECTOR_SAMPLE_ID - STS will need something here
    validate_float('ELEVATION', manifest, na_values=[''])
    # Not checked: OTHER_INFORMATION, MISC_METADATA, IDENTIFIED_BY,
    #   IDENTIFIER_AFFILIATION

    validate_contributors(fn)

    logging.info('# ended validate_bioscan_partner_manifest_v.{}'.format(version))

    return manifest

# Run the validator on the template workbook itself: the samples are read from
# its 'TAB 5 DO NOT EDIT - TEST Met' sheet and checked against the same file
# used as the template. verbose=True is forwarded to the logging setup.
# Alternative input kept for reference:
# fn = '../../results/partner_manifests/IRD-Neandersquito_T222Amplicon_Manifest_V2.0.xlsx'
df = validate_bioscan(
    fn='../data/BIOSCAN_Manifest_V2.0_20221017.xlsx',
    template_fn='../data/BIOSCAN_Manifest_V2.0_20221017.xlsx',
    samples_sheet='TAB 5 DO NOT EDIT - TEST Met',
    verbose=True,
)

[INFO] # started validate_bioscan_partner_manifest_v.2.0
[INFO] reading data from '../data/BIOSCAN_Manifest_V2.0_20221017.xlsx'
[INFO] reading data from '../data/BIOSCAN_Manifest_V2.0_20221017.xlsx'
[INFO] checking manifest columns against template
[INFO] extracting value validation data from '../data/BIOSCAN_Manifest_V2.0_20221017.xlsx'
[INFO] validating SERIES
[INFO] validating RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[INFO] found 288 samples across 3 plates
[INFO] Checking and excluding blank samples
[INFO] validating values in column 'ORGANISM_PART'
[INFO] excluding 0 [] samples without data in 'ORGANISM_PART'
[INFO] validating values in column 'PRESERVATIVE_SOLUTION'
[INFO] excluding 0 [] samples without data in 'PRESERVATIVE_SOLUTION'
[INFO] validating values in column 'BOTTLE_DIRECTION'
[INFO] excluding 0 [] samples without data in 'BOTTLE_DIRECTION'
[INFO] validating date column 'DATE_OF_COLLECTION'
[INFO] excluding 0 ['NOT_COLLECTED', ''] samples without data in 'DATE_OF_COLLECTIO