In [3]:
%%capture
%run validate_partner_manifest_dev.ipynb

In [None]:
def validate_bioscan(fn, template_fn, verbose=False, version='1.0'):
    '''
    Run the manifest checks on fn, one column after another.

    Checks are applied in the same order as the columns of the data
    entry sheet: the "orange" columns first, then the "purple" ones.

    fn          - manifest file; read with get_data, also passed to check_location
    template_fn - template file; read with get_data and get_valid_dict
    verbose     - forwarded to setup_logging
    version     - only used in the start / end log messages

    Returns the data frame left after the checking steps
    (validate_series, validate_plates_wells, check_blanks and
    validate_ncbi_taxonomy each hand back a frame that replaces the
    previous one).
    '''

    setup_logging(verbose=verbose)

    logging.info('# started validate_partner_manifest_v.{}'.format(version))
    logging.warning('# manifest {}'.format(fn))

    # load the filled manifest
    manifest = get_data(fn)

    # load the NCBI taxonomy handle
    taxonomy = ete3.NCBITaxa()

    # template-driven set-up: column comparison and allowed values
    check_columns(manifest, get_data(template_fn))
    allowed = get_valid_dict(template_fn)

    # ---- orange columns ----
    # series check comes first, then plate / well identifiers
    manifest = validate_series(manifest)
    manifest = validate_plates_wells(manifest, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')

    # blanks are checked, and the returned frame is used from here on
    manifest = check_blanks(manifest)

    # controlled-vocabulary columns, in sheet order
    orange_vocab_columns = (
        ('PRESERVATIVE_SOLUTION', {}),
        # CATCH_LOT not checked TODO do not allow missing
        ('BOTTLE_DIRECTION', {}),
        ('ORGANISM_PART', {'sep': '|'}),
        ('HAZARD_GROUP', {}),
        ('REGULATORY_COMPLIANCE', {}),
    )
    for column, options in orange_vocab_columns:
        validate_values(column, manifest, allowed, **options)

    collected_on = validate_date('DATE_OF_COLLECTION', manifest, na_values=['NOT_COLLECTED'])
    check_location(manifest, fn)

    # ---- purple columns ----
    # the taxonomy step returns a frame with a few extra columns
    manifest = validate_ncbi_taxonomy(manifest, taxonomy, na_values=['NOT_COLLECTED'])
    validate_values('SEX', manifest, allowed)
    # HABITAT not checked
    validate_time('TIME_OF_COLLECTION', manifest)
    validate_time_period('DURATION_OF_COLLECTION', manifest, na_values=['NOT_COLLECTED'])
    validate_values('COLLECTION_METHOD', manifest, allowed)
    # DESCRIPTION_OF_COLLECTION_METHOD not checked
    validate_int('TIME_ELAPSED_FROM_COLLECTION_TO_PLATING', manifest, na_values=[''])
    # PHOTOGRAPH_* columns not checked
    # VOUCHER_ID not checked
    # PRESERVATION_APPROACH not checked - should match DATE_OF_PRESERVATION
    # empty values are accepted here, unlike DATE_OF_COLLECTION
    preserved_on = validate_date('DATE_OF_PRESERVATION', manifest, na_values=[''])
    compare_dates(before=collected_on, after=preserved_on)
    # COLLECTOR_SAMPLE_ID not checked
    validate_int('ELEVATION', manifest, na_values=[''])
    # OTHER_INFORMATION, MISC_METADATA, IDENTIFIED_BY, IDENTIFIER_AFFILIATION,
    # IDENTIFIED_HOW not checked

    logging.info(f'# ended validate_partner_manifest_v.{version}')

    return manifest

# Development run against a single manifest; the `fn` assignment below is commented out.
# fn = '../../results/partner_manifests/IRD-Neandersquito_T222Amplicon_Manifest_V2.0.xlsx'
# NOTE(review): this calls `validate`, not the `validate_bioscan` defined in this cell,
# and neither `fn` nor `template_fn` is assigned in the cells shown here - presumably
# they come from the notebook loaded with %run; confirm before running on a fresh kernel.
df = validate(fn, template_fn, verbose=True)

In [2]:
# Validate the NBGW manifest labelled 20210527 against the template (verbose=False).
# Recorded run printed no messages.
df = validate('../results/20220405/NBGW-[20210527]-manifest.xlsx', template_fn, verbose=False)



In [3]:
# Validate the NBGW manifest labelled 20210805 against the template (verbose=False).
# Recorded run: "NOT_APPLICABLE" in FAMILY and GENUS is reported as not found in NCBI Taxonomy.
df = validate('../results/20220405/NBGW-[20210805]-manifest.xlsx', template_fn, verbose=False)

[ERROR] FAMILY: unexpected case for "NOT_APPLICABLE", changing to "Not_applicable" for validation
[ERROR] FAMILY: {'Not_applicable'} not found in NCBI Taxonomy
[ERROR] GENUS: unexpected case for "NOT_APPLICABLE", changing to "Not_applicable" for validation
[ERROR] GENUS: {'Not_applicable'} not found in NCBI Taxonomy


In [4]:
# Validate the NBGW manifest labelled 20210903 against the template (verbose=False).
# Recorded run: ORDER value 'Opilones' is reported as not found in NCBI Taxonomy.
df = validate('../results/20220405/NBGW-[20210903]-manifest.xlsx', template_fn, verbose=False)

[ERROR] ORDER: {'Opilones'} not found in NCBI Taxonomy


In [5]:
# Validate the NBGW manifest labelled 20210930 against the template (verbose=False).
# Recorded run printed no messages.
df = validate('../results/20220405/NBGW-[20210930]-manifest.xlsx', template_fn, verbose=False)



In [6]:
# Validate the NBGW manifest labelled 20211026 against the template (verbose=False).
# Recorded run: TIME_ELAPSED_FROM_COLLECTION_TO_PLATING contains the non-integer value "0.5".
df = validate('../results/20220405/NBGW-[20211026]-manifest.xlsx', template_fn, verbose=False)

[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: "0.5"


In [7]:
# Validate the NBGW manifest labelled 20211130 against the template (verbose=False).
# Recorded run printed no messages.
df = validate('../results/20220405/NBGW-[20211130]-manifest.xlsx', template_fn, verbose=False)



In [8]:
# Validate the NBGW manifest labelled 20220105 against the template (verbose=False).
# Recorded run printed no messages.
df = validate('../results/20220405/NBGW-[20220105]-manifest.xlsx', template_fn, verbose=False)



In [9]:
# Validate the NBGW manifest labelled 20220201 against the template (verbose=False).
# Recorded run printed no messages.
df = validate('../results/20220405/NBGW-[20220201]-manifest.xlsx', template_fn, verbose=False)



In [11]:
# Validate the NHM BIOSCAN manifest labelled 05042022 against the template (verbose=False).
# Recorded run: missing MISC_METADATA column, blank ORGANISM_PART mismatch, "NOT_APPLICABLE"
# in ORDER, and rejected DURATION_OF_COLLECTION / DATE_OF_PRESERVATION values.
df = validate('../results/20220405/NHM-BIOSCAN-Manifest_05042022.xlsx', template_fn, verbose=False)

[ERROR] template columns missing from filled manifest: {'MISC_METADATA'}
[ERROR] for blanks, ORGANISM_PART expected to be BLANK_SAMPLE, found {'NOT_APPLICABLE'}
[ERROR] ORDER: unexpected case for "NOT_APPLICABLE", changing to "Not_applicable" for validation
[ERROR] ORDER: {'Not_applicable'} not found in NCBI Taxonomy
[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['P3D' 'P6D' 'P1D' 'P7D']
[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['NOT_APPLICABLE']
