In [9]:
# Execute the development notebook in this kernel; the log output below is produced by it.
# NOTE(review): presumably this also brings the helpers used by later cells
# (setup_logging, get_data, validate_* ...) into the namespace — confirm.
%run validate_partner_manifest_dev.ipynb

[INFO] test
[INFO] reading data from 'data/NE BIOSCAN_Manifest_V1.0_Yarner_2021.xlsx'
[INFO] reading data from '../data/BIOSCAN_Manifest_V1.0_20211207.xlsx'
[INFO] checking manifest columns against template
[INFO] extracting value validation data from '../data/BIOSCAN_Manifest_V1.0_20211207.xlsx'
[INFO] excluding 290 ['NOT_COLLECTED', ''] samples without data in 'TIME_OF_COLLECTION'
[INFO] validating SERIES
[ERROR] Found and excluded non-numeric SERIES: ['example-small', 'example-large', 'example-handcaught']
[INFO] validating RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[ERROR] Found and excluded 240 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[INFO] found 1056 samples across 11 plates
[INFO] Checking and excluding blank samples
[ERROR] last well H12 is not blank at SERIES [96, 192, 288, 384, 480, 576, 672, 768, 864, 960, 1056]: in ORGANISM_PART, expected "NOT_APPLICABLE", found ['BLANK_SAMPLE', 'BLANK_SAMPLE', 'BLANK_SAMPLE', 'BLANK_SAMPLE', 'BLANK_SAMPLE', 'BLANK_SAMPLE', 'BLAN

(1056, 38)
(1055, 38)


[INFO] validating int format in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING
[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: "NOT_APPLICABLE"
[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: ""
[INFO] validating date column 'DATE_OF_COLLECTION'
[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_COLLECTED' 'NOT_APPLICABLE' '']
[INFO] validating date column 'DATE_OF_PRESERVATION'
[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['' 'NOT_APPLICABLE']


Exception: Done for now

Exception: Done for now

In [10]:
def validate_anospp(fn, template_fn, verbose=False, version='4.0', samples_sheet='TAB 2: Metadata Entry',
                    template_sheet='TAB 2 Metadata Entry',
                    validation_sheet='TAB 4Data Validation - do not e'):
    '''
    Validate an ANOSPP partner manifest against its template.

    Checks are run in the order the columns appear in the data entry sheet.
    Problems are reported through the logging module by the individual
    validators; this function itself does not raise on invalid data.

    Parameters
    ----------
    fn : path of the manifest workbook to validate.
    template_fn : path of the template workbook; used both for the expected
        columns and for the lists of allowed values.
    verbose : passed to setup_logging.
    version : manifest version, only used in the start/end log messages.
    samples_sheet : sheet of `fn` holding the sample rows.
        NOTE(review): this default contains a colon while the template sheet
        name does not - confirm which spelling matches real workbooks.
    template_sheet : sheet of `template_fn` holding the reference columns.
    validation_sheet : sheet of `template_fn` holding the allowed values.

    Returns
    -------
    The manifest data as returned by check_blanks, i.e. after the series,
    plate/well and blank checks have each replaced the frame.
    '''

    setup_logging(verbose=verbose)

    logging.info(f'# started validate_anospp_partner_manifest_v.{version}')
    # Logged at WARNING level - presumably so the manifest name is shown even
    # when verbose is off; confirm against setup_logging.
    logging.warning(f'# manifest {fn}')

    # read data
    df = get_data(fn, sheet=samples_sheet)
    df = remove_trailing_spaces(df)

    # read taxonomy
    ncbi = ete3.NCBITaxa()

    # prepare for validation
    template_df = get_data(template_fn, sheet=template_sheet)
    check_columns(df, template_df)
    valid_dict = get_valid_dict(template_fn, validation_sheet=validation_sheet)

    # orange cols
    # exclude empty series
    df = validate_series(df)
    df = validate_plates_wells(df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')

    # check and exclude blanks
    df = check_blanks(df)

    validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)
    # CATCH_LOT not checked. TODO: do not allow missing values
    validate_values('BOTTLE_DIRECTION', df, valid_dict)
    validate_values('ORGANISM_PART', df, valid_dict, sep='|')
    validate_values('HAZARD_GROUP', df, valid_dict)
    validate_values('REGULATORY_COMPLIANCE', df, valid_dict)
    date_coll = validate_date('DATE_OF_COLLECTION', df, na_values=['NOT_COLLECTED', ''])
    check_location(df, fn)

    # purple cols
    # taxonomy validation - currently taxIDs are not used
    validate_ncbi_taxonomy_anospp(df, ncbi, na_values=[''])
    validate_values('SEX', df, valid_dict)
    # HABITAT not checked
    validate_time('TIME_OF_COLLECTION', df, na_values=['NOT_COLLECTED', ''])
    validate_time_period('DURATION_OF_COLLECTION', df, na_values=['NOT_COLLECTED', ''])
    validate_values('COLLECTION_METHOD', df, valid_dict)
    # DESCRIPTION_OF_COLLECTION_METHOD not checked
    validate_int('TIME_ELAPSED_FROM_COLLECTION_TO_PLATING', df, na_values=[''])
    # PHOTOGRAPH_* columns not checked
    # VOUCHER_ID not checked
    # PRESERVATION_APPROACH not checked - should match DATE_OF_PRESERVATION
    # allow for empty values unlike DATE_OF_COLLECTION
    date_pres = validate_date('DATE_OF_PRESERVATION', df, na_values=[''])
    compare_dates(before=date_coll, after=date_pres)
    # COLLECTOR_SAMPLE_ID not checked
    validate_int('ELEVATION', df, na_values=[''])
    # OTHER_INFORMATION, MISC_METADATA, IDENTIFIED_BY, IDENTIFIER_AFFILIATION,
    # IDENTIFIED_HOW not checked

    # Use the same name as the start message so start/end log lines pair up.
    logging.info(f'# ended validate_anospp_partner_manifest_v.{version}')

    return df

# Run the validation on the example sheet of the template workbook; the same
# file is used as both the manifest and the template.
template_fn = '../data/Anopheles_Metadata_Manifest_V4.0.xlsx'
fn = '../data/Anopheles_Metadata_Manifest_V4.0.xlsx'
df = validate_anospp(
    fn,
    template_fn,
    verbose=True,
    samples_sheet='TAB 5 EXAMPLE Metadata Entry',
)

[INFO] # started validate_anospp_partner_manifest_v.4.0
[INFO] reading data from '../data/Anopheles_Metadata_Manifest_V4.0.xlsx'
[INFO] reading data from '../data/Anopheles_Metadata_Manifest_V4.0.xlsx'
[INFO] checking manifest columns against template
[INFO] extracting value validation data from '../data/Anopheles_Metadata_Manifest_V4.0.xlsx'
[INFO] validating SERIES
[ERROR] Found and excluded non-numeric SERIES: ['example']
[INFO] validating RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[ERROR] Found and excluded 4512 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[INFO] found 288 samples across 3 plates
[INFO] Checking and excluding blank samples
[INFO] Blanks removed: 285 samples of 288 left for downstream analysis
[INFO] validating values in column 'PRESERVATIVE_SOLUTION'
[INFO] validating values in column 'BOTTLE_DIRECTION'
[ERROR] 'BOTTLE_DIRECTION' column not found in manifest
[INFO] validating values in column 'ORGANISM_PART'
[INFO] validating values in column 'HAZARD_GROUP'
[