## Setup

In [None]:
%run validate_partner_manifest_dev.ipynb

In [None]:
def validate_anospp(fn, template_fn='../data/Anopheles_Metadata_Manifest_V4.0_20240813.xlsx',
                    verbose=False, samples_sheet='TAB 2 Metadata Entry',
                    contrib_sheet='TAB 1 Contributors', write_sts=True):
    '''
    Validate an ANOSPP partner manifest against the manifest template.

    Columns are checked in the order in which they appear in the data
    entry sheet. Every individual check is delegated to a helper that is
    defined outside this cell.

    Parameters:
        fn: manifest file to validate
        template_fn: template manifest; its 'TAB 2 Metadata Entry' sheet
            supplies the expected columns and its data validation sheet
            supplies the allowed values
        verbose: forwarded to setup_logging
        samples_sheet: sheet of `fn` holding the sample metadata
        contrib_sheet: sheet of `fn` holding the contributors
        write_sts: when true, write_sts_manifest is called on the result

    Returns:
        The sample table after validation with the STS columns added.
        If the manifest contains no non-blank samples, an error is logged
        and the table is returned as it stands after the blank check
        (no STS columns, nothing written).
    '''

    setup_logging(verbose=verbose)

    logging.info(f'# partner manifest validation v.{VALIDATION_VERSION}')
    logging.info(f'# validating against ANOSPP manifest v.{ANOSPP_VERSION}')
    logging.info(f'# manifest "{fn}"')

    # load the sample sheet and normalise it before any column checks
    samples = get_data(fn, sheet=samples_sheet)
    samples = fix_date_formats(samples)
    samples = validate_series(samples)  # validate series, exclude non-numeric
    samples = remove_trailing_spaces(samples, title='sample')

    # NCBI taxonomy handle used by the taxonomy check
    ncbi = ete3.NCBITaxa()

    # reference material taken from the template, plus the contributors sheet
    template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')
    check_columns(samples, template_df)
    valid_dict = get_valid_dict(template_fn, validation_sheet='TAB 5 Data Validation - do not ')
    contrib_df = validate_contributors(fn, contrib_sheet=contrib_sheet)

    # ---- orange columns ----
    # RACK_OR_PLATE_ID, TUBE_OR_WELL_ID (third returned value is not used here)
    samples, gal, _partner_code = validate_plates_wells(
        samples, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')

    samples, is_blank = check_blanks(samples)

    def non_blank():
        # re-slice on every call so each check sees the current table
        return samples[~is_blank]

    if len(non_blank()) == 0:
        logging.error('no non-blank samples to validate, terminating')
        return samples

    validate_values('PRESERVATIVE_SOLUTION', samples, valid_dict)
    validate_values('ORGANISM_PART', samples, valid_dict, sep='|')

    # remaining orange columns: non-blank samples only
    validate_regex('DATE_OF_COLLECTION', non_blank(), na_values=['NOT_COLLECTED', ''])
    for coord_col in ('DECIMAL_LATITUDE', 'DECIMAL_LONGITUDE'):
        validate_regex(coord_col, non_blank())
    # COLLECTION_COUNTRY, DECIMAL_LATITUDE, DECIMAL_LONGITUDE
    validate_country_and_coordinates(non_blank(), fn, na_values=[''])
    # COLLECTION_LOCATION is not checked
    validate_values('SAMPLING_LOCATION_SIZE', non_blank(), valid_dict)

    # ---- purple columns: non-blank samples only ----
    # NOTE(review): the taxonomy and specimen-id-risk checks are handed the
    # full table, blanks included - presumably they deal with blanks
    # themselves; confirm against their definitions.
    samples = validate_taxonomy(samples, ncbi, anospp=True, na_values=[''])
    validate_values('SPECIMEN_IDENTITY_RISK', non_blank(), valid_dict, na_values=[''])
    validate_specimen_id_risk(samples)
    validate_freetext('IDENTIFIED_HOW', non_blank())
    validate_values('LIFESTAGE', non_blank(), valid_dict)
    validate_values('SEX', non_blank(), valid_dict, na_values=[''])
    for timing_col in ('TIME_OF_COLLECTION', 'DURATION_OF_COLLECTION'):
        validate_regex(timing_col, non_blank(), na_values=['NOT_COLLECTED', ''])
    validate_values('COLLECTION_METHOD', non_blank(), valid_dict)
    validate_values('OUTDOORS_INDOORS', non_blank(), valid_dict, na_values=[''])
    for text_col in ('DESCRIPTION_OF_COLLECTION_METHOD', 'PRESERVATION_APPROACH'):
        validate_freetext(text_col, non_blank())

    # ---- white columns: all samples unless stated otherwise ----
    for choice_col in ('BLOOD_MEAL', 'GRAVIDITY'):
        validate_values(choice_col, samples, valid_dict, na_values=[''])
    validate_freetext('HABITAT', samples)
    # unlike DATE_OF_COLLECTION, an empty DATE_OF_PRESERVATION is allowed
    validate_regex('DATE_OF_PRESERVATION', samples, na_values=[''])
    compare_dates_text('DATE_OF_COLLECTION', 'DATE_OF_PRESERVATION', non_blank())
    validate_regex('ELEVATION', samples, na_values=[''])
    validate_regex('WHAT_3_WORDS', non_blank(), na_values=[''])
    for text_col in ('OTHER_ORGANISMS', 'BIOASSAYS', 'COLLECTOR_SAMPLE_ID', 'OTHER_INFORMATION'):
        validate_freetext(text_col, samples)
    # MISC_METADATA may have been removed from the manifest; check it only if present
    if 'MISC_METADATA' in samples.columns:
        validate_freetext('MISC_METADATA', samples)
    validate_freetext('DNA_EXTRACTION_DESCRIPTION', samples)
    for dna_col in ('DNA_EXTRACT_VOLUME_PROVIDED', 'DNA_EXTRACT_CONCENTRATION'):
        validate_regex(dna_col, samples, na_values=[''])

    # ---- STS output ----
    samples = add_sts_cols(samples, contrib_df, gal, bioscan=False, v='NA')
    if write_sts:
        write_sts_manifest(samples, fn, VALIDATION_VERSION)

    logging.info('# ended validate_anospp_partner_manifest')

    # list the distinct plate ids, one per line
    plate_ids = samples.RACK_OR_PLATE_ID.unique()
    print('\n'.join(plate_ids))

    return samples

In [None]:
# Self-test: validate the template against itself, reading the samples
# from its 'TAB 3 TEST Metadata Entry' sheet.
template_fn = '../data/Anopheles_Metadata_Manifest_V4.0_20240813.xlsx'
fn = template_fn
df = validate_anospp(
    fn,
    template_fn=template_fn,
    verbose=True,
    samples_sheet='TAB 3 TEST Metadata Entry',
)

In [None]:
# NOTE(review): presumably a deliberate stop so that running all cells halts
# once setup is done and the validation cells are run by hand - confirm.
raise Exception('Setup complete')

## Validation

In [None]:
# Validate a partner manifest; template_fn is the value assigned in the setup cell.
fn = '../results/20240819_mg/MANIFEST_MOSQUITOES_MADAGASCAR_am60_2.xlsx'
df = validate_anospp(
    fn,
    template_fn,
    samples_sheet='TAB 2 Metadata Entry',
    contrib_sheet='TAB 1 Contributors',
)