In [None]:
%run validate_partner_manifest_dev.ipynb

In [None]:
def validate_bioscan(
    fn,
    gal='Sanger Institute',
    template_fn='../data/BIOSCAN_Manifest_V2.0_20230217.xlsx',
    samples_sheet='TAB 2 Metadata Entry',
    contrib_sheet='TAB 1 Contributors',
    verbose=False,
    write_sts=True,
):
    '''
    BIOSCAN partner manifest validation.

    Validation follows the order of the columns in the data entry sheet:
    orange columns first, then purple, then white.

    fn            - manifest workbook to validate
    gal           - handed over to add_sts_cols
    template_fn   - workbook used for the column check and as the source of
                    the valid-value dictionary
    samples_sheet - sheet of `fn` read as the samples table
    contrib_sheet - sheet of `fn` handed to validate_contributors
    verbose       - handed over to setup_logging
    write_sts     - when true, write_sts_manifest is called at the end

    Returns the samples dataframe. If no non-blank samples are found, the
    dataframe is returned right after the blank check; otherwise the value
    returned is the output of add_sts_cols.
    '''

    setup_logging(verbose=verbose)

    logging.warning(f'# started validate_bioscan_partner_manifest_v.{BIOSCAN_VERSION}')
    logging.warning(f'# manifest {fn}')

    # samples table, trailing spaces removed
    df = remove_trailing_spaces(get_data(fn, sheet=samples_sheet))

    # NCBI taxonomy handle used by the taxonomy check further down
    ncbi_taxa = ete3.NCBITaxa()

    # reference data: template columns, valid values, contributors
    check_columns(df, get_data(template_fn, sheet='TAB 2 Metadata Entry'))
    allowed_values = get_valid_dict(template_fn, validation_sheet='TAB 4 DO NOT EDIT - Data Valida')
    contributors = validate_contributors(fn, contrib_sheet=contrib_sheet)

    # --- orange columns ---
    validate_freetext('CATCH_LOT', df, na_values=[])
    df = validate_plates_wells(df, contributors, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID', bioscan=True)
    # series check
    df = validate_series(df)
    # find blanks; with nothing but blanks there is nothing left to validate
    blank_mask = check_blanks(df)
    not_blank = ~blank_mask
    if len(df[not_blank]) == 0:
        logging.error('no non-blank samples to validate, terminating')
        return df
    # these two are checked on every row, blanks included
    validate_values('ORGANISM_PART', df, allowed_values, sep='|')
    validate_values('PRESERVATIVE_SOLUTION', df, allowed_values)
    # from here on the orange columns are checked on non-blank samples only
    validate_values('BOTTLE_DIRECTION', df[not_blank], allowed_values)
    collection_dates = validate_date('DATE_OF_COLLECTION', df[not_blank], na_values=['NOT_COLLECTED', ''])
    # covers COUNTRY_OF_COLLECTION, DECIMAL_LATITUDE, DECIMAL_LONGITUDE
    validate_country_and_coordinates(df[not_blank], fn, na_values=[''], bioscan=True)
    # COLLECTION_LOCATION is not checked

    # --- purple columns: validated for non-blank samples ---
    # WHAT_3_WORDS is not checked
    validate_time('TIME_OF_COLLECTION', df[not_blank])
    validate_time_period('DURATION_OF_COLLECTION', df[not_blank], na_values=['NOT_COLLECTED'])
    validate_values('COLLECTION_METHOD', df[not_blank], allowed_values)
    plating_dates = validate_date('DATE_OF_PLATING', df[not_blank], na_values=['NOT_COLLECTED', ''])
    compare_dates(before=collection_dates, after=plating_dates)
    # taxonomy validation adds taxid columns to original dataframe - skipping for now
    # (note that the full dataframe, blanks included, is passed here)
    validate_taxonomy(df, ncbi_taxa, anospp=False, na_values=[''])
    validate_values('SPECIMEN_IDENTITY_RISK', df[not_blank], allowed_values, na_values=[''])
    validate_specimen_id_risk(df)
    # controlled vocabularies where an empty cell is accepted
    for column in ('LIFESTAGE', 'SEX'):
        validate_values(column, df[not_blank], allowed_values, na_values=[''])
    # controlled vocabularies checked with the default na_values
    for column in (
        'SORTING_SOLUTION_USED',
        'CATCH_BOTTLE_TEMPERATURE_STORAGE',
        'PLATE_TEMPERATURE_STORAGE',
    ):
        validate_values(column, df[not_blank], allowed_values)

    # --- white columns: validated for all samples ---
    # TODO check if STS will need something here (COLLECTOR_SAMPLE_ID)
    for column in (
        'MORPHOSPECIES_DESCRIPTION',
        'DESCRIPTION_OF_COLLECTION_METHOD',
        'HABITAT',
        'PRESERVATION_APPROACH',
        'COLLECTOR_SAMPLE_ID',
        'VOUCHER_ID',
    ):
        validate_freetext(column, df)
    validate_float('ELEVATION', df, na_values=[''])
    for column in ('OTHER_INFORMATION', 'MISC_METADATA'):
        validate_freetext(column, df)
    # IDENTIFIED_BY is not checked
    # IDENTIFIER_AFFILIATION is not checked

    # add the STS columns and optionally write the STS manifest
    df = add_sts_cols(df, contributors, gal=gal)
    if write_sts:
        # NOTE(review): written with VALIDATION_VERSION while the log lines
        # use BIOSCAN_VERSION - confirm this is intended
        write_sts_manifest(df, fn, VALIDATION_VERSION)

    logging.warning(f'# ended validate_bioscan_partner_manifest_v.{BIOSCAN_VERSION}')

    return df

# Run the validation on the template workbook itself: the manifest and the
# template are the same file, and samples are read from the sheet named
# 'TAB 5 DO NOT EDIT - TEST Met' rather than from the data entry sheet.
# Example of a partner manifest path:
# fn = '../../results/partner_manifests/IRD-Neandersquito_T222Amplicon_Manifest_V2.0.xlsx'
df = validate_bioscan(
    fn='../data/BIOSCAN_Manifest_V2.0_20230217.xlsx',
    template_fn='../data/BIOSCAN_Manifest_V2.0_20230217.xlsx',
    samples_sheet='TAB 5 DO NOT EDIT - TEST Met',
    gal='Sanger Institute',
    verbose=True,
    write_sts=True,
)

In [None]:
# Unconditionally raises, so execution of this cell always fails with the
# message below.
# NOTE(review): presumably a deliberate stop so that "Run All" halts after the
# setup cells and the manifest-specific cells are run by hand - confirm.
raise Exception('Setup complete')

In [None]:
# Validate one partner manifest. The sheet names and verbosity are spelled out
# explicitly for this run.
fn = '../results/20230217_b_plate/FACE_BIOSCAN_Manifest_20230215.xlsx'
df = validate_bioscan(
    fn,
    samples_sheet='TAB 2 Metadata Entry',
    contrib_sheet='TAB 1 Contributors',
    verbose=False,
)