## Setup

In [1]:
%%capture
%run validate_partner_manifest_dev.ipynb

In [None]:
def validate_anospp(fn, template_fn='../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx', 
                    verbose=False, version='4.0', samples_sheet='TAB 2 Metadata Entry',
                    contrib_sheet='TAB 1 Contributors'):
    '''
    ANOSPP partner manifest validation.

    Runs the individual validators over the manifest columns, following the
    order in which the columns appear in the data entry sheet. Problems are
    reported through the logging module by the helpers that are called here.

    Parameters
    ----------
    fn : path of the filled manifest; passed to get_data and
        validate_contributors.
    template_fn : path of the template manifest; its 'TAB 2 Metadata Entry'
        sheet is used for the column check and it is passed to get_valid_dict
        to obtain the allowed values.
    verbose : passed to setup_logging.
    version : manifest version, used only in the start/end log messages.
    samples_sheet : name of the sheet in fn that holds the sample metadata.
    contrib_sheet : name of the sheet in fn that holds the contributors.

    Returns
    -------
    The samples table as returned by validate_plates_wells and
    validate_series. It is returned early, right after the blank check, when
    the manifest contains no non-blank samples.
    '''

    setup_logging(verbose=verbose)

    logging.info(f'# started validate_anospp_partner_manifest_v.{version}')
    # NOTE(review): logged at WARNING rather than INFO - presumably so that the
    # manifest name is still shown when verbose is False; confirm against setup_logging
    logging.warning(f'# manifest {fn}')

    # read data
    df = get_data(fn, sheet=samples_sheet)
    df = remove_trailing_spaces(df)
    
    # read NCBI taxonomy
    ncbi = ete3.NCBITaxa()
    
    # prepare for validation
    # the template sheet names are fixed, only the filled manifest sheets are configurable
    template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')
    check_columns(df, template_df)
    valid_dict = get_valid_dict(template_fn, validation_sheet='TAB 5 Data Validation - do not ')
    contrib_df = validate_contributors(fn, contrib_sheet=contrib_sheet)
    
    # check columns in order of appearance
    
    # orange cols
    # RACK_OR_PLATE_ID, TUBE_OR_WELL_ID
    df = validate_plates_wells(df, contrib_df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')
    # check series
    df = validate_series(df)
    # check blanks
    is_blank = check_blanks(df)
    if df[~is_blank].shape[0] == 0:
        # nothing left for the per-sample checks below
        logging.error('no non-blank samples to validate, terminating')
        return df
    # these two columns are validated for all samples, blanks included
    validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)
    validate_values('ORGANISM_PART', df, valid_dict, sep='|')
    # columns below validated for non-blank samples only
    date_coll = validate_date('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])
    # COLLECTION_COUNTRY, DECIMAL_LATITUDE, DECIMAL_LONGITUDE
    validate_country_and_coordinates(df[~is_blank], fn, na_values=[''])
    # COLLECTION_LOCATION not checked
    validate_values('SAMPLING_LOCATION_SIZE', df[~is_blank], valid_dict)
    
    # purple cols - validate non-blank samples only
    # NOTE(review): validate_taxonomy and validate_specimen_id_risk receive the full df,
    # unlike the other purple columns - confirm that they handle blank samples themselves
    validate_taxonomy(df, ncbi, anospp=True, na_values = [''])
    validate_values('SPECIMEN_IDENTITY_RISK', df[~is_blank], valid_dict)
    validate_specimen_id_risk(df)
    validate_freetext('IDENTIFIED_HOW', df[~is_blank])
    validate_values('LIFESTAGE', df[~is_blank], valid_dict)
    validate_values('SEX', df[~is_blank], valid_dict, na_values = [''])
    validate_time('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])
    validate_time_period('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])
    validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict)
    validate_values('OUTDOORS_INDOORS', df[~is_blank], valid_dict)
    validate_freetext('DESCRIPTION_OF_COLLECTION_METHOD', df[~is_blank])
    validate_freetext('PRESERVATION_APPROACH', df[~is_blank])
    
    # white cols - validate all samples
    validate_values('BLOOD_MEAL', df, valid_dict, na_values=[''])
    validate_values('GRAVIDITY', df, valid_dict, na_values=[''])
    validate_freetext('HABITAT', df)
    date_pres = validate_date('DATE_OF_PRESERVATION', df, na_values=['']) # allow for empty values unlike DATE_OF_COLLECTION
    # NOTE(review): date_coll comes from non-blank samples only while date_pres comes from
    # all samples - assumes compare_dates copes with the differing rows; confirm
    compare_dates(before=date_coll, after=date_pres)
    validate_float('ELEVATION', df, na_values=[''])
    # WHAT_3_WORDS not checked
    validate_freetext('OTHER_ORGANISMS', df)
    validate_freetext('BIOASSAYS', df)
    validate_freetext('COLLECTOR_SAMPLE_ID', df)
    validate_freetext('OTHER_INFORMATION', df)
    validate_freetext('MISC_METADATA', df)
    validate_freetext('DNA_EXTRACTION_DESCRIPTION', df)
    validate_float('DNA_EXTRACT_VOLUME_PROVIDED', df, na_values=[''])
    validate_float('DNA_EXTRACT_CONCENTRATION', df, na_values=[''])
            
    # end marker mirrors the start marker logged at the top of the function
    logging.info(f'# ended validate_anospp_partner_manifest_v.{version}')

    # TODO yield table ready for STS submission
    return df

In [4]:
# Smoke test: the template workbook is validated against itself,
# reading the samples from its 'TAB 3 TEST Metadata Entry' sheet.
template_fn = '../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx'
fn = template_fn
df = validate_anospp(
    fn,
    template_fn,
    verbose=True,
    samples_sheet='TAB 3 TEST Metadata Entry',
)

[INFO] # started validate_anospp_partner_manifest_v.4.0
[INFO] reading data from '../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx' sheet 'TAB 3 TEST Metadata Entry'
[INFO] reading data from '../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx' sheet 'TAB 2 Metadata Entry'
[INFO] checking manifest columns against template
[INFO] extracting value validation data from '../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx'
[INFO] validating contributors in ../data/Anopheles_Metadata_Manifest_V4.0_20221220.xlsx
[ERROR] duplicated names ['Charles R. Darwin']
[ERROR] invalid email addresses ['darwin_darwin.darwin']
[ERROR] confirmation lacking for any contributors
[INFO] validating RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[ERROR] plate names ['sdhfibs'] do not match template "[A-Z]{4}_[0-9]{3}"
[ERROR] plate name prefixes not recognised for ['DUGA_004', 'sdhfibs']
[ERROR] duplicate TUBE_OR_WELL_ID for plate DACH_001: ['A1']
[ERROR] in TUBE_OR_WELL_ID for plate DACH_002, wells {'C1', '

[INFO] # ended started validate_anospp_partner_manifest_v.4.0


In [3]:
# Raises unconditionally, so execution stops at this cell.
# NOTE(review): presumably a deliberate barrier so that "Run All" halts after the
# setup section and the validation cells below are run by hand - confirm.
raise Exception('Setup complete')

Exception: Setup complete

## Validation

In [6]:
# Validate a partner manifest with default (non-verbose) logging.
# template_fn is not assigned in this cell: it must already be defined in the kernel.
fn = '../results/20230628_olaitan/Anopheles_Metadata_Manifest_V4.0_EKUW - 21.06.2023.xlsx'
df = validate_anospp(fn, template_fn, samples_sheet='TAB 2 Metadata Entry')

[ERROR] template columns missing from filled manifest: {'MISC_METADATA'}
[ERROR] species: {'An stephensi'} not found in NCBI Taxonomy
[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['PT105M' 'PT110M']
[ERROR] MISC_METADATA column not found in manifest


In [7]:
# Validate a second partner manifest with default (non-verbose) logging.
# template_fn is not assigned in this cell: it must already be defined in the kernel.
# fn and df are reassigned here, replacing the values left by any earlier validation cell.
fn = '../results/20230628_olaitan/Anopheles_Metadata_Manifest_V4.0_EKUW - 27.06.2023 - historical samples.xlsx'
df = validate_anospp(fn, template_fn, samples_sheet='TAB 2 Metadata Entry')

[ERROR] template columns missing from filled manifest: {'MISC_METADATA'}
[ERROR] no non-blank samples to validate, terminating
