## Setup

In [1]:
%%capture
%run validate_partner_manifest_dev.ipynb

In [2]:
def validate_anospp(fn, template_fn='../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx',
                    verbose=False, version='4.0', samples_sheet='TAB 2 Metadata Entry'):
    '''
    ANOSPP partner manifest validation.

    Columns are validated in the order in which they appear in the data entry sheet.
    Nothing is collected or returned as a list of problems here: the helper
    functions report through `logging` (see the [INFO]/[ERROR] lines in the cell output).

    Parameters
    ----------
    fn : str
        Path to the partner manifest workbook to validate.
    template_fn : str
        Path to the template workbook. It provides the expected columns
        (sheet 'TAB 2 Metadata Entry') and the allowed values
        (sheet 'TAB 5 Data Validation - do not ').
    verbose : bool
        Passed to `setup_logging`.
    version : str
        Manifest version; only used in the start/end log messages.
    samples_sheet : str
        Name of the sheet in `fn` that holds the sample metadata.

    Returns
    -------
    The samples table as returned, in turn, by `validate_series`,
    `validate_plates_wells` and `validate_taxonomy`.
    '''

    setup_logging(verbose=verbose)

    logging.info(f'# started validate_anospp_partner_manifest_v.{version}')
    # NOTE(review): logged at WARNING rather than INFO - presumably so that the manifest
    # name is still shown when verbose is off; confirm against setup_logging
    logging.warning(f'# manifest {fn}')

    # read data
    df = get_data(fn, sheet=samples_sheet)
    df = remove_trailing_spaces(df)

    # read NCBI taxonomy
    ncbi = ete3.NCBITaxa()

    # prepare for validation
    template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')
    check_columns(df, template_df)
    valid_dict = get_valid_dict(template_fn, validation_sheet='TAB 5 Data Validation - do not ')

    # check columns in order of appearance

    # orange cols
    # exclude empty series
    # SERIES
    df = validate_series(df)
    # RACK_OR_PLATE_ID, TUBE_OR_WELL_ID
    df = validate_plates_wells(df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')
    # check and exclude blanks
    # is_blank is used below as a row mask: df[~is_blank] selects non-blank samples
    is_blank = check_blanks(df)
    validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)
    validate_values('ORGANISM_PART', df, valid_dict, sep='|')
    # columns below validated for non-blank samples only
    date_coll = validate_date('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])
    # COLLECTION_COUNTRY, DECIMAL_LATITUDE, DECIMAL_LONGITUDE
    validate_country_and_coordinates(df[~is_blank], fn, na_values=[''])
    # COLLECTION_LOCATION not checked
    validate_values('SAMPLING_LOCATION_SIZE', df[~is_blank], valid_dict)

    # purple cols - validate non-blank samples only
    # NOTE(review): validate_taxonomy and validate_specimen_id_risk receive all rows,
    # not df[~is_blank] - confirm that they deal with blank samples internally.
    # NOTE(review): is_blank was computed before df is reassigned here, so the
    # df[~is_blank] selections below rely on validate_taxonomy keeping the rows
    # and index unchanged - confirm.
    df = validate_taxonomy(df, ncbi, anospp=True, na_values = [''])
    validate_values('SPECIMEN_IDENTITY_RISK', df[~is_blank], valid_dict)
    validate_specimen_id_risk(df)
    validate_freetext('IDENTIFIED_HOW', df[~is_blank])
    validate_values('LIFESTAGE', df[~is_blank], valid_dict)
    validate_values('SEX', df[~is_blank], valid_dict)
    validate_time('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])
    validate_time_period('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])
    validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict)
    validate_values('OUTDOORS_INDOORS', df[~is_blank], valid_dict)
    validate_freetext('DESCRIPTION_OF_COLLECTION_METHOD', df[~is_blank])
    validate_freetext('PRESERVATION_APPROACH', df[~is_blank])

    # white cols - validate all samples
    validate_values('BLOOD_MEAL', df, valid_dict, na_values=[''])
    validate_values('GRAVIDITY', df, valid_dict, na_values=[''])
    validate_freetext('HABITAT', df)
    date_pres = validate_date('DATE_OF_PRESERVATION', df, na_values=['']) # allow for empty values unlike DATE_OF_COLLECTION
    # date_coll comes from non-blank samples only, date_pres from all samples
    compare_dates(before=date_coll, after=date_pres)
    validate_float('ELEVATION', df, na_values=[''])
    # WHAT_3_WORDS not checked
    validate_freetext('OTHER_ORGANISMS', df)
    validate_freetext('BIOASSAYS', df)
    validate_freetext('COLLECTOR_SAMPLE_ID', df)
    validate_freetext('OTHER_INFORMATION', df)
    validate_freetext('MISC_METADATA', df)
    validate_freetext('DNA_EXTRACTION_DESCRIPTION', df)
    validate_float('DNA_EXTRACT_VOLUME_PROVIDED', df, na_values=[''])
    validate_float('DNA_EXTRACT_CONCENTRATION', df, na_values=[''])

    # contributors are validated from the manifest file itself, not from the samples table
    validate_contributors(fn)

    # end marker mirrors the start marker above
    logging.info(f'# ended validate_anospp_partner_manifest_v.{version}')

    # TODO yield table ready for STS submission
    return df

# self-test: the template workbook is validated against itself, reading the
# samples from its test sheet instead of the regular data entry sheet
template_fn = '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'
fn = template_fn
df = validate_anospp(
    fn,
    template_fn=template_fn,
    verbose=True,
    samples_sheet='TAB 3 TEST Metadata Entry',
)

[INFO] # started validate_anospp_partner_manifest_v.4.0
[INFO] reading data from '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'
[INFO] reading data from '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'
[INFO] checking manifest columns against template
[INFO] extracting value validation data from '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'
[INFO] validating SERIES
[ERROR] In SERIES, ['770'] are missing, ['771'] are unexpected
[INFO] validating RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[ERROR] Found and excluded 482 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[INFO] found 288 samples across 3 plates
[INFO] Checking and excluding blank samples
[ERROR] last well H12 is not blank at SERIES [288]: in ORGANISM_PART, expected "NOT_APPLICABLE", found ['TEST']. These samples will be included in further analysis.
[INFO] validating values in column 'PRESERVATIVE_SOLUTION'
[INFO] validating values in column 'ORGANISM_PART'
[ERROR] invalid values in 'ORGANISM_P

[ERROR] duplicated names ['Darwin Charles R.']
[ERROR] invalid email addresses ['darwin_darwin.darwin']
[ERROR] confirmation lacking for any contributors
[INFO] # ended started validate_anospp_partner_manifest_v.4.0


In [3]:
# NOTE(review): unconditional raise - presumably a deliberate stop so that "Run All"
# halts once Setup is done and the Validation cells below are run by hand; confirm
raise Exception('Setup complete')

Exception: Setup complete

## Validation

In [None]:
# partner manifest (MGIVEC, v3 per file name), checked against the template set in Setup
fn = '../results/20221010_anospp_4_amy_lemonde/Anopheles_Metadata_Manifest_V4.0_MGIVEC_v3.xlsx'
df = validate_anospp(
    fn,
    template_fn=template_fn,
    samples_sheet='TAB 2 Metadata Entry',
)

In [None]:
# partner manifest (BEAM, v2 per file name), checked against the template set in Setup
fn = '../results/20221010_anospp_4_amy_lemonde/BEAM_Anopheles_Metadata_Manifest_V4.0_v2.xlsx'
df = validate_anospp(
    fn,
    template_fn=template_fn,
    samples_sheet='TAB 2 Metadata Entry',
)