## Setup

In [4]:
%%capture
%run validate_partner_manifest_dev.ipynb

In [5]:
def validate_anospp(fn, template_fn='../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx',
                    verbose=False, version='4.0', samples_sheet='TAB 2 Metadata Entry'):
    '''
    Validate an ANOSPP partner manifest against the manifest template.

    Columns are validated in the order in which they appear in the data
    entry sheet. The helper functions called here (setup_logging, get_data,
    validate_* etc.) are not defined in this cell - presumably they are
    loaded from the dev notebook by the %run cell above; verify there for
    their exact behaviour.

    Parameters
    ----------
    fn : path of the manifest workbook to validate (passed to get_data)
    template_fn : path of the template workbook; used both for the column
        check and for extracting the dictionary of valid values
    verbose : forwarded to setup_logging
    version : manifest version string, only used in the start/end log messages
    samples_sheet : name of the sheet in `fn` holding the sample metadata

    Returns
    -------
    The manifest table as returned by the last helper that rebinds `df`
    (validate_series, validate_plates_wells, validate_taxonomy).
    '''

    setup_logging(verbose=verbose)

    logging.info(f'# started validate_anospp_partner_manifest_v.{version}')
    # logged at WARNING level, unlike the start/end markers which use INFO
    logging.warning(f'# manifest {fn}')

    # read data
    df = get_data(fn, sheet=samples_sheet)
    df = remove_trailing_spaces(df)

    # read NCBI taxonomy
    ncbi = ete3.NCBITaxa()

    # prepare for validation
    # the template is always read from its standard data entry sheet,
    # independently of `samples_sheet`
    template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')
    check_columns(df, template_df)
    valid_dict = get_valid_dict(template_fn, validation_sheet='TAB 5 Data Validation - do not ')

    # check columns in order of appearance

    # orange cols
    # exclude empty series
    # SERIES
    df = validate_series(df)
    # RACK_OR_PLATE_ID, TUBE_OR_WELL_ID
    df = validate_plates_wells(df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')
    # check and exclude blanks
    # is_blank is used below as a boolean mask over df rows
    is_blank = check_blanks(df)
    validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)
    validate_values('ORGANISM_PART', df, valid_dict, sep='|')
    # columns below validated for non-blank samples only
    date_coll = validate_date('DATE_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])
    # COLLECTION_COUNTRY, DECIMAL_LATITUDE, DECIMAL_LONGITUDE
    validate_country_and_coordinates(df[~is_blank], fn, na_values=[''])
    # COLLECTION_LOCATION not checked
    validate_values('SAMPLING_LOCATION_SIZE', df[~is_blank], valid_dict)

    # purple cols - validate non-blank samples only
    # NOTE(review): the full table (not df[~is_blank]) is passed here and df is
    # rebound to the result; the df[~is_blank] masks below assume the returned
    # table keeps the row index is_blank was computed on - TODO confirm
    df = validate_taxonomy(df, ncbi, anospp=True, na_values = [''])
    validate_values('SPECIMEN_IDENTITY_RISK', df[~is_blank], valid_dict)
    # IDENTIFIED_HOW not checked
    validate_values('LIFESTAGE', df[~is_blank], valid_dict)
    validate_values('SEX', df[~is_blank], valid_dict)
    validate_time('TIME_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])
    validate_time_period('DURATION_OF_COLLECTION', df[~is_blank], na_values=['NOT_COLLECTED',''])
    validate_values('COLLECTION_METHOD', df[~is_blank], valid_dict)
    validate_values('OUTDOORS_INDOORS', df[~is_blank], valid_dict)
    # DESCRIPTION_OF_COLLECTION_METHOD not checked
    # PRESERVATION_APPROACH not checked

    # white cols - validate all samples
    validate_values('BLOOD_MEAL', df, valid_dict, na_values=[''])
    validate_values('GRAVIDITY', df, valid_dict, na_values=[''])
    # HABITAT not checked
    date_pres = validate_date('DATE_OF_PRESERVATION', df, na_values=['']) # allow for empty values unlike DATE_OF_COLLECTION
    compare_dates(before=date_coll, after=date_pres)
    validate_float('ELEVATION', df, na_values=[''])
    # WHAT_3_WORDS not checked
    # OTHER_ORGANISMS not checked
    # BIOASSAYS not checked
    # COLLECTOR_SAMPLE_ID  not checked
    # OTHER_INFORMATION  not checked
    # MISC_METADATA not checked
    # DNA_EXTRACTION_DESCRIPTION not checked
    validate_float('DNA_EXTRACT_VOLUME_PROVIDED', df, na_values=[''])
    validate_float('DNA_EXTRACT_CONCENTRATION', df, na_values=[''])

    # end marker mirrors the start marker (previously read "ended started")
    logging.info(f'# ended validate_anospp_partner_manifest_v.{version}')

    # TODO yield table ready for STS submission
    return df

# Smoke test: validate the template workbook against itself, reading the
# samples from its test data entry sheet.
template_fn = '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'
fn = template_fn
df = validate_anospp(
    fn,
    template_fn=template_fn,
    verbose=True,
    samples_sheet='TAB 3 TEST Metadata Entry',
)

[INFO] # started validate_anospp_partner_manifest_v.4.0
[INFO] reading data from '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'
[INFO] reading data from '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'
[INFO] checking manifest columns against template
[INFO] extracting value validation data from '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'
[INFO] validating SERIES
[ERROR] In SERIES, ['770'] are missing, ['771'] are unexpected
[INFO] validating RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[ERROR] Found and excluded 482 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[INFO] found 288 samples across 3 plates
[INFO] Checking and excluding blank samples
[ERROR] last well H12 is not blank at SERIES [288]: in ORGANISM_PART, expected "NOT_APPLICABLE", found ['TEST']. These samples will be included in further analysis.
[INFO] Blanks located: 286 samples of 288 left for downstream analysis
[INFO] validating values in column 'PRESERVATIVE_SOLUTION'
[INFO] excluding 

In [6]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts once the
# setup cells have executed, leaving the validation cells below to be run
# by hand - confirm before removing.
raise Exception('Setup complete')

Exception: Setup complete

## Validation

In [8]:
# Partner manifest under review; template_fn comes from the setup section.
fn = '../results/20221010_anospp_4_amy_lemonde/Anopheles_Metadata_Manifest_V4.0_MGIVEC_v2.xlsx'
df = validate_anospp(
    fn,
    template_fn=template_fn,
    samples_sheet='TAB 2 Metadata Entry',
)

[ERROR] in TUBE_OR_WELL_ID for plate AYDI_511, wells {'D9', 'E10', 'B6', 'B8', 'D7', 'C9', 'D12', 'E11', 'A7', 'G10', 'C5', 'F7', 'F10', 'H8', 'B10', 'A8', 'H12', 'A9', 'D4', 'B5', 'G5', 'C8', 'C12', 'H11', 'B7', 'H9', 'F11', 'D11', 'A10', 'G4', 'E6', 'G6', 'E9', 'D6', 'F9', 'E8', 'C6', 'B9', 'H6', 'G9', 'F6', 'D5', 'G8', 'A5', 'F12', 'E7', 'H4', 'F5', 'D10', 'B11', 'C10', 'A11', 'A6', 'H7', 'A12', 'G7', 'G12', 'H5', 'H10', 'C7', 'E12', 'B12', 'F4', 'B4', 'E5', 'C11', 'C4', 'G11', 'F8', 'D8', 'E4'} are missing, wells set() are excessive
[ERROR] invalid values in 'PRESERVATIVE_SOLUTION': {''}
[ERROR] invalid values in 'ORGANISM_PART': {''}
[ERROR] invalid values in 'SAMPLING_LOCATION_SIZE': {''}
[ERROR] invalid values in 'SPECIMEN_IDENTITY_RISK': {''}
[ERROR] invalid values in 'LIFESTAGE': {''}
[ERROR] invalid values in 'SEX': {''}
[ERROR] invalid values in 'COLLECTION_METHOD': {''}
[ERROR] invalid values in 'OUTDOORS_INDOORS': {''}
[ERROR] invalid values in 'BLOOD_MEAL': {'NOT_COLLECTE

In [8]:
# Partner manifest under review; template_fn comes from the setup section.
fn = '../results/20221010_anospp_4_amy_lemonde/BEAM_Anopheles_Metadata_Manifest_V4.0.xlsx'
df = validate_anospp(
    fn,
    template_fn=template_fn,
    samples_sheet='TAB 2 Metadata Entry',
)

[ERROR] duplicate SERIES: ['', '']
[ERROR] Found and excluded non-numeric SERIES: ['', '', '']
[ERROR] In SERIES, ['192', '288', '96'] are missing, ['862', '863', '864'] are unexpected
[ERROR] in TUBE_OR_WELL_ID for plate BEAM_001, wells {'H12'} are missing, wells set() are excessive
[ERROR] in TUBE_OR_WELL_ID for plate BEAM_002, wells {'H12'} are missing, wells set() are excessive
[ERROR] in TUBE_OR_WELL_ID for plate BEAM_003, wells {'H12'} are missing, wells set() are excessive
[ERROR] species: {'Anopheles flavicosta', 'Anopheles domicola', 'Anophele funestus'} not found in NCBI Taxonomy
[ERROR] species: "Anopheles flavicosta" found in Harbach list, but not in NCBI Taxonomy
[ERROR] species: "Anopheles domicola" found in Harbach list, but not in NCBI Taxonomy
[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['#REF!']
