In [6]:
%%capture
%run validate_partner_manifest_dev.ipynb

In [2]:
def validate_anospp(fn, template_fn='../data/Anopheles_Metadata_Manifest_V4.0.xlsx',
                    verbose=False, version='4.0', samples_sheet='TAB 2 Metadata Entry'):
    '''
    Validate an ANOSPP partner manifest against the manifest template.

    Checks are run in the order in which the columns appear in the data
    entry sheet. The individual checks are delegated to helper functions
    that are expected to already be in the notebook namespace.

    Parameters
    ----------
    fn : path of the manifest workbook to validate
    template_fn : path of the template workbook; it supplies the expected
        columns and the allowed values for controlled-vocabulary columns
    verbose : forwarded to setup_logging
    version : manifest version, only used in the start/end log messages
    samples_sheet : name of the sheet in `fn` that holds the sample metadata

    Returns
    -------
    The manifest dataframe as left by validate_series,
    validate_plates_wells and check_blanks (the steps that reassign `df`).
    '''

    setup_logging(verbose=verbose)

    logging.info(f'# started validate_anospp_partner_manifest_v.{version}')
    # NOTE(review): logged at WARNING rather than INFO - presumably so the
    # manifest name is still shown when verbose is False; confirm against setup_logging
    logging.warning(f'# manifest {fn}')

    # read data
    df = get_data(fn, sheet=samples_sheet)
    df = remove_trailing_spaces(df)

    # read NCBI taxonomy
    ncbi = ete3.NCBITaxa()

    # prepare for validation
    # the template is always read from its data entry sheet, independently of samples_sheet
    template_df = get_data(template_fn, sheet='TAB 2 Metadata Entry')
    check_columns(df, template_df)
    # sheet name is 31 characters long and looks cut off (presumably by the
    # spreadsheet's sheet-name limit) - keep it verbatim
    valid_dict = get_valid_dict(template_fn, validation_sheet='TAB 4Data Validation - do not e')

    # orange cols
    # exclude empty series
    df = validate_series(df)
    df = validate_plates_wells(df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')

    # check and exclude blanks
    df = check_blanks(df)

    validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)
    # CATCH_LOT not checked TODO do not allow missing
    validate_values('BOTTLE_DIRECTION', df, valid_dict)
    validate_values('ORGANISM_PART', df, valid_dict, sep='|')
    validate_values('HAZARD_GROUP', df, valid_dict)
    validate_values('REGULATORY_COMPLIANCE', df, valid_dict)
    # keep the validate_date result: it is compared with the preservation date below
    date_coll = validate_date('DATE_OF_COLLECTION', df, na_values=['NOT_COLLECTED', ''])
    check_location(df, fn)

    # purple cols
    # taxonomy validation - currently taxIDs are not used
    validate_ncbi_taxonomy_anospp(df, ncbi, na_values=[''])
    validate_values('SEX', df, valid_dict)
    # HABITAT not checked
    validate_time('TIME_OF_COLLECTION', df, na_values=['NOT_COLLECTED', ''])
    validate_time_period('DURATION_OF_COLLECTION', df, na_values=['NOT_COLLECTED', ''])
    validate_values('COLLECTION_METHOD', df, valid_dict)
    # DESCRIPTION_OF_COLLECTION_METHOD not checked
    validate_int('TIME_ELAPSED_FROM_COLLECTION_TO_PLATING', df, na_values=[''])
    # PHOTOGRAPH_* columns not checked
    # VOUCHER_ID not checked
    # PRESERVATION_APPROACH not checked - should match DATE_OF_PRESERVATION
    # allow for empty values unlike DATE_OF_COLLECTION
    date_pres = validate_date('DATE_OF_PRESERVATION', df, na_values=[''])
    compare_dates(before=date_coll, after=date_pres)
    # COLLECTOR_SAMPLE_ID not checked
    validate_int('ELEVATION', df, na_values=[''])
    # OTHER_INFORMATION, MISC_METADATA, IDENTIFIED_BY, IDENTIFIER_AFFILIATION, IDENTIFIED_HOW not checked

    # same validator name as in the start message, so start/end lines can be paired in logs
    logging.info(f'# ended validate_anospp_partner_manifest_v.{version}')

    return df

# Demo run: the template workbook is validated against itself, reading the
# samples from its example sheet instead of the (default) data entry sheet.
template_fn = '../data/Anopheles_Metadata_Manifest_V4.0.xlsx'
fn = '../data/Anopheles_Metadata_Manifest_V4.0.xlsx'
df = validate_anospp(
    fn,
    template_fn=template_fn,
    verbose=True,
    samples_sheet='TAB 5 EXAMPLE Metadata Entry',
)

[INFO] # started validate_anospp_partner_manifest_v.4.0
[INFO] reading data from '../data/Anopheles_Metadata_Manifest_V4.0.xlsx'
[INFO] reading data from '../data/Anopheles_Metadata_Manifest_V4.0.xlsx'
[INFO] checking manifest columns against template
[INFO] extracting value validation data from '../data/Anopheles_Metadata_Manifest_V4.0.xlsx'
[INFO] validating SERIES
[ERROR] Found and excluded non-numeric SERIES: ['example']
[INFO] validating RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[ERROR] Found and excluded 4512 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[INFO] found 288 samples across 3 plates
[INFO] Checking and excluding blank samples
[INFO] Blanks removed: 285 samples of 288 left for downstream analysis
[INFO] validating values in column 'PRESERVATIVE_SOLUTION'
[INFO] validating values in column 'BOTTLE_DIRECTION'
[ERROR] 'BOTTLE_DIRECTION' column not found in manifest
[INFO] validating values in column 'ORGANISM_PART'
[INFO] validating values in column 'HAZARD_GROUP'
[

In [5]:
# Validate the BEAM partner manifest against the template; `template_fn` is
# the global assigned in the demo-run cell. The call is the last expression,
# so the returned dataframe is displayed as the cell output.
validate_anospp(
    fn='../results/20220825_anospp_4/BEAM_Anopheles_Metadata_Manifest_V4.0.xlsx',
    template_fn=template_fn,
    samples_sheet='TAB 2 Metadata Entry',
)

[ERROR] duplicate SERIES: ['', '']
[ERROR] Found and excluded non-numeric SERIES: ['', '', '']
[ERROR] In SERIES, ['192', '288', '96'] are missing, ['862', '863', '864'] are unexpected
[ERROR] in TUBE_OR_WELL_ID for plate BEAM_001, wells {'H12'} are missing, wells set() are excessive
[ERROR] in TUBE_OR_WELL_ID for plate BEAM_002, wells {'H12'} are missing, wells set() are excessive
[ERROR] in TUBE_OR_WELL_ID for plate BEAM_003, wells {'H12'} are missing, wells set() are excessive
[ERROR] 'BOTTLE_DIRECTION' column not found in manifest
[ERROR] 'HAZARD_GROUP' column not found in manifest
[ERROR] 'REGULATORY_COMPLIANCE' column not found in manifest
[ERROR] SPECIES: {'Anophele funestus', 'Anopheles domicola', 'Anopheles flavicosta'} not found in NCBI Taxonomy
[ERROR] TIME_ELAPSED_FROM_COLLECTION_TO_PLATING column not found in manifest
[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['#REF!']


Unnamed: 0_level_0,RACK_OR_PLATE_ID,TUBE_OR_WELL_ID,PRESERVATIVE_SOLUTION,ORGANISM_PART,DATE_OF_COLLECTION,COLLECTION_COUNTRY,COLLECTION_LOCATION,DECIMAL_LATITUDE,DECIMAL_LONGITUDE,SAMPLING_LOCATION_SIZE,...,ELEVATION,WHAT_3_WORDS,OTHER_ORGANISMS,BIOASSAYS,COLLECTOR_SAMPLE_ID,OTHER_INFORMATION,MISC_METADATA,DNA_EXTRACTION_DESCRIPTION,DNA_EXTRACT_VOLUME_PROVIDED,DNA_EXTRACT_CONCENTRATION
SERIES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,BEAM_001,A1,100%_ETHANOL,HEAD | THORAX,2022-07-15 00:00:00,SENEGAL,SENEGAL | KEDOUGOU | KENIOTO VILLAGE,12.570607,-12.162318,1km²,...,,,,,,One of the mosquitoes in this batch of 14 was ...,,,,
2,BEAM_001,B1,100%_ETHANOL,HEAD | THORAX,2022-07-15 00:00:00,SENEGAL,SENEGAL | KEDOUGOU | KENIOTO VILLAGE,12.570607,-12.162318,1km²,...,,,,,,One of the mosquitoes in this batch of 14 was ...,,,,
3,BEAM_001,C1,100%_ETHANOL,HEAD | THORAX,2022-07-15 00:00:00,SENEGAL,SENEGAL | KEDOUGOU | KENIOTO VILLAGE,12.570607,-12.162318,1km²,...,,,,,,One of the mosquitoes in this batch of 14 was ...,,,,
4,BEAM_001,D1,100%_ETHANOL,HEAD | THORAX,2022-07-15 00:00:00,SENEGAL,SENEGAL | KEDOUGOU | KENIOTO VILLAGE,12.570607,-12.162318,1km²,...,,,,,,One of the mosquitoes in this batch of 14 was ...,,,,
5,BEAM_001,E1,100%_ETHANOL,HEAD | THORAX,2022-07-15 00:00:00,SENEGAL,SENEGAL | KEDOUGOU | KENIOTO VILLAGE,12.570607,-12.162318,1km²,...,,,,,,One of the mosquitoes in this batch of 14 was ...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,BEAM_009,C12,100%_ETHANOL,HEAD | THORAX,2022-08-13 00:00:00,SENEGAL,SENEGAL | KEDOUGOU | BOUNDOUCOUNDI VILLAGE,12.518798,-12.335027,1km²,...,,,,,,,,,,
860,BEAM_009,D12,100%_ETHANOL,HEAD | THORAX,2022-08-13 00:00:00,SENEGAL,SENEGAL | KEDOUGOU | BOUNDOUCOUNDI VILLAGE,12.518798,-12.335027,1km²,...,,,,,,,,,,
861,BEAM_009,E12,100%_ETHANOL,HEAD | THORAX,2022-08-13 00:00:00,SENEGAL,SENEGAL | KEDOUGOU | BOUNDOUCOUNDI VILLAGE,12.518798,-12.335027,1km²,...,,,,,,,,,,
862,BEAM_009,F12,100%_ETHANOL,HEAD | THORAX,2022-08-13 00:00:00,SENEGAL,SENEGAL | KEDOUGOU | BOUNDOUCOUNDI VILLAGE,12.518798,-12.335027,1km²,...,,,,,,,,,,
