In [1]:
from datetime import date
import itertools
import logging
import pandas as pd
import numpy as np
import datetime
import glob
import os
import pickle
import ete3
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

ModuleNotFoundError: No module named 'ete3'

In [None]:
logging.getLogger().setLevel(logging.INFO)
# logging.getLogger().setFormat('[%(levelname)s] %(message)s')

def setup_logging(verbose=False):
    """(Re)configure the root logger for use in the notebook.

    Handlers already attached to the root logger are removed first, because
    ``logging.basicConfig`` does nothing while the root logger has handlers
    (which is the case whenever this cell is re-run).

    Parameters
    ----------
    verbose : bool
        If true, log at INFO level; otherwise log at WARNING level.
    """
    root_logger = logging.getLogger()
    # iterate over a copy: removeHandler mutates the underlying list
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)

    level = logging.INFO if verbose else logging.WARNING
    logging.basicConfig(level=level, format='[%(levelname)s] %(message)s')
setup_logging(verbose=True)   
logging.info('test')

In [None]:
fn = 'data/NE BIOSCAN_Manifest_V1.0_Yarner_2021.xlsx'
template_fn = '../data/BIOSCAN_Manifest_V1.0_20211207.xlsx'

In [None]:
# download and install taxonomy
ncbi = ete3.NCBITaxa()
# only run update if needed
# ncbi.update_taxonomy_database()

In [None]:
def get_data(fn, sheet='TAB1 Specimen Metadata Entry'):
    """Read one sheet of a manifest workbook into a DataFrame.

    All cells are read as strings, empty cells stay as empty strings
    (``keep_default_na=False``), and the first column becomes the index.
    Duplicated index labels are reported through the error log but the
    rows are kept.

    Parameters
    ----------
    fn : path to the Excel workbook.
    sheet : name of the sheet to read.

    Returns
    -------
    pandas.DataFrame with the sheet contents.
    """
    logging.info('reading data from {!r}'.format(fn))

    manifest = pd.read_excel(
        fn,
        sheet_name=sheet,
        index_col=0,
        dtype=str,
        keep_default_na=False,
    )

    repeated = manifest.index.duplicated()
    if repeated.any():
        logging.error('duplicate SERIES: {}'.format(manifest.index[repeated].to_list()))

    return manifest
df = get_data(fn)

In [None]:
def remove_trailing_spaces(df):
    """Strip leading/trailing spaces from every column, warning where found.

    Despite the name, both leading and trailing spaces are detected. Columns
    are modified in place on ``df`` and the same object is returned. The
    ``.str`` accessor is used, so columns are expected to hold strings.
    """
    for name in df.columns:
        values = df[name]
        padded = values.str.startswith(' ') | values.str.endswith(' ')
        if not padded.any():
            continue
        affected = df.loc[padded].index.to_list()
        logging.warning('trailing spaces found in column {!r}, SERIES {}. Removing for validation'.format(name,
            affected))
        df[name] = df[name].str.strip()

    return df

In [None]:
template_df = get_data(template_fn)

In [None]:
def check_columns(df, template_df):
    """Compare the manifest's column names with the template's.

    Columns present only in the manifest are reported as a warning; template
    columns absent from the manifest are reported as an error. Nothing is
    returned and neither frame is modified.
    """
    logging.info('checking manifest columns against template')

    manifest_names = set(df.columns)
    template_names = set(template_df.columns)

    extra = manifest_names - template_names
    missing = template_names - manifest_names

    if extra:
        logging.warning('extra columns in filled manifest compared to template: {}'.format(extra))
    if missing:
        logging.error('template columns missing from filled manifest: {}'.format(missing))
check_columns(df, template_df)

In [None]:
def get_valid_dict(fn, validation_sheet='Data Validation - do not edit'):
    """Collect the allowed values listed on the workbook's validation sheet.

    Parameters
    ----------
    fn : path to the Excel workbook holding the validation sheet.
    validation_sheet : name of that sheet.

    Returns
    -------
    dict mapping each column header of the sheet to the list of its
    non-missing cell values (read as strings).
    """
    logging.info('extracting value validation data from {!r}'.format(fn))
    validation_table = pd.read_excel(fn, dtype=str, sheet_name=validation_sheet)

    # one entry per sheet column, missing cells dropped
    return {
        header: validation_table[header].dropna().to_list()
        for header in validation_table.columns
    }
valid_dict = get_valid_dict(template_fn)

In [None]:
def exclude_missing(series, na_values=[]):
    """Return ``series`` without the entries listed in ``na_values``.

    The number of dropped entries is reported at INFO level when it is
    non-zero. ``na_values`` is only read, never modified.
    """
    is_missing = series.isin(na_values)
    n_missing = is_missing.sum()
    if n_missing > 0:
        logging.info('excluding {} {!r} samples without data in {!r}'.format(n_missing, na_values, series.name))
    return series[~is_missing]
    
exclude_missing(df['TIME_OF_COLLECTION'], na_values=['NOT_COLLECTED',''])

In [None]:
def validate_series(df):
    """Check that the index (SERIES) is exactly 1, 2, ..., n.

    Rows whose index label is not numeric are reported and dropped. The
    remaining labels are then compared with the strings '1'..'n', where n is
    the number of remaining rows; gaps and unexpected labels are reported as
    an error but the rows are kept.

    Returns
    -------
    The frame restricted to rows with numeric SERIES.
    """
    logging.info('validating SERIES')

    # drop rows whose label is not a number
    is_numeric = df.index.astype(str).str.isnumeric()
    if not is_numeric.all():
        logging.error(f'Found and excluded non-numeric SERIES: {df.index[~is_numeric].to_list()}')
        df = df.loc[is_numeric]

    # what is left should be a gap-free run starting at 1
    n_rows = df.shape[0]
    expected = {str(number) for number in range(1, n_rows + 1)}
    observed = set(df.index.astype(str))
    if observed != expected:
        absent = sorted(expected - observed)
        surplus = sorted(observed - expected)
        logging.error(f'In SERIES, {absent} are missing, '
                      f'{surplus} are unexpected')

    return df
        
df = validate_series(df)

In [None]:
def validate_plates_wells(df, plate_col, well_col):
    """Check that every plate is a complete, duplicate-free 96-well plate.

    Rows with an empty plate or well ID are reported and dropped. For each
    remaining plate, duplicated well IDs are reported, and the set of well
    IDs is compared with A1..H12; differences are reported as errors.

    Parameters
    ----------
    df : manifest frame.
    plate_col, well_col : names of the plate-ID and well-ID columns.

    Returns
    -------
    The frame without the rows that had an empty plate or well ID.
    """
    logging.info(f'validating {plate_col} and {well_col}')

    incomplete = (df[plate_col] == '') | (df[well_col] == '')
    if incomplete.any():
        logging.error(f'Found and excluded {incomplete.sum()} empty rows based on {plate_col} and {well_col}')
        df = df.loc[~incomplete]

    logging.info(f'found {df.shape[0]} samples across {df[plate_col].nunique()} plates')

    # well IDs of a full 96-well plate: rows A-H, columns 1-12
    full_plate = {f'{row}{column}' for row in 'ABCDEFGH' for column in range(1, 13)}

    for plate, plate_df in df.groupby(plate_col):
        wells = plate_df[well_col]
        repeated = wells.duplicated()
        if repeated.any():
            logging.error(f'duplicate {well_col} for plate {plate}: {wells[repeated].unique()}')
        present = set(wells)
        if present != full_plate:
            logging.error(f'in {well_col} for plate {plate}, wells {full_plate - present} '
                          f'are missing, wells {present - full_plate} are excessive')

    return df
        
df = validate_plates_wells(df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')

In [None]:
df['SCIENTIFIC_NAME'].value_counts()

In [None]:
## TODO - which columns require NA, do not remove blanks to be able to get taxids for all
def check_blanks(df):
    """Drop blank samples and check that well H12 is blank.

    A row counts as blank when ORGANISM_PART equals 'NOT_APPLICABLE'.
    Non-blank rows sitting in well H12 are reported as an error but are
    still kept in the returned frame.

    Returns
    -------
    The non-blank rows of ``df``.
    """
    logging.info('Checking and excluding blank samples')

    # blank criterion
    blank_mask = df['ORGANISM_PART'] == 'NOT_APPLICABLE'
    kept = df[~blank_mask]

    # the last well of a plate is expected to be a blank
    occupied_h12 = kept[kept['TUBE_OR_WELL_ID'] == 'H12']
    if len(occupied_h12) > 0:
        logging.error('last well H12 is not blank at SERIES {}: in ORGANISM_PART, '
                      'expected "NOT_APPLICABLE", found {}. '
                      'These samples will be included in further analysis.'.format(
                        occupied_h12.index.to_list(),
                        occupied_h12['ORGANISM_PART'].to_list()
        ))

    logging.info('Blanks removed: {} samples of {} left for downstream analysis'.format(
        len(kept), len(df)))

    return kept
print(df.shape)
df = check_blanks(df)
print(df.shape)

In [None]:
def validate_values(col, df, valid_dict, sep=None, na_values=[], level='e'):
    """Report values of column ``col`` that are not listed in ``valid_dict[col]``.

    Parameters
    ----------
    col : column to check.
    df : manifest frame.
    valid_dict : mapping of column name to its allowed values.
    sep : optional separator; when given, each cell is split on it and the
        stripped pieces are checked individually.
    na_values : cell values to ignore.
    level : 'i', 'w' or 'e' - log level (info/warning/error) used for the
        invalid-values report.

    Returns nothing; a missing column (in the manifest or in ``valid_dict``)
    is logged as an error and ends the check early.
    """
    logging.info('validating values in column {!r}'.format(col))

    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    if col not in valid_dict.keys():
        logging.error('{!r} column not found in validation sheet'.format(col))
        return
    assert level in ('i','w','e'), '{!r} invalid logging level for validate_values'.format(level)

    observed = set(exclude_missing(df[col], na_values).unique())
    if sep:
        # multi-valued cells: check every separated piece on its own
        observed = {piece.strip() for value in observed for piece in value.split(sep)}

    unknown = observed - set(valid_dict[col])
    if not unknown:
        return

    report = {'i': logging.info, 'w': logging.warning, 'e': logging.error}[level]
    report('invalid values in {!r}: {}'.format(col, unknown))
#     else:
#         logging.info('all values valid in {!r}'.format(col))
            
validate_values('ORGANISM_PART', df, valid_dict, sep=" | ")

In [None]:
def validate_date(col, df, na_values=[]):
    """Parse column ``col`` as YYYY-MM-DD dates and report problems.

    Values listed in ``na_values`` are ignored. Values that do not parse are
    reported as invalid and left out of the result. Dates later than now or
    earlier than 1900-01-01 are reported as errors but stay in the result.

    Returns
    -------
    Series of the successfully parsed dates, or None when the column is
    missing from ``df``.
    """
    logging.info('validating date column {!r}'.format(col))

    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    raw = exclude_missing(df[col], na_values)

    # anything that does not match the format (including '') becomes NaT
    parsed = pd.to_datetime(raw, format='%Y-%m-%d', errors='coerce')
    unparsed = parsed.isna()
    if unparsed.any():
        logging.error('invalid dates in {!r}: {}'.format(col,
                                                         raw[unparsed].unique()))
    valid_dates = parsed[~unparsed]

    # dates in the future
    in_future = valid_dates > datetime.datetime.today()
    if in_future.any():
        logging.error('future dates in {!r}: {}'.format(col,
            valid_dates[in_future].to_list()))

    # implausibly old dates
    too_old = valid_dates < datetime.datetime(1900, 1, 1)
    if too_old.any():
        logging.error("pre-1900 dates in {!r}: {}".format(col,
            valid_dates[too_old].to_list()))

    return valid_dates
df.loc[1,'DATE_OF_COLLECTION'] = 'NOT_COLLECTED'
validate_date('DATE_OF_COLLECTION', df)

In [None]:
def validate_time(col, df, na_values=[]):
    """Parse column ``col`` as HH:MM:SS times and report invalid entries.

    Values listed in ``na_values`` are ignored. Values that do not parse
    (including empty strings) are reported as an error and left out.

    Returns
    -------
    Series of the successfully parsed times, or None when the column is
    missing from ``df``.
    """
    logging.info('validating time column {!r}'.format(col))

    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    raw = exclude_missing(df[col], na_values)

    # anything that does not match the format (including '') becomes NaT
    parsed = pd.to_datetime(raw, format='%H:%M:%S', errors='coerce')
    unparsed = parsed.isna()
    if unparsed.any():
        logging.error('invalid times in {!r}: {}'.format(col,
                                                         raw[unparsed].unique()))

    return parsed[~unparsed]
# df.loc[1,'TIME_OF_COLLECTION'] = '23'
validate_time('TIME_OF_COLLECTION', df)

In [None]:
def validate_time_period(col, df, na_values=[]):
    """Parse column ``col`` as ISO 8601 durations and report invalid entries.

    Values listed in ``na_values`` are ignored. Each remaining value is
    padded to the day/hour/minute/second form before being handed to
    ``pd.Timedelta`` (e.g. 'PT1H' becomes 'P0DT1H0M0S'). Values that cannot
    be converted, including non-string cells such as NaN, are reported as
    an error and left out of the result. Week and month designators are not
    supported.

    Returns
    -------
    Series of ``pd.Timedelta`` for the valid entries, or None when the
    column is missing from ``df``.
    """
    logging.info('validating time period column {!r}'.format(col))

    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    series = df[col]
    series = exclude_missing(series, na_values)

    def convert_iso_duration(s):
        # NaN never compares equal to itself, so test the type instead;
        # any non-string cell cannot be a valid duration
        if not isinstance(s, str):
            return np.nan
        if not s.startswith('P') or 'T' not in s:
            return np.nan
        # add the day component when it is missing
        if s.startswith('PT'):
            s = 'P0DT' + s[2:]
        # add trailing minutes and seconds
        if s.endswith('H'):
            s += '0M0S'
        elif s.endswith('M'):
            s += '0S'
        try:
            return pd.Timedelta(s)
        except (ValueError, TypeError, OverflowError):
            return np.nan

    time_period_series = series.apply(convert_iso_duration)
    if time_period_series.isna().any():
        logging.error('invalid times in {!r}: {}'.format(col,
            series[time_period_series.isna()].unique()))
    valid_time_period_series = time_period_series[~time_period_series.isna()]
    return valid_time_period_series

# df.loc[1,'DURATION_OF_COLLECTION'] = 'PVT1H'
validate_time_period('DURATION_OF_COLLECTION', df);
# df['DURATION_OF_COLLECTION']

In [None]:
# to be replaced/supported by w3w check
def check_location(df, fn, na_values=[]):
    """Compare the partner-supplied country with the country of the coordinates.

    The country is taken from the part of COLLECTION_LOCATION before the
    first '|'. Coordinates are reverse-geocoded through Nominatim; the raw
    results are cached in ``fn + '_loc.pkl'`` and that cache is reused on
    later runs without re-querying.

    Parameters
    ----------
    df : manifest frame with COLLECTION_LOCATION, DECIMAL_LATITUDE and
        DECIMAL_LONGITUDE columns.
    fn : manifest path, used only to derive the cache file name.
    na_values : rows whose three location cells are all in this list are
        left out of the analysis.

    Returns
    -------
    Frame with the three location columns plus 'coord', 'partner_country'
    and 'coord_country', or None when a required column is missing.
    """
    logging.info('validating country with coordinates')

    loc_col, lat_col, lon_col = 'COLLECTION_LOCATION', 'DECIMAL_LATITUDE', 'DECIMAL_LONGITUDE'

    if any(c not in df.columns for c in (loc_col, lat_col, lon_col)):
        logging.error('One of {!r} {!r} {!r} columns not found in manifest'.format(loc_col, lat_col, lon_col))
        return
    loc_df_complete = df[[loc_col, lat_col, lon_col]].copy()

    loc_df_isna = (loc_df_complete.isin(na_values)).all(axis=1)
    if loc_df_isna.any():
        logging.info('removing {} {!r} samples with missing data from coordinate analysis'.format(
                loc_df_isna.sum(), na_values))
    loc_df_complete = loc_df_complete[~loc_df_isna].copy()

    # coordinates in geopy format: 'lat, lon'
    # (vectorised so that an empty frame still yields a proper column)
    loc_df_complete['coord'] = (loc_df_complete[lat_col].astype(str) + ', '
                                + loc_df_complete[lon_col].astype(str))

    # get location data for coordinates
    # use local copy of web query results for re-runs
    loc_fn = fn+'_loc.pkl'
    if os.path.isfile(loc_fn):
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # cache files produced by this function
        with open(loc_fn, "rb") as cache_file:
            locations = pickle.load(cache_file)
        # a cache written for an earlier version of the manifest does not
        # cover coordinates added since; those would otherwise go unchecked
        uncached = set(loc_df_complete['coord']) - set(locations)
        if uncached:
            logging.warning('coordinates {} not found in cached locations {!r}, '
                            'delete the cache file to re-query'.format(sorted(uncached), loc_fn))
    else:
        # web map server - openstreetmaps
        logging.info('querying coordinates')
        locator = Nominatim(user_agent='myGeocoder')
        rgeocode = RateLimiter(locator.reverse, min_delay_seconds=1)

        locations = dict()
        for c in loc_df_complete.coord.unique():
            # pre-fill with unknown country
            locations[c] = {'address':{'country':'UNKNOWN'}}
            # check coordinate correctness
            try:
                lat, lon = c.split(', ')
                lat, lon = float(lat), float(lon)
            except ValueError:
                logging.error('problem parsing coordinates {!r}'.format(c))
                continue
            # written as "not <=" so that NaN is rejected as well
            if not abs(lat) <= 90:
                logging.error('invalid latitude {}, should be in [-90,90]'.format(lat))
                continue
            if not abs(lon) <= 180:
                logging.error('invalid longitude {}, should be in [-180,180]'.format(lon))
                continue
            # web query
            location = rgeocode(c, language='en-gb')
            # rgeocode returns empty location outside of countries and in some other situations
            if location is not None:
                locations[c] = location.raw

        # save locations to file
        with open(loc_fn, "wb") as cache_file:
            pickle.dump(locations, cache_file)

    # parse country from partner input
    loc_df_complete['partner_country'] = loc_df_complete[loc_col].apply(lambda x: x.split('|')[0].strip().upper())

    # extract countries from location data
    loc_countries = dict()
    for coord in locations.keys():
        # a geocoding result may lack an address or a country entry
        coord_country = locations[coord].get('address', {}).get('country', 'UNKNOWN').upper()
        loc_countries[coord] = coord_country

        partner_countries = loc_df_complete.loc[loc_df_complete.coord == coord, 'partner_country']
        if partner_countries.nunique() > 1:
            logging.error('multiple partner countries for coordinates {!r}: {}'
                          'skipping coordinate validation'.format(
                                coord, partner_countries.unique()))
            continue
        if partner_countries.shape[0] == 0:
            logging.error('no partner location found for coordinates {!r}'.format(coord))
            continue
        partner_country = partner_countries.iloc[0]
        if coord_country == 'UNKNOWN':
            logging.warning('could not locate country for coordinates {!r}, partner country {!r}'.format(
                    coord, partner_country))
        elif partner_country != coord_country:
            logging.error('country mismatch for coordinates {!r}, partner country {!r}, '
                          'coordinate country {!r}'.format(coord, partner_country, coord_country))

    # countries based on coordinates
    loc_df_complete['coord_country'] = loc_df_complete['coord'].replace(loc_countries)

    # location data can be re-used, e.g. as an additional field
    return loc_df_complete
# df.loc[2,'DECIMAL_LATITUDE'] = '65'
loc_test = check_location(df, fn)
loc_test

In [None]:
def validate_ncbi_taxonomy_anospp(df, ncbi, na_values = []):
    """Check the names in PREDICTED_SCIENTIFIC_NAME against NCBI Taxonomy.

    Names are normalised to 'Capitalised lower case' before lookup (a case
    mismatch is reported as an error). Names not found in NCBI are reported,
    and for each found name the rank of the chosen taxid is compared with
    'species'. Results are only logged; nothing is returned.

    Parameters
    ----------
    df : manifest frame.
    ncbi : object providing ``get_name_translator`` and ``get_rank``
        (an ``ete3.NCBITaxa`` instance in this notebook).
    na_values : names to leave out of the check.
    """
    logging.info('validating species taxonomy against NCBI')

    # defined up front: it is needed by the log messages below
    tax_level = 'SPECIES'
    expected_rank = tax_level.lower()
    name_col = 'PREDICTED_SCIENTIFIC_NAME'

    if name_col not in df.columns:
        logging.error(f'{name_col} column not found in manifest')
        return

    tax_names = [name for name in df[name_col].unique() if name not in na_values]

    for i, tax_name in enumerate(tax_names):
        if len(tax_name) == 0:
            continue
        corr_tax_name = tax_name[0].upper() + tax_name[1:].lower()
        if corr_tax_name != tax_name and tax_name != 'blank sample':
            logging.error(f'{tax_level}: unexpected case for "{tax_name}", '
                          f'changing to "{corr_tax_name}" for validation')
        tax_names[i] = corr_tax_name

    tax_info = dict()
    tax_info[tax_level] = ncbi.get_name_translator(tax_names)

    unmatched_names = set(tax_names) - set(tax_info[tax_level].keys())
    if len(unmatched_names) > 0:
        logging.error(f'{tax_level}: {unmatched_names} not found in NCBI Taxonomy')

    for tname, tids in tax_info[tax_level].items():

        ranks = ncbi.get_rank(tids)

        # default to the first taxid returned for the name
        upd_tid = tids[0]

        if len(tids) == 1:
            if ranks[upd_tid] != expected_rank:
                # TODO warning->info for ORDER
                logging.warning(f'{tax_level}: found unexpected rank for {tname} (taxid {upd_tid}): {ranks[upd_tid]}')
        if len(tids) > 1:
            # several taxids share the name: prefer the first of the expected rank
            for tid, r in ranks.items():
                if r == expected_rank:
                    logging.info(f'{tax_level}: using only first matching rank for {tname} (taxid {tid}): {r}')
                    upd_tid = tid
                    break
            else:
                logging.warning(f'{tax_level}: could not find matching rank for {tname}, '
                                f'using (taxid {upd_tid}): {ranks[upd_tid]}')

        tax_info[tax_level][tname] = upd_tid

In [None]:
def validate_ncbi_taxonomy(df, ncbi, na_values = []):
    """Validate ORDER / FAMILY / GENUS / SCIENTIFIC_NAME against NCBI Taxonomy.

    For every taxonomy column present in ``df``:

    * names are normalised to 'Capitalised lower case' before lookup
      (a case mismatch is reported as an error);
    * names not found in NCBI are reported;
    * one taxid per name is chosen, preferring the expected rank.

    When all four columns are present, each distinct combination is also
    checked for consistency (family within order, genus within family and
    order, species within genus, family and order) using NCBI lineages.

    Finally a ``<COLUMN>_TAXID`` column is added to ``df`` (in place) for
    every taxonomy column present, mapping matched names to taxids and
    leaving other values as they are.

    Parameters
    ----------
    df : manifest frame.
    ncbi : object providing ``get_name_translator``, ``get_rank`` and
        ``get_lineage`` (an ``ete3.NCBITaxa`` instance in this notebook).
    na_values : names to leave out of the checks.

    Returns
    -------
    ``df`` with the added taxid columns.
    """
    logging.info('validating taxonomy against NCBI')

    tax_columns = [
        'ORDER',
        'FAMILY',
        'GENUS',
        'SCIENTIFIC_NAME'
    ]

    # report missing columns up front; indexing df with a missing column
    # would raise before any per-column check could run
    present_columns = []
    for tax_level in tax_columns:
        if tax_level in df.columns:
            present_columns.append(tax_level)
        else:
            logging.error(f'{tax_level} column not found in manifest')
    if not present_columns:
        return df

    hierarchies = df[present_columns].drop_duplicates().copy()

    tax_info = dict()

    for tax_level in present_columns:

        logging.info(f'validating {tax_level} against NCBI')

        tax_names = [name for name in hierarchies[tax_level].unique() if name not in na_values]

        for i, tax_name in enumerate(tax_names):
            if len(tax_name) == 0:
                continue
            corr_tax_name = tax_name[0].upper() + tax_name[1:].lower()
            if corr_tax_name != tax_name and tax_name != 'blank sample':
                logging.error(f'{tax_level}: unexpected case for "{tax_name}", '
                              f'changing to "{corr_tax_name}" for validation')
            tax_names[i] = corr_tax_name

        tax_info[tax_level] = ncbi.get_name_translator(tax_names)

        unmatched_names = set(tax_names) - set(tax_info[tax_level].keys())
        if len(unmatched_names) > 0:
            logging.error(f'{tax_level}: {unmatched_names} not found in NCBI Taxonomy')

        expected_rank = 'species' if (tax_level == 'SCIENTIFIC_NAME') else tax_level.lower()

        for tname, tids in tax_info[tax_level].items():

            ranks = ncbi.get_rank(tids)

            # default to the first taxid returned for the name
            upd_tid = tids[0]

            if len(tids) == 1:
                if ranks[upd_tid] != expected_rank:
                    # TODO warning->info for ORDER
                    logging.warning(f'{tax_level}: found unexpected rank for {tname} (taxid {upd_tid}): {ranks[upd_tid]}')
            if len(tids) > 1:
                # several taxids share the name: prefer the first of the expected rank
                for tid, r in ranks.items():
                    if r == expected_rank:
                        logging.info(f'{tax_level}: using only first matching rank for {tname} (taxid {tid}): {r}')
                        upd_tid = tid
                        break
                else:
                    logging.warning(f'{tax_level}: could not find matching rank for {tname}, '
                                    f'using (taxid {upd_tid}): {ranks[upd_tid]}')

            # replace the list of candidate taxids with the chosen one
            tax_info[tax_level][tname] = upd_tid

    # check correctness of taxonomy; needs the complete hierarchy
    if len(present_columns) < len(tax_columns):
        logging.warning('skipping taxonomy consistency check, not all of {} are present'.format(tax_columns))
    else:
        for _, r in hierarchies.iterrows():

            if r.ORDER in na_values:
                continue
            try:
                order_id = tax_info['ORDER'][r.ORDER]
            except KeyError:
                logging.info(f'cannot validate ORDER for "{r.ORDER}", skipping taxonomy consistency check')
                continue

            if r.FAMILY in na_values:
                continue
            try:
                family_id = tax_info['FAMILY'][r.FAMILY]
            except KeyError:
                logging.info(f'cannot validate FAMILY for "{r.FAMILY}", skipping taxonomy consistency check')
                continue
            family_lineage = ncbi.get_lineage(family_id)
            if order_id not in family_lineage:
                logging.error(f'Family {r.FAMILY} (taxid {family_id}) does not belong to {r.ORDER} (taxid {order_id})')

            if r.GENUS in na_values:
                continue
            try:
                genus_id = tax_info['GENUS'][r.GENUS]
            except KeyError:
                logging.info(f'cannot validate GENUS for "{r.GENUS}", skipping taxonomy consistency check')
                continue
            genus_lineage = ncbi.get_lineage(genus_id)
            if order_id not in genus_lineage:
                logging.error(f'Genus {r.GENUS} (taxid {genus_id}) does not belong to {r.ORDER} (taxid {order_id})')
            if family_id not in genus_lineage:
                logging.error(f'Genus {r.GENUS} (taxid {genus_id}) does not belong to {r.FAMILY} (taxid {family_id})')

            if r.SCIENTIFIC_NAME in na_values:
                continue
            try:
                species_id = tax_info['SCIENTIFIC_NAME'][r.SCIENTIFIC_NAME]
            except KeyError:
                logging.info(f'cannot validate SCIENTIFIC_NAME for "{r.SCIENTIFIC_NAME}", skipping taxonomy consistency check')
                continue
            species_lineage = ncbi.get_lineage(species_id)
            if order_id not in species_lineage:
                logging.error(f'Species {r.SCIENTIFIC_NAME} (taxid {species_id}) does not belong to {r.ORDER} (taxid {order_id})')
            if family_id not in species_lineage:
                logging.error(f'Species {r.SCIENTIFIC_NAME} (taxid {species_id}) does not belong to {r.FAMILY} (taxid {family_id})')
            if genus_id not in species_lineage:
                logging.error(f'Species {r.SCIENTIFIC_NAME} (taxid {species_id}) does not belong to {r.GENUS} (taxid {genus_id})')

    for tc in present_columns:
        df[f'{tc}_TAXID'] = df[tc].replace(tax_info[tc])

    return df
        
                
validate_ncbi_taxonomy(df, ncbi)

In [None]:
def validate_int(col, df, na_values=[]):
    """Report values of column ``col`` that ``int()`` cannot convert.

    Values listed in ``na_values`` are ignored. Each distinct remaining
    value is tried once; failures are logged as errors. Nothing is returned.
    """
    logging.info(f'validating int format in {col}')

    if col not in df.columns:
        logging.error(f'{col} column not found in manifest')
        return
    series = df[col]
    series = exclude_missing(series, na_values)

    for val in series.unique():
        try:
            int(val)
        # ValueError: text that is not an integer; TypeError: non-text,
        # non-numeric cell (e.g. None)
        except (ValueError, TypeError):
            logging.error(f'found non-integer value in {col}: "{val}"')
validate_int('TIME_ELAPSED_FROM_COLLECTION_TO_PLATING', df)

In [None]:
bd = validate_date('DATE_OF_COLLECTION', df)
ad = validate_date('DATE_OF_PRESERVATION', df)

In [None]:
ctdf = pd.concat([bd, ad], axis=1)

In [None]:
ctdf.iloc[:, 0] > ctdf.iloc[:, 1]

In [None]:
def compare_dates(before, after):
    """Report rows where the ``before`` date is later than the ``after`` date.

    Parameters
    ----------
    before, after : date Series sharing the manifest index, as returned by
        ``validate_date``. Either may be None (which ``validate_date``
        returns for a missing column); the check is then skipped.

    The two series are aligned on their index; rows present in only one of
    them are not reported. Nothing is returned.
    """
    if before is None or after is None:
        logging.warning('skipping date order check, one of the date columns is not available')
        return

    logging.info(f'checking that {before.name} are earlier than {after.name}')

    ctdf = pd.concat([before, after], axis=1)
    # select by position so that two series with the same name still work
    date_conflict = ctdf.iloc[:, 0] > ctdf.iloc[:, 1]

    if date_conflict.any():
        logging.error(f'{before.name} values are later than {after.name} for SERIES'
                      f' {ctdf[date_conflict].index.to_list()}')

In [None]:
raise Exception('Done for now')

In [None]:
def validate_bioscan(fn, template_fn, verbose=False, version='1.0'):
    '''
    Validate a filled BIOSCAN manifest against the template workbook.

    Validation follows the order of columns in the data entry sheet.
    Problems are reported through the logging module; the function itself
    does not fail on invalid manifest content.

    Parameters
    ----------
    fn : path to the filled manifest workbook.
    template_fn : path to the template workbook, which supplies the expected
        columns and the allowed values.
    verbose : if true, INFO messages are shown in addition to warnings and
        errors.
    version : version label written to the start/end log messages.

    Returns
    -------
    The manifest frame after exclusion of invalid SERIES, empty rows and
    blanks, with the taxid columns added by the taxonomy validation.
    '''

    setup_logging(verbose=verbose)

    logging.info(f'# started validate_partner_manifest_v.{version}')
    # logged as a warning so that the manifest name shows in non-verbose mode
    logging.warning(f'# manifest {fn}')

    # read data
    df = get_data(fn)

    # read taxonomy
    ncbi = ete3.NCBITaxa()

    # prepare for validation
    template_df = get_data(template_fn)
    check_columns(df, template_df)
    valid_dict = get_valid_dict(template_fn)

    # orange cols
    # exclude empty series
    df = validate_series(df)
    df = validate_plates_wells(df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')

    # check and exclude blanks
    df = check_blanks(df)

    validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)
    # CATCH_LOT not checked TODO do not allow missing
    validate_values('BOTTLE_DIRECTION', df, valid_dict)
    validate_values('ORGANISM_PART', df, valid_dict, sep='|')
    validate_values('HAZARD_GROUP', df, valid_dict)
    validate_values('REGULATORY_COMPLIANCE', df, valid_dict)
    date_coll = validate_date('DATE_OF_COLLECTION', df, na_values=['NOT_COLLECTED'])
    check_location(df, fn)

    # purple cols
    # taxonomy validation adds a few columns
    df = validate_ncbi_taxonomy(df, ncbi, na_values = ['NOT_COLLECTED'])
    validate_values('SEX', df, valid_dict)
    # HABITAT not checked
    validate_time('TIME_OF_COLLECTION', df)
    validate_time_period('DURATION_OF_COLLECTION', df, na_values=['NOT_COLLECTED'])
    validate_values('COLLECTION_METHOD', df, valid_dict)
    # DESCRIPTION_OF_COLLECTION_METHOD not checked
    validate_int('TIME_ELAPSED_FROM_COLLECTION_TO_PLATING', df, na_values=[''])
    # PHOTOGRAPH_* columns not checked
    # VOUCHER_ID not checked
    # PRESERVATION_APPROACH not checked - should match DATE_OF_PRESERVATION
    date_pres = validate_date('DATE_OF_PRESERVATION', df, na_values=['']) # allow for empty values unlike DATE_OF_COLLECTION
    # validate_date returns None when its column is missing (already logged)
    if date_coll is not None and date_pres is not None:
        compare_dates(before=date_coll, after=date_pres)
    # COLLECTOR_SAMPLE_ID not checked
    validate_int('ELEVATION', df, na_values=[''])
    # OTHER_INFORMATION	MISC_METADATA	IDENTIFIED_BY	IDENTIFIER_AFFILIATION	IDENTIFIED_HOW not checked

    logging.info('# ended validate_partner_manifest_v.{}'.format(version))

    return df

# the cells below call the validator under its short name
validate = validate_bioscan

# fn = '../../results/partner_manifests/IRD-Neandersquito_T222Amplicon_Manifest_V2.0.xlsx'
df = validate(fn, template_fn, verbose=True)


## Real manifests validation starts here - now also moved to separate notebook

In [None]:
df = validate('../results/20220322/NBGW-[20210805]-manifest.xlsx', template_fn, verbose=False)

In [None]:
df = validate('../results/20220322/NBGW-[20220105]-manifest.xlsx', template_fn, verbose=False)

In [None]:
# test columns addition
df = validate('../results/20220315/Sam_NHM-BIOSCAN-Manifest Jan 2022.xlsx', template_fn, verbose=False)

In [None]:
# test columns addition
df = validate('../results/20220303_add_tax/final-CARR-20210727-manifestV1.xlsx', template_fn, verbose=False)
df.to_csv('../results/20220303_add_tax/final-CARR-20210727-manifestV1.taxids.csv')

In [None]:
import glob
fns = list(glob.glob('../results/20220304/*.xlsx'))
fns.sort()
fns

In [None]:
df = validate(fns[0], template_fn, verbose=False)

In [None]:
df = validate(fns[1], template_fn, verbose=False)

In [None]:
df = validate(fns[2], template_fn, verbose=False)

In [None]:
df = validate(fns[3], template_fn, verbose=False)

In [None]:
df = validate(fns[4], template_fn, verbose=False)

In [None]:
df = validate(fns[5], template_fn, verbose=False)

In [None]:
df = validate(fns[6], template_fn, verbose=False)

In [None]:
df = validate('../results/20220301/CARR-20210727-manifest.xlsx', template_fn, verbose=False)

In [None]:
df = validate('../results/20220301/NSCR-20220123-manifest (A3642498).xlsx', template_fn, verbose=False)

In [None]:
df = validate('../results/20220301/YARN-20211205-manifest.xlsx', template_fn, verbose=False)

In [None]:
df = validate('../results/August 2021 Manifest (Bill _ Fred).xlsx', template_fn, verbose=False)

In [None]:
df = validate('../results/Mike Ashworth NE 2021-05-28 corrected BIOSCAN_Manifest_V1.0.xlsx', template_fn, verbose=False)

In [None]:
df = validate('../results/Mike Ashworth NE 2021-06-24 BIOSCAN_Manifest_V1.0.xlsx', template_fn, verbose=False)

In [None]:
df = validate('../results/NE BIOSCAN_Manifest_V1.0_Yarner_260621.xlsx', template_fn, verbose=False)

In [None]:
df = validate('../results/NatureScot Working Copy_ of BIOSCAN_Manifest_V1.0 (A3484399).xlsx', template_fn, verbose=False)

In [None]:
df = validate('../results/Shap 2021-05-28 corrected.xlsx', template_fn, verbose=False)

In [None]:
df = validate('../results/Bioscan metadata_Jan_22_WYTHAM_WOODS.xlsx', template_fn, verbose=False)

In [None]:
df = validate('../results/NE BIOSCAN_Manifest_V1.0_Yarner_2021.xlsx', template_fn, verbose=False)

In [None]:
df = validate('../results/CARR-20210727-manifest.xlsx', template_fn, verbose=False)