In [1]:
from datetime import date
import itertools
import logging
import pandas as pd
import numpy as np
import datetime
import glob
import os
import pickle
import ete3
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [2]:
logging.getLogger().setLevel(logging.INFO)
# logging.getLogger().setFormat('[%(levelname)s] %(message)s')

def setup_logging(verbose=False):
    """(Re)configure the root logger for use inside the notebook.

    Parameters
    ----------
    verbose : bool
        When True the root logger is set to INFO, otherwise to WARNING.

    All handlers currently attached to the root logger are detached first,
    because ``logging.basicConfig`` does nothing while the root logger
    already has handlers; this makes the cell safe to re-run.
    """
    root_logger = logging.getLogger()
    # iterate over a copy: removeHandler mutates the underlying list
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)

    level = logging.INFO if verbose else logging.WARNING
    logging.basicConfig(level=level, format='[%(levelname)s] %(message)s')
setup_logging(verbose=True)   
logging.info('test')

[INFO] test


In [3]:
anospp_fn = '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'

In [4]:
# download and install taxonomy
ncbi = ete3.NCBITaxa()
# run update if needed
# ncbi.update_taxonomy_database()

In [5]:
def get_data(fn, sheet='TAB 2 Metadata Entry'):
    """Load one sheet of the manifest workbook as an all-string DataFrame.

    Parameters
    ----------
    fn : str
        Path to the Excel workbook.
    sheet : str
        Name of the sheet to read.

    Returns
    -------
    pandas.DataFrame
        Sheet contents with every cell read as ``str``, the first column
        used as the index and empty cells kept as ``''`` (default NA
        parsing is disabled).
    """
    logging.info('reading data from {!r}'.format(fn))

    read_options = dict(
        dtype=str,
        index_col=0,
        keep_default_na=False,
        sheet_name=sheet,
    )
    return pd.read_excel(fn, **read_options)
df = get_data(anospp_fn, sheet='TAB 3 TEST Metadata Entry')

[INFO] reading data from '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'


In [6]:
def remove_trailing_spaces(df):
    """Strip leading/trailing whitespace from every text column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Manifest table; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with affected values stripped.

    A warning listing the affected index labels is logged for each column
    in which at least one value changes. Detection compares each value with
    its stripped form, so any whitespace that ``str.strip`` removes (spaces,
    tabs, newlines) is both reported and removed. Columns that do not hold
    text are skipped.
    """
    for col in df.columns:
        column = df[col]
        try:
            stripped = column.str.strip()
        except AttributeError:
            # the .str accessor is unavailable: not a text column
            continue

        # non-string cells come back from .str.strip() as NA - leave them alone
        changed = stripped.notna() & (stripped != column)
        if changed.any():
            logging.warning('trailing spaces found in column {!r}, SERIES {}. Removing for validation'.format(col,
                df.loc[changed].index.to_list()))
            # only overwrite the cells that actually changed
            df[col] = column.where(~changed, stripped)

    return df
df = remove_trailing_spaces(df)



In [7]:
template_df = get_data(anospp_fn, sheet='TAB 2 Metadata Entry')

[INFO] reading data from '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'


In [8]:
def check_columns(df, template_df):
    """Compare the column names of a filled manifest with the template.

    Parameters
    ----------
    df : pandas.DataFrame
        Filled manifest.
    template_df : pandas.DataFrame
        Template sheet whose columns are the expected ones.

    Logs a warning for columns present only in ``df`` and an error for
    template columns absent from ``df``. Returns nothing.
    """
    logging.info('checking manifest columns against template')

    manifest_columns = set(df.columns)
    expected_columns = set(template_df.columns)

    extra_columns = manifest_columns - expected_columns
    missing_columns = expected_columns - manifest_columns

    if extra_columns:
        logging.warning('extra columns in filled manifest compared to template: {}'.format(extra_columns))
    if missing_columns:
        logging.error('template columns missing from filled manifest: {}'.format(missing_columns))
check_columns(df, template_df)

[INFO] checking manifest columns against template


In [9]:
def get_valid_dict(fn, validation_sheet='Data Validation - do not edit'):
    """Read the allowed values for each column from the validation sheet.

    Parameters
    ----------
    fn : str
        Path to the Excel workbook.
    validation_sheet : str
        Name of the sheet that lists the permitted values, one column per
        manifest field.

    Returns
    -------
    dict
        Maps each column name of the validation sheet to the list of its
        non-missing values (read as strings).
    """
    logging.info('extracting value validation data from {!r}'.format(fn))
    validation_table = pd.read_excel(fn, dtype=str, sheet_name=validation_sheet)

    # columns have different lengths in the sheet, so padding cells are dropped
    return {
        column: validation_table[column].dropna().to_list()
        for column in validation_table.columns
    }
valid_dict = get_valid_dict(anospp_fn, validation_sheet='TAB 5 Data Validation - do not ')

[INFO] extracting value validation data from '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'


In [10]:
def exclude_missing(series, na_values=[]):
    """Drop the entries of ``series`` whose value is an accepted placeholder.

    Parameters
    ----------
    series : pandas.Series
        Column to filter.
    na_values : list
        Values that count as "no data" (e.g. ``''``). When empty the series
        is returned untouched and nothing is logged.

    Returns
    -------
    pandas.Series
        ``series`` without the entries found in ``na_values``.
    """
    if len(na_values) == 0:
        return series

    is_missing = series.isin(na_values)
    logging.info('excluding {} {!r} samples without data in {!r}'.format(is_missing.sum(), na_values, series.name))
    return series[~is_missing]
    
exclude_missing(df['TIME_OF_COLLECTION'], na_values=['NOT_COLLECTED',''])

[INFO] excluding 675 ['NOT_COLLECTED', ''] samples without data in 'TIME_OF_COLLECTION'


SERIES
1     18:00:00
2     22:43:00
3     04:00:00
4     04:00:00
5      "33.00"
        ...   
91    09:00:00
92    09:00:00
93    09:00:00
94    09:00:00
95    09:00:00
Name: TIME_OF_COLLECTION, Length: 95, dtype: object

In [11]:
def validate_series(df):
    """Check that the SERIES index runs 1, 2, ..., n without gaps or repeats.

    Parameters
    ----------
    df : pandas.DataFrame
        Manifest indexed by SERIES.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to rows whose SERIES label is numeric.

    Errors are logged for duplicated labels, for non-numeric labels (which
    are dropped) and for any difference between the remaining labels and the
    expected ``'1'..'n'`` where ``n`` is the number of remaining rows.
    """
    logging.info('validating SERIES')

    duplicated_mask = df.index.duplicated()
    if duplicated_mask.any():
        logging.error('duplicate SERIES: {}'.format(df.index[duplicated_mask].to_list()))

    # drop rows whose SERIES label is not made of digits
    is_numeric = df.index.astype(str).str.isnumeric()
    if not is_numeric.all():
        logging.error(f'Found and excluded non-numeric SERIES: {df.index[~is_numeric].to_list()}')
        df = df.loc[is_numeric]

    # the surviving labels must be exactly 1..number_of_rows
    row_count = df.shape[0]
    expected_labels = {str(number) for number in range(1, row_count + 1)}
    observed_labels = set(df.index.astype(str))
    if observed_labels != expected_labels:
        missing_labels = sorted(expected_labels - observed_labels)
        unexpected_labels = sorted(observed_labels - expected_labels)
        logging.error(f'In SERIES, {missing_labels} are missing, {unexpected_labels} are unexpected')

    return df
        
df = validate_series(df)

[INFO] validating SERIES
[ERROR] In SERIES, ['770'] are missing, ['771'] are unexpected


In [12]:
def validate_plates_wells(df, plate_col, well_col, bioscan=False):
    """Check that every plate in the manifest is a complete 96-well plate.

    Parameters
    ----------
    df : pandas.DataFrame
        Manifest table.
    plate_col : str
        Name of the column holding the plate identifier.
    well_col : str
        Name of the column holding the well identifier.
    bioscan : bool
        When True, a plate containing a ``'PLATE_ONLY'`` well is expanded to
        96 rows copied from its first row, wells are set to A1..H12 and the
        last row (H12) is turned into a blank: ``ORGANISM_PART`` becomes
        ``'NOT_APPLICABLE'`` and every column other than the identifying
        ones is emptied.

    Returns
    -------
    pandas.DataFrame
        ``df`` without rows that lack a plate or well id. With ``bioscan``
        the per-plate frames (including expanded ones) are concatenated,
        which orders rows by plate.

    Errors are logged for rows without plate/well, duplicated wells within a
    plate and any difference between a plate's wells and the 96 expected.
    """
    logging.info(f'validating {plate_col} and {well_col}')

    empty_rows = (df[plate_col] == '') | (df[well_col] == '')
    if empty_rows.any():
        logging.error(f'Found and excluded {empty_rows.sum()} empty rows based on {plate_col} and {well_col}')
        df = df.loc[~empty_rows]

    # canonical 96-well layout: A1..A12, B1..B12, ..., H1..H12
    expected_wells = [row + str(column)
                      for row, column in itertools.product('ABCDEFGH', range(1, 13))]
    # kept separate from the ordered list, which is still needed to fill
    # the wells of every PLATE_ONLY plate met later in the loop
    expected_well_set = set(expected_wells)

    # columns that keep their value in the H12 blank of an expanded plate
    preserved_cols = {'SERIES', 'CATCH_LOT', 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID',
                      'ORGANISM_PART', 'PRESERVATIVE_SOLUTION', plate_col, well_col}

    pdfs = []

    for plate, pdf in df.groupby(plate_col):
        # If plate level only metadata is being entered
        # put "PLATE_ONLY" in well
        # and use only one row to capture the metadata for the whole plate
        if bioscan and (pdf[well_col] == 'PLATE_ONLY').any():
            # elevate to warning
            logging.warning(f'found PLATE_ONLY plate {plate}, expanding to well-level')
            if pdf.shape[0] > 1:
                logging.error(f'too many rows in PLATE_ONLY plate {plate}, expected one')
            # expand to 96 rows, all copies of the first row of the plate
            pdf = pdf.iloc[[0] * len(expected_wells)].copy()
            pdf[well_col] = expected_wells
            # H12 blank: positional writes, because the expanded frame
            # repeats one index label and chained assignment is unreliable
            last_row = len(pdf) - 1
            for col_pos, col in enumerate(pdf.columns):
                if col == 'ORGANISM_PART':
                    pdf.iloc[last_row, col_pos] = 'NOT_APPLICABLE'
                elif col not in preserved_cols:
                    pdf.iloc[last_row, col_pos] = ''

        dup_wells = pdf[well_col].duplicated()
        if dup_wells.any():
            duplicated_ids = pdf[well_col][dup_wells.to_numpy()].unique()
            logging.error(f'duplicate {well_col} for plate {plate}: {duplicated_ids}')

        observed_wells = set(pdf[well_col])
        if observed_wells != expected_well_set:
            logging.error(f'in {well_col} for plate {plate}, wells {expected_well_set - observed_wells} '
                          f'are missing, wells {observed_wells - expected_well_set} are excessive')
        pdfs.append(pdf)

    # pd.concat raises on an empty list, so keep df as is when no plate was found
    if bioscan and pdfs:
        df = pd.concat(pdfs)

    logging.info(f'found {df.shape[0]} samples across {df[plate_col].nunique()} plates')

    return df
        
df = validate_plates_wells(df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')

[INFO] validating RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[ERROR] Found and excluded 482 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID
[INFO] found 288 samples across 3 plates


In [13]:
## TODO - which columns require NA, do not remove blanks to be able to get taxids for all
def check_blanks(df):
    """Flag blank samples and check that well H12 of each plate is blank.

    Parameters
    ----------
    df : pandas.DataFrame
        Manifest with ``ORGANISM_PART``, ``TUBE_OR_WELL_ID`` and
        ``RACK_OR_PLATE_ID`` columns.

    Returns
    -------
    pandas.Series
        Boolean mask, True where ``ORGANISM_PART`` is ``'NOT_APPLICABLE'``.

    An error is logged for every H12 well that is not marked as blank; a
    warning summarising the blank count is always logged. ``df`` itself is
    not filtered here.
    """
    logging.info('Checking and excluding blank samples')

    # a sample is a blank when its organism part is NOT_APPLICABLE
    is_blank = df['ORGANISM_PART'] == 'NOT_APPLICABLE'
    samples = df[~is_blank]

    # last well of plate expected to be blank
    occupied_h12 = samples[samples['TUBE_OR_WELL_ID'] == 'H12']
    if len(occupied_h12) > 0:
        message = ('last well H12 is not blank at SERIES {}: in ORGANISM_PART, '
                   'expected "NOT_APPLICABLE", found {}. '
                   'These samples will be included in further analysis.')
        logging.error(message.format(occupied_h12.index.to_list(),
                                     occupied_h12['ORGANISM_PART'].to_list()))

    # raise to warning for sanity check
    summary = '{} blanks located across {} plates, {} samples of {} left for downstream analysis'
    logging.warning(summary.format(is_blank.sum(),
                                   df['RACK_OR_PLATE_ID'].nunique(),
                                   samples.shape[0],
                                   df.shape[0]))

    return is_blank
print(df.shape)
is_blank = check_blanks(df)
print(df[~is_blank].shape)

[INFO] Checking and excluding blank samples
[ERROR] last well H12 is not blank at SERIES [288]: in ORGANISM_PART, expected "NOT_APPLICABLE", found ['TEST']. These samples will be included in further analysis.


(288, 35)
(286, 35)


In [14]:
def validate_values(col, df, valid_dict, sep=None, na_values=[], level='e'):
    """Check the values of a column against the allowed list.

    Parameters
    ----------
    col : str
        Column to validate; must exist in both ``df`` and ``valid_dict``.
    df : pandas.DataFrame
        Manifest table.
    valid_dict : dict
        Maps column names to lists of permitted values.
    sep : str or None
        When given, each cell is split on ``sep`` and every stripped part is
        validated separately.
    na_values : list
        Placeholder values excluded before validation.
    level : {'i', 'w', 'e'}
        Logging level (info / warning / error) used to report invalid values.

    Returns nothing; problems are reported through logging only.
    """
    logging.info('validating values in column {!r}'.format(col))

    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    if col not in valid_dict.keys():
        logging.error('{!r} column not found in validation sheet'.format(col))
        return
    assert level in ('i','w','e'), '{!r} invalid logging level for validate_values'.format(level)

    reporters = {'i': logging.info, 'w': logging.warning, 'e': logging.error}

    observed = set(exclude_missing(df[col], na_values).unique())
    if sep:
        # multi-valued cells: validate each separated part on its own
        observed = {part.strip() for value in observed for part in value.split(sep)}

    invalid_values = observed - set(valid_dict[col])
    if len(invalid_values) > 0:
        reporters[level]('invalid values in {!r}: {}'.format(col, invalid_values))
#     else:
#         logging.info('all values valid in {!r}'.format(col))
            
validate_values('ORGANISM_PART', df, valid_dict, sep=" | ", na_values=[], level='e')

[INFO] validating values in column 'ORGANISM_PART'
[INFO] excluding 0 [] samples without data in 'ORGANISM_PART'
[ERROR] invalid values in 'ORGANISM_PART': {'TEST'}


In [15]:
def validate_date(col, df, na_values=[]):
    """Parse a ``YYYY-MM-DD`` date column and report implausible dates.

    Parameters
    ----------
    col : str
        Name of the date column.
    df : pandas.DataFrame
        Manifest table.
    na_values : list
        Placeholder values excluded before validation.

    Returns
    -------
    pandas.Series or None
        The successfully parsed dates (unparseable entries removed), or None
        when ``col`` is missing from ``df``.

    Errors are logged for values that do not parse, for dates later than the
    current moment and for dates before 1900-01-01. Future and pre-1900
    dates are reported but stay in the returned series.
    """
    logging.info('validating date column {!r}'.format(col))

    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    series = exclude_missing(df[col], na_values)

    # anything that does not match the format (incl. '') is coerced to NaT
    parsed = pd.to_datetime(series, format='%Y-%m-%d', errors='coerce')
    unparsed = parsed.isna()
    if unparsed.any():
        logging.error('invalid dates in {!r}: {}'.format(col,
                                                         series[unparsed].unique()))
    valid_date_series = parsed[~unparsed]

    # dates in future
    in_future = valid_date_series > datetime.datetime.today()
    if in_future.any():
        logging.error('future dates in {!r}: {}'.format(col,
            valid_date_series[in_future].to_list()))

    # dates too old
    earliest_allowed = datetime.datetime.strptime('1900-01-01', '%Y-%m-%d')
    too_old = valid_date_series < earliest_allowed
    if too_old.any():
        logging.error("pre-1900 dates in {!r}: {}".format(col,
            valid_date_series[too_old].to_list()))

    return valid_date_series
validate_date('DATE_OF_COLLECTION', df, na_values=[])

[INFO] validating date column 'DATE_OF_COLLECTION'
[INFO] excluding 0 [] samples without data in 'DATE_OF_COLLECTION'
[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['TEST' '']
[ERROR] future dates in 'DATE_OF_COLLECTION': [Timestamp('2117-08-21 00:00:00')]


SERIES
1     2022-07-06
2     2021-03-03
3     2017-08-21
4     2017-08-21
5     2017-08-21
         ...    
284   2019-03-03
285   2019-03-03
286   2019-03-03
287   2019-03-03
288   2019-03-03
Name: DATE_OF_COLLECTION, Length: 285, dtype: datetime64[ns]

In [16]:
def validate_time(col, df, na_values=[]):
    """Parse an ``HH:MM:SS`` time-of-day column and report invalid values.

    Parameters
    ----------
    col : str
        Name of the time column.
    df : pandas.DataFrame
        Manifest table.
    na_values : list
        Placeholder values excluded before validation.

    Returns
    -------
    pandas.Series or None
        The entries that parsed successfully, as datetimes, or None when
        ``col`` is missing from ``df``.
    """
    logging.info('validating time column {!r}'.format(col))

    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    series = exclude_missing(df[col], na_values)

    # NB anything not matching the format (incl. empty string) becomes NaT
    parsed = pd.to_datetime(series, format='%H:%M:%S', errors='coerce')
    unparsed = parsed.isna()
    if unparsed.any():
        logging.error('invalid times in {!r}: {}'.format(col,
                                                         series[unparsed].unique()))

    return parsed[~unparsed]
validate_time('TIME_OF_COLLECTION', df, na_values=[''])

[INFO] validating time column 'TIME_OF_COLLECTION'
[INFO] excluding 193 [''] samples without data in 'TIME_OF_COLLECTION'
[ERROR] invalid times in 'TIME_OF_COLLECTION': ['"33.00"']


SERIES
1    1900-01-01 18:00:00
2    1900-01-01 22:43:00
3    1900-01-01 04:00:00
4    1900-01-01 04:00:00
6    1900-01-01 04:00:00
             ...        
91   1900-01-01 09:00:00
92   1900-01-01 09:00:00
93   1900-01-01 09:00:00
94   1900-01-01 09:00:00
95   1900-01-01 09:00:00
Name: TIME_OF_COLLECTION, Length: 94, dtype: datetime64[ns]

In [17]:
def validate_time_period(col, df, na_values=[]):
    """Parse an ISO 8601 duration column (e.g. ``PT1H``, ``P1DT2H30M``).

    Parameters
    ----------
    col : str
        Name of the duration column.
    df : pandas.DataFrame
        Manifest table.
    na_values : list
        Placeholder values excluded before validation.

    Returns
    -------
    pandas.Series or None
        The entries that could be converted to ``pandas.Timedelta``, or None
        when ``col`` is missing from ``df``.

    Values that cannot be converted are reported with a single error log.
    Only durations with a time part (containing ``T``) are accepted; week
    and month designators are not supported.
    """
    logging.info('validating time period column {!r}'.format(col))

    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    series = df[col]
    series = exclude_missing(series, na_values)

    # conversion with modifications for proper parsing
    # by pd.Timedelta (does not accept missing data, e.g. 'PT1H')
    # note - will not work for weeks and months
    def convert_iso_duration(s):
        # NaN never compares equal to itself, so missing or otherwise
        # non-text cells are detected by type instead
        if not isinstance(s, str):
            return np.nan
        if not s.startswith('P') or 'T' not in s:
            return np.nan
        # add days
        if s.startswith('PT'):
            s = 'P0DT' + s[2:]
        # add trailing minutes and seconds
        if s.endswith('H'):
            s += '0M0S'
        elif s.endswith('M'):
            s += '0S'
        try:
            return pd.Timedelta(s)
        except (ValueError, OverflowError):
            # malformed or out-of-range duration
            return np.nan

    time_period_series = series.apply(convert_iso_duration)
    is_invalid = time_period_series.isna()
    if is_invalid.any():
        logging.error('invalid times in {!r}: {}'.format(col,
            series[is_invalid].unique()))
    valid_time_period_series = time_period_series[~is_invalid]
    return valid_time_period_series

# df.loc[1,'DURATION_OF_COLLECTION'] = 'PVT1H'
validate_time_period('DURATION_OF_COLLECTION', df, na_values=['']);
# df['DURATION_OF_COLLECTION']

[INFO] validating time period column 'DURATION_OF_COLLECTION'
[INFO] excluding 194 [''] samples without data in 'DURATION_OF_COLLECTION'
[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['P1WT1H' 'TEST']


In [18]:
# TODO include tests
def _reverse_geocode_coordinates(coords):
    """Look up each ``'lat, lon'`` string in ``coords`` on OpenStreetMap (Nominatim).

    Returns a dict mapping every coordinate string to the raw geocoder
    result; coordinates that cannot be parsed, are out of range or yield no
    result keep the placeholder ``{'address': {'country': 'UNKNOWN'}}``.
    """
    # web map server - openstreetmaps, throttled to one request per second
    locator = Nominatim(user_agent='myGeocoder')
    rgeocode = RateLimiter(locator.reverse, min_delay_seconds=1)

    locations = dict()
    for c in coords:
        # pre-fill with unknown country
        locations[c] = {'address': {'country': 'UNKNOWN'}}
        # check coordinate correctness
        try:
            lat, lon = c.split(', ')
            lat, lon = float(lat), float(lon)
        except ValueError:
            logging.error('problem parsing coordinates {!r}'.format(c))
            continue
        if abs(lat) > 90:
            logging.error('invalid latitude {}, should be in [-90,90]'.format(lat))
            continue
        if abs(lon) > 180:
            logging.error('invalid longitude {}, should be in [-180,180]'.format(lon))
            continue
        # web query
        location = rgeocode(c, language='en-gb')
        # rgeocode returns empty location outside of countries and in some other situations
        if location is not None:
            locations[c] = location.raw
    return locations


def validate_country_and_coordinates(df, fn, na_values=[], bioscan=False):
    """Check that the stated collection country matches the coordinates.

    Parameters
    ----------
    df : pandas.DataFrame
        Manifest table with country, latitude and longitude columns.
    fn : str
        Manifest path; reverse-geocoding results are cached next to it in
        ``fn + '_loc.pkl'``.
    na_values : list
        Rows in which all three location columns hold one of these values
        are left out of the analysis.
    bioscan : bool
        Selects the country column name: ``COUNTRY_OF_COLLECTION`` when
        True, ``COLLECTION_COUNTRY`` otherwise.

    Returns
    -------
    pandas.DataFrame or None
        Copy of the three location columns plus ``coord`` (``'lat, lon'``),
        ``partner_country`` (upper-cased manifest country) and
        ``coord_country`` (upper-cased country derived from coordinates, or
        ``'UNKNOWN'``). None when a required column is missing.

    Mismatches between the manifest country and the geocoded country are
    logged as errors, unresolvable coordinates as warnings. Coordinates that
    are absent from the cache are queried over the network and the cache
    file is rewritten.
    """
    logging.info('validating country with coordinates')

    if bioscan:
        country_col, lat_col, lon_col = 'COUNTRY_OF_COLLECTION', 'DECIMAL_LATITUDE', 'DECIMAL_LONGITUDE'
    else:
        country_col, lat_col, lon_col = 'COLLECTION_COUNTRY', 'DECIMAL_LATITUDE', 'DECIMAL_LONGITUDE'

    try:
        loc_df_complete = df[[country_col, lat_col, lon_col]].copy()
    except KeyError:
        logging.error('One of {!r} {!r} {!r} columns not found in manifest'.format(country_col, lat_col, lon_col))
        return
    loc_df_isna = (loc_df_complete.isin(na_values)).all(axis=1)
    if loc_df_isna.any():
        logging.info('removing {} {!r} samples with missing data from coordinate analysis'.format(
                loc_df_isna.sum(), na_values))
    loc_df_complete = loc_df_complete[~loc_df_isna].copy()

    # coordinates in geopy format
    loc_df_complete['coord'] = (loc_df_complete[lat_col].astype(str) + ', '
                                + loc_df_complete[lon_col].astype(str))

    # get location data for coordinates
    # use local copy of web query results for re-runs
    loc_fn = fn+'_loc.pkl'
    locations = dict()
    if os.path.isfile(loc_fn):
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cache files written by this function
        with open(loc_fn, 'rb') as cache_file:
            locations = pickle.load(cache_file)

    # the cache is keyed by file name only, so coordinates added to the
    # manifest after the cache was written still have to be queried
    uncached = [c for c in loc_df_complete['coord'].unique() if c not in locations]
    if uncached:
        logging.info('querying coordinates')
        locations.update(_reverse_geocode_coordinates(uncached))
        # save locations to file
        with open(loc_fn, 'wb') as cache_file:
            pickle.dump(locations, cache_file)

    # parse country from partner input
    loc_df_complete['partner_country'] = loc_df_complete[country_col].str.strip().str.upper()

    # extract countries from location data
    loc_countries = dict()
    for coord in locations.keys():
        # a geocoder result may come without an address or without a country
        address = locations[coord].get('address') or {}
        coord_country = address.get('country', 'UNKNOWN').upper()
        loc_countries[coord] = coord_country

        partner_countries = loc_df_complete.loc[loc_df_complete['coord'] == coord, 'partner_country']
        if partner_countries.nunique() > 1:
            logging.error('multiple partner countries for coordinates {!r}: {}, '
                          'skipping coordinate validation'.format(
                                coord, partner_countries.unique()))
            continue
        if partner_countries.shape[0] == 0:
            logging.error('no partner location found for coordinates {!r}'.format(coord))
            continue
        partner_country = partner_countries.iloc[0]
        if coord_country == 'UNKNOWN':
            logging.warning('could not locate country for coordinates {!r}, partner country {!r}'.format(
                    coord, partner_country))
        elif partner_country != coord_country:
            logging.error('country mismatch for coordinates {!r}, partner country {!r}, '
                          'coordinate country {!r}'.format(coord, partner_country, coord_country))

    # countries based on coordinates; every coord of the manifest is a key by now
    loc_df_complete['coord_country'] = loc_df_complete['coord'].map(loc_countries)

    # location data can be re-used, e.g. as an additional field
    return loc_df_complete
# df.loc[2,'DECIMAL_LATITUDE'] = '65'
loc_test = validate_country_and_coordinates(df, anospp_fn)
loc_test

[INFO] validating country with coordinates


Unnamed: 0_level_0,COLLECTION_COUNTRY,DECIMAL_LATITUDE,DECIMAL_LONGITUDE,coord,partner_country,coord_country
SERIES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,UNITED KINGDOM,52.0193,0.2424,"52.0193, 0.2424",UNITED KINGDOM,UNITED KINGDOM
2,GABON,-0.218,11.584,"-0.218, 11.584",GABON,GABON
3,BURKINA FASO,11.2339822,-4.4725306,"11.2339822, -4.4725306",BURKINA FASO,BURKINA FASO
4,BURKINA FASO,11.2339822,-4.4725306,"11.2339822, -4.4725306",BURKINA FASO,BURKINA FASO
5,BURKINA FASO,11.2339822,-4.4725306,"11.2339822, -4.4725306",BURKINA FASO,BURKINA FASO
...,...,...,...,...,...,...
284,UNITED KINGDOM,52.0193,0.2424,"52.0193, 0.2424",UNITED KINGDOM,UNITED KINGDOM
285,UNITED KINGDOM,52.0193,0.2424,"52.0193, 0.2424",UNITED KINGDOM,UNITED KINGDOM
286,UNITED KINGDOM,52.0193,0.2424,"52.0193, 0.2424",UNITED KINGDOM,UNITED KINGDOM
287,UNITED KINGDOM,52.0193,0.2424,"52.0193, 0.2424",UNITED KINGDOM,UNITED KINGDOM


In [19]:
# TODO hierarchy tests in BIOSCAN
def validate_taxonomy(df, ncbi, na_values = [], anospp=False):
    """Validate the predicted taxonomy columns against NCBI Taxonomy.

    Parameters
    ----------
    df : pandas.DataFrame
        Manifest table; modified in place (see Returns).
    ncbi : object
        Taxonomy handle providing ``get_name_translator``, ``get_rank`` and
        ``get_lineage`` (an ``ete3.NCBITaxa`` instance in this notebook).
    na_values : list
        Values treated as "no taxon given" and skipped.
    anospp : bool
        When True the order/family/genus columns are overwritten with
        Diptera / Culicidae / Anopheles, and species names missing from NCBI
        are additionally looked up in ``../data/harbach_spp_201910.txt``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one ``<column>_TAXID`` column added per taxonomy column.
        Names without an NCBI match are copied to the TAXID column
        unchanged. If a taxonomy column is missing, an error is logged and
        ``df`` is returned without TAXID columns.

    For each level the names are case-normalised, looked up in NCBI and
    their rank is checked. The distinct (order, family, genus, species)
    combinations are then checked for consistency with the NCBI lineage.
    """
    logging.info('validating taxonomy against NCBI')

    if anospp:
        df['PREDICTED_ORDER_OR_GROUP'] = 'Diptera'
        df['PREDICTED_FAMILY'] = 'Culicidae'
        df['PREDICTED_GENUS'] = 'Anopheles'

        # the file lists species epithets, one per line
        with open('../data/harbach_spp_201910.txt') as f:
            harbach_spp = {'Anopheles ' + line.strip() for line in f}

    tax_levels = {
        'PREDICTED_ORDER_OR_GROUP':'order',
        'PREDICTED_FAMILY':'family',
        'PREDICTED_GENUS':'genus',
        'PREDICTED_SCIENTIFIC_NAME':'species'
    }

    # all four columns are needed below, so check before indexing with them
    missing_cols = [tax_col for tax_col in tax_levels if tax_col not in df.columns]
    if missing_cols:
        for tax_col in missing_cols:
            logging.error(f'{tax_col} column not found in manifest')
        return df

    hierarchies = df[list(tax_levels)].drop_duplicates().copy()
    hierarchies.columns = list(tax_levels.values())

    # tax_info[level] maps name -> list of taxids, later collapsed to one taxid
    tax_info = dict()

    for tax_col, tax_level in tax_levels.items():

        logging.info(f'validating {tax_col} against NCBI')

        tax_names = [name for name in hierarchies[tax_level].unique()
                     if name not in na_values]

        # names are looked up with a capitalised first letter and the rest
        # in lower case
        for i, tax_name in enumerate(tax_names):
            if len(tax_name) == 0:
                continue
            corr_tax_name = tax_name[0].upper() + tax_name[1:].lower()
            if corr_tax_name != tax_name and tax_name != 'blank sample':
                logging.error(f'{tax_level}: unexpected case for "{tax_name}", '
                              f'changing to "{corr_tax_name}" for validation')
            tax_names[i] = corr_tax_name

        tax_info[tax_level] = ncbi.get_name_translator(tax_names)

        unmatched_names = set(tax_names) - set(tax_info[tax_level].keys())
        if len(unmatched_names) > 0:
            logging.error(f'{tax_level}: {unmatched_names} not found in NCBI Taxonomy')
            if tax_level == 'species' and anospp:
                for sp in unmatched_names:
                    if sp in harbach_spp:
                        logging.error(f'{tax_level}: "{sp}" found in Harbach list, but not in NCBI Taxonomy')

        # a name can map to several taxids; keep the one with the expected rank
        for tname, tids in tax_info[tax_level].items():

            ranks = ncbi.get_rank(tids)
            upd_tid = tids[0]

            if len(tids) == 1:
                if ranks.get(upd_tid) != tax_level:
                    # TODO warning->info for ORDER
                    logging.warning(f'{tax_level}: found unexpected rank for {tname} '
                                    f'(taxid {upd_tid}): {ranks.get(upd_tid)}')
            else:
                for tid, rank in ranks.items():
                    if rank == tax_level:
                        logging.info(f'{tax_level}: using only first matching rank for {tname} (taxid {tid}): {rank}')
                        upd_tid = tid
                        break
                else:
                    logging.warning(f'{tax_level}: could not find matching rank for {tname}, '
                                    f'using (taxid {upd_tid}): {ranks.get(upd_tid)}')

            # values are replaced for existing keys only, which is safe while iterating
            tax_info[tax_level][tname] = upd_tid

    # check consistency of taxonomy
    for _, r in hierarchies.iterrows():

        # item access: attribute access could be shadowed by Series attributes
        order, family = r['order'], r['family']
        genus, species = r['genus'], r['species']

        if order in na_values:
            continue
        try:
            order_id = tax_info['order'][order]
        except KeyError:
            logging.info(f'cannot validate PREDICTED_ORDER_OR_GROUP for "{order}", skipping taxonomy consistency check')
            continue

        if family in na_values:
            continue
        try:
            family_id = tax_info['family'][family]

            family_lineage = ncbi.get_lineage(family_id)

            if order_id not in family_lineage:
                logging.error(f'Family {family} (taxid {family_id}) does not belong to {order} (taxid {order_id})')
        except KeyError:
            logging.info(f'cannot validate PREDICTED_FAMILY for "{family}", skipping taxonomy consistency check')
            continue

        if genus in na_values:
            continue
        try:
            genus_id = tax_info['genus'][genus]

            genus_lineage = ncbi.get_lineage(genus_id)

            if order_id not in genus_lineage:
                logging.error(f'Genus {genus} (taxid {genus_id}) does not belong to {order} (taxid {order_id})')
            if family_id not in genus_lineage:
                logging.error(f'Genus {genus} (taxid {genus_id}) does not belong to {family} (taxid {family_id})')
        except KeyError:
            logging.info(f'cannot validate PREDICTED_GENUS for "{genus}", skipping taxonomy consistency check')
            continue

        if species in na_values:
            continue
        try:
            species_id = tax_info['species'][species]

            species_lineage = ncbi.get_lineage(species_id)

            if order_id not in species_lineage:
                logging.error(f'Species {species} (taxid {species_id}) does not belong to {order} (taxid {order_id})')
            if family_id not in species_lineage:
                logging.error(f'Species {species} (taxid {species_id}) does not belong to {family} (taxid {family_id})')
            if genus_id not in species_lineage:
                logging.error(f'Species {species} (taxid {species_id}) does not belong to {genus} (taxid {genus_id})')
        except KeyError:
            logging.info(f'cannot validate PREDICTED_SCIENTIFIC_NAME for "{species}", skipping taxonomy consistency check')
            continue

    # add taxid columns; names without a match are carried over unchanged
    for tax_col, tax_level in tax_levels.items():
        name_to_taxid = tax_info[tax_level]
        df[f'{tax_col}_TAXID'] = df[tax_col].map(lambda name: name_to_taxid.get(name, name))

    return df
        
                
validate_taxonomy(df, ncbi, anospp=True)

[INFO] validating taxonomy against NCBI
[INFO] validating PREDICTED_ORDER_OR_GROUP against NCBI
[INFO] validating PREDICTED_FAMILY against NCBI
[INFO] validating PREDICTED_GENUS against NCBI
[INFO] genus: using only first matching rank for Anopheles (taxid 7164): genus
[INFO] validating PREDICTED_SCIENTIFIC_NAME against NCBI
[ERROR] species: {'', 'Anopheles stricklandi', 'Anopheles test'} not found in NCBI Taxonomy
[ERROR] species: "Anopheles stricklandi" found in Harbach list, but not in NCBI Taxonomy
[INFO] cannot validate PREDICTED_SCIENTIFIC_NAME for "Anopheles test", skipping taxonomy consistency check
[INFO] cannot validate PREDICTED_SCIENTIFIC_NAME for "Anopheles stricklandi", skipping taxonomy consistency check
[INFO] cannot validate PREDICTED_SCIENTIFIC_NAME for "", skipping taxonomy consistency check


Unnamed: 0_level_0,RACK_OR_PLATE_ID,TUBE_OR_WELL_ID,PRESERVATIVE_SOLUTION,ORGANISM_PART,DATE_OF_COLLECTION,COLLECTION_COUNTRY,COLLECTION_LOCATION,DECIMAL_LATITUDE,DECIMAL_LONGITUDE,SAMPLING_LOCATION_SIZE,...,DNA_EXTRACTION_DESCRIPTION,DNA_EXTRACT_VOLUME_PROVIDED,DNA_EXTRACT_CONCENTRATION,PREDICTED_ORDER_OR_GROUP,PREDICTED_FAMILY,PREDICTED_GENUS,PREDICTED_ORDER_OR_GROUP_TAXID,PREDICTED_FAMILY_TAXID,PREDICTED_GENUS_TAXID,PREDICTED_SCIENTIFIC_NAME_TAXID
SERIES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,DACH_001,A1,100%_ETHANOL,WHOLE_ORGANISM,2022-07-06 00:00:00,UNITED KINGDOM,ENGLAND | ESSEX | UTTLESFORD | SAFFRON WALDEN ...,52.0193,0.2424,1m²,...,,,,Diptera,Culicidae,Anopheles,7147,7157,7164,227531
2,DACH_001,B1,100%_ETHANOL,WHOLE_ORGANISM,2021-03-03 00:00:00,GABON,LOPE NATIONAL PARK,-0.218,11.584,100m²,...,,,,Diptera,Culicidae,Anopheles,7147,7157,7164,7165
3,DACH_001,C1,100%_ETHANOL,THORAX | ABDOMEN,2017-08-21 00:00:00,BURKINA FASO,HAUTS-BASSINS | HOEUT | ARRONDISSEMENT N°7 DE ...,11.2339822,-4.4725306,1km²,...,,,,Diptera,Culicidae,Anopheles,7147,7157,7164,1518534
4,DACH_001,D1,100%_ETHANOL,WHOLE_ORGANISM,2017-08-21 00:00:00,BURKINA FASO,HAUTS-BASSINS | HOEUT | ARRONDISSEMENT N°7 DE ...,11.2339822,-4.4725306,1km²,...,,,,Diptera,Culicidae,Anopheles,7147,7157,7164,Anopheles test
5,DACH_001,E1,100%_ETHANOL,WHOLE_ORGANISM,2017-08-21 00:00:00,BURKINA FASO,HAUTS-BASSINS | HOEUT | ARRONDISSEMENT N°7 DE ...,11.2339822,-4.4725306,1m²,...,,,,Diptera,Culicidae,Anopheles,7147,7157,7164,Anopheles stricklandi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,DACH_003,D12,100%_ETHANOL,HEAD,2019-03-03 00:00:00,UNITED KINGDOM,ENGLAND | ESSEX | UTTLESFORD | SAFFRON WALDEN ...,52.0193,0.2424,10m²,...,,,,Diptera,Culicidae,Anopheles,7147,7157,7164,227531
285,DACH_003,E12,100%_ETHANOL,HEAD,2019-03-03 00:00:00,UNITED KINGDOM,ENGLAND | ESSEX | UTTLESFORD | SAFFRON WALDEN ...,52.0193,0.2424,10m²,...,,,,Diptera,Culicidae,Anopheles,7147,7157,7164,227531
286,DACH_003,F12,100%_ETHANOL,HEAD,2019-03-03 00:00:00,UNITED KINGDOM,ENGLAND | ESSEX | UTTLESFORD | SAFFRON WALDEN ...,52.0193,0.2424,10m²,...,,,,Diptera,Culicidae,Anopheles,7147,7157,7164,227531
287,DACH_003,G12,100%_ETHANOL,HEAD,2019-03-03 00:00:00,UNITED KINGDOM,ENGLAND | ESSEX | UTTLESFORD | SAFFRON WALDEN ...,52.0193,0.2424,10m²,...,,,,Diptera,Culicidae,Anopheles,7147,7157,7164,227531


In [20]:
def validate_specimen_id_risk(df):
    """Check that samples without a species name are flagged as identity risk.

    Parameters
    ----------
    df : pandas.DataFrame
        Manifest with ``SPECIMEN_IDENTITY_RISK`` and
        ``PREDICTED_SCIENTIFIC_NAME`` columns.

    Logs an error listing the SERIES of rows whose
    ``PREDICTED_SCIENTIFIC_NAME`` is empty while ``SPECIMEN_IDENTITY_RISK``
    is ``'N'``. If either column is missing, an error is logged and the
    check is skipped. Returns nothing.
    """
    logging.info('validating SPECIMEN_IDENTITY_RISK')

    # both columns are read below, so both have to be present
    for required_col in ('SPECIMEN_IDENTITY_RISK', 'PREDICTED_SCIENTIFIC_NAME'):
        if required_col not in df.columns:
            logging.error(f'{required_col} column not found in manifest')
            return

    # missing species name, but no identity risk
    invalid_risk = ((df['PREDICTED_SCIENTIFIC_NAME'] == '') & (df['SPECIMEN_IDENTITY_RISK'] == 'N'))

    if invalid_risk.any():
        logging.error(f'SPECIMEN_IDENTITY_RISK should be Y in SERIES {df.loc[invalid_risk].index.to_list()}')
validate_specimen_id_risk(df)

[INFO] validating SPECIMEN_IDENTITY_RISK
[ERROR] SPECIMEN_IDENTITY_RISK should be Y in SERIES [8]


In [21]:
def validate_float(col, df, na_values=[]):
    """Check that every value of a column is a finite number.

    Parameters
    ----------
    col : str
        Name of the numeric column.
    df : pandas.DataFrame
        Manifest table.
    na_values : list
        Placeholder values excluded before validation.

    Logs one error per distinct value that ``float()`` cannot convert or
    that converts to a non-finite number (``inf``, ``nan``). Returns nothing.
    """
    logging.info(f'validating numeric format in {col}')

    if col not in df.columns:
        logging.error(f'{col} column not found in manifest')
        return
    series = df[col]
    series = exclude_missing(series, na_values)

    for val in series.unique():
        try:
            # float() also accepts 'inf' and 'nan', which are not usable measurements
            is_number = bool(np.isfinite(float(val)))
        except (TypeError, ValueError):
            is_number = False
        if not is_number:
            logging.error(f'found non-numeric value in {col}: "{val}"')
validate_float('ELEVATION', df, na_values=[''])

[INFO] validating numeric format in ELEVATION
[INFO] excluding 193 [''] samples without data in 'ELEVATION'
[ERROR] found non-numeric value in ELEVATION: "700m"


In [43]:
def validate_freetext(col, df, na_values=['']):
    """Warn about free-text values containing non-standard characters.

    Parameters
    ----------
    col : str
        Name of the free-text column.
    df : pandas.DataFrame
        Manifest table.
    na_values : list
        Placeholder values excluded before validation (empty string by
        default).

    A value is accepted when it consists only of ASCII letters, digits,
    ``.``, ``,``, ``-``, ``_`` and space. The SERIES of all other values are
    reported in a single warning. Returns nothing.
    """
    logging.info(f'validating freetext chars in {col}')

    if col not in df.columns:
        logging.error(f'{col} column not found in manifest')
        return
    series = df[col]
    series = exclude_missing(series, na_values)

    # 'A-Za-z' rather than 'A-z': the latter range also covers [ \ ] ^ _ `
    regex = r'^[A-Za-z0-9.,\-_ ]+$'

    # na=False: cells that are not strings count as invalid
    is_valid_freetext = series.str.match(regex, na=False)
    if not is_valid_freetext.all():
        logging.warning('found non-standard characters in column {}, SERIES {}. Regex: "{}"'.format(
        col, series.loc[~is_valid_freetext].index.to_list(), regex))

validate_freetext('IDENTIFIED_HOW', df)

[INFO] validating freetext chars in IDENTIFIED_HOW
[INFO] excluding 97 [''] samples without data in 'IDENTIFIED_HOW'
[ERROR] found non-standard characters in column IDENTIFIED_HOW, SERIES [6]. Regex: "^[A-z0-9.,\- ]+$"


In [23]:
bd = validate_date('DATE_OF_COLLECTION', df)
ad = validate_date('DATE_OF_PRESERVATION', df)

[INFO] validating date column 'DATE_OF_COLLECTION'
[INFO] excluding 0 [] samples without data in 'DATE_OF_COLLECTION'
[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['TEST' '']
[ERROR] future dates in 'DATE_OF_COLLECTION': [Timestamp('2117-08-21 00:00:00')]
[INFO] validating date column 'DATE_OF_PRESERVATION'
[INFO] excluding 0 [] samples without data in 'DATE_OF_PRESERVATION'
[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['']


In [24]:
def compare_dates(before, after):
    """Check that the dates in ``before`` do not come after those in ``after``.

    Parameters
    ----------
    before, after : pandas.Series
        Named date series; values are compared pairwise after aligning the
        two series on their index.

    Logs an error listing the index labels where the ``before`` date is
    later than the ``after`` date. Returns nothing.
    """
    logging.info(f'checking that {before.name} are earlier than {after.name}')

    # side-by-side table aligned on the index
    paired = pd.concat([before, after], axis=1)
    is_conflict = paired[before.name] > paired[after.name]

    if is_conflict.any():
        conflicting_series = paired[is_conflict].index.to_list()
        logging.error(f'{before.name} values are later than {after.name} for SERIES'
                      f' {conflicting_series}')
compare_dates(bd, ad)

[INFO] checking that DATE_OF_COLLECTION are earlier than DATE_OF_PRESERVATION
[ERROR] DATE_OF_COLLECTION values are later than DATE_OF_PRESERVATION for SERIES [9]


In [25]:
def validate_contributors(fn, contrib_sheet='TAB 1 Contributors'):
    """Validate the contributors sheet of the manifest workbook.

    Parameters
    ----------
    fn : str
        Path to the Excel workbook.
    contrib_sheet : str
        Name of the contributors sheet.

    Returns
    -------
    pandas.DataFrame
        The contributors table (whitespace-stripped) with an added ``NAME``
        column built as ``SURNAME + ' ' + FIRST_NAME``.

    Logs a warning if the template contributor is still present, and errors
    for duplicated names, malformed e-mail addresses and when no contributor
    at all has ``CONFIRMATION == 'YES'``.
    """
    logging.info('validating contributors in {!r}'.format(fn))

    df = pd.read_excel(fn, dtype=str, keep_default_na=False,
                       sheet_name=contrib_sheet)

    df = remove_trailing_spaces(df)

    df['NAME'] = df['SURNAME'] + ' ' + df['FIRST_NAME']
    is_template_name = (df['NAME'] == 'Darwin Charles R.')
    if is_template_name.any():
        logging.warning('suspect template contributor was not removed')

    is_dup_name = df['NAME'].duplicated()
    if is_dup_name.any():
        logging.error('duplicated names {}'.format(
                df.loc[is_dup_name, 'NAME'].to_list()))

    # TODO update template with underscore in email column
    # 'A-Za-z' rather than 'A-z': the latter range also covers [ \ ] ^ _ `
    email_regex = r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'
    is_valid_email = df['EMAIL ADDRESS'].str.match(email_regex, na=False)
    if not is_valid_email.all():
        logging.error('invalid email addresses {}'.format(
                df.loc[~is_valid_email, 'EMAIL ADDRESS'].to_list()))

    # NOTE(review): only fails when nobody has confirmed; confirm whether
    # every contributor is expected to confirm instead
    is_confirmed = (df['CONFIRMATION'] == 'YES')
    if not is_confirmed.any():
        logging.error('confirmation lacking for any contributors')

    return df
        
contrib_df = validate_contributors(anospp_fn)

[INFO] validating contributors in '../data/Anopheles_Metadata_Manifest_V4.0_20220825.xlsx'
[ERROR] duplicated names ['Darwin Charles R.']
[ERROR] invalid email addresses ['darwin_darwin.darwin']
[ERROR] confirmation lacking for any contributors
