In [5]:
import itertools
import logging
import pandas as pd
import numpy as np
import datetime
import glob
import os
import pickle
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [6]:
logging.getLogger().setLevel(logging.INFO)
# logging.getLogger().setFormat('[%(levelname)s] %(message)s')


def setup_logging(verbose=False):
    try: 
        del logging.root.handlers[:]
    except:
        pass
    if verbose:
        logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
    else:
        logging.basicConfig(level=logging.WARNING, format='[%(levelname)s] %(message)s')
setup_logging(verbose=True)   
logging.info('test')

[INFO] test


In [14]:
fn = '../../results/partner_manifests/IRD-Neandersquito_T222Amplicon_Manifest_V2.0.xlsx'
template_fn = '../../analysis/0_partner/data/T222Amplicon_Manifest_V2.0.xlsx'
spp_fn = '../../analysis/0_partner/data/harbach_spp_201910.txt'

In [15]:
def get_data(fn):

    logging.info('reading data from {!r}'.format(fn))
    
    try:
        df = pd.read_excel(fn, dtype=str, index_col=0, keep_default_na=False,
                           sheet_name='TAB1 Specimen Metadata Entry')
    except:
        df = pd.read_excel(fn, dtype=str, index_col=0, keep_default_na=False,
                           sheet_name='Specimen Metadata Entry')
    
    if df.index.duplicated().any():
        logging.error('duplicate SERIES: {}'.format(df.index[df.index.duplicated()].to_list()))
        
    # trailing spaces
    for col in df.columns:
        trailing_spaces = (df[col].str.startswith(' ') | df[col].str.endswith(' '))
        if trailing_spaces.any():
            logging.warning('trailing spaces found in {!r}, removing for validation'.format(col,
                df.loc[trailing_spaces, col].to_list()))
            df[col] = df[col].str.strip()
        
    return df
df = get_data(fn)

[INFO] reading data from '../../results/partner_manifests/IRD-Neandersquito_T222Amplicon_Manifest_V2.0.xlsx'


In [16]:
template_df = get_data(template_fn)

[INFO] reading data from '../../analysis/0_partner/data/T222Amplicon_Manifest_V2.0.xlsx'


In [17]:
def check_columns(df, template_df):
    
    logging.info('checking manifest columns against template')
    
    data_cols = set(df.columns)
    template_cols = set(template_df.columns)
        
    if data_cols - template_cols != set():
        logging.warning('extra columns in filled manifest compared to template: {}'.format(data_cols - template_cols))
    if template_cols - data_cols != set():
        logging.error('template columns missing from filled manifest: {}'.format(template_cols - data_cols))
check_columns(df, template_df)

[INFO] checking manifest columns against template


In [18]:
def check_exclude_blanks(df):
    
    logging.info('Checking and excluding blank samples')
    
    # last well of plate expected to be blank
    last_well = df[df['TUBE_OR_WELL_ID'] == 'H12']
    last_well_blanks = (last_well['SCIENTIFIC_NAME'] == 'blank sample')
    if not last_well_blanks.all():
        logging.error('last well H12 is not blank at SERIES {}'.format(
                        last_well[~last_well_blanks].index.to_list()))
    
    is_blank = (df['SCIENTIFIC_NAME'] == 'blank sample')
    blanks = df[is_blank]
    
    logging.info('found {} blank samples based on SCIENTIFIC_NAME'.format(blanks.shape[0]))
    
    # check organism part
    organism_part_pass = (blanks['ORGANISM_PART'] == 'BLANK_SAMPLE')
    if not organism_part_pass.all():
        logging.error('for blanks, ORGANISM_PART expected to be BLANK_SAMPLE, found {}'.format(
                set(blanks.loc[~organism_part_pass, 'ORGANISM_PART'])))
    
    # check that NOT_APPLICABLE is filled in all columns
    blanks_na = blanks.drop(columns=['ORGANISM_PART','SCIENTIFIC_NAME',
                                     'TUBE_OR_WELL_ID','RACK_OR_PLATE_ID',
                                     'PRESERVATIVE_SOLUTION'])
    na_filled = (blanks_na == 'NOT_APPLICABLE').all(axis=0)
    if not na_filled.all():
        logging.warning('for blanks, NOT_APPLICABLE expected, but not found in columns {}'.format(
                            na_filled[~na_filled].index.to_list()))
    # exclude blanks from downstream analysis
    
    df_flt = df[~is_blank]
    
    logging.info('{} samples of {} left for downstream analysis'.format(df_flt.shape[0], df.shape[0]))
    
    return df_flt
        
df = check_exclude_blanks(df)

[INFO] Checking and excluding blank samples
[INFO] found 1 blank samples based on SCIENTIFIC_NAME
[ERROR] for blanks, ORGANISM_PART expected to be BLANK_SAMPLE, found {'CONTROL_WELL'}
[INFO] 95 samples of 96 left for downstream analysis


In [19]:
def get_valid_dict(fn):
    # pick up validation values from data validation sheet
    logging.info('extracting value validation data from {!r}'.format(fn))
    valid_df = pd.read_excel(fn, dtype=str, sheet_name='Data Validation')
    valid_dict = dict()
    for col in valid_df.columns:
        valid_dict[col] = valid_df[col].dropna().to_list()
        
    # add 96-well plate well IDs to validation
    rid = list('ABCDEFGH')
    cid = range(1,13)
    valid_dict['TUBE_OR_WELL_ID'] = [r + str(c) for (r,c) in itertools.product(rid, cid)]
    
    # taxonomy placeholder
    valid_dict['ORDER'] = ['Diptera']
    valid_dict['FAMILY'] = ['Culicidae']
    valid_dict['GENUS'] = ['Anopheles']

    
    return valid_dict
valid_dict = get_valid_dict(template_fn)

[INFO] extracting value validation data from '../../analysis/0_partner/data/T222Amplicon_Manifest_V2.0.xlsx'


In [20]:
def exclude_missing(series, na_values=None):
    
    # valid missing data 
    no_data = (series.isin(na_values))
    if no_data.sum() > 0:
        logging.info('excluding {} {!r} samples without data in {!r}'.format(no_data.sum(), na_values, series.name))
    return series[~no_data]
    
exclude_missing(df['TIME_OF_COLLECTION'], na_values=['NOT_COLLECTED', ''])

[INFO] excluding 95 ['NOT_COLLECTED', ''] samples without data in 'TIME_OF_COLLECTION'


Series([], Name: TIME_OF_COLLECTION, dtype: object)

In [21]:
def validate_values(col, df, valid_dict, sep=None, na_values=None, level='e'):
    
    logging.info('validating values in column {!r}'.format(col))
    
    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    if col not in valid_dict.keys():
        logging.error('{!r} column not found in validation sheet'.format(col))
        return
    assert level in ('i','w','e'), '{!r} invalid logging level for validate_values'.format(level)
    
    series = df[col]
    if na_values:
        series = exclude_missing(series, na_values)
    
    col_values = set(series.unique())
    # use separator to split values
    if sep:
        sep_col_values = list()
        for v in col_values:
            sep_col_values.extend([x.strip() for x in v.split(sep)])
        col_values = set(sep_col_values)
    valid_values = set(valid_dict[col])
    invalid_values = col_values - valid_values
    if len(invalid_values) > 0:
        msg = 'invalid values in {!r}: {}'.format(col, invalid_values)
        if level == 'i':
            logging.info(msg)
        elif level == 'w':
            logging.warning(msg)
        elif level == 'e':
            logging.error(msg)
#     else:
#         logging.info('all values valid in {!r}'.format(col))
            
validate_values('ORGANISM_PART', df, valid_dict, sep=" | ")

[INFO] validating values in column 'ORGANISM_PART'


In [22]:
def validate_date(col, df):
    
    logging.info('validating date column {!r}'.format(col))

    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    series = df[col]
    # missing date not allowed
    # series = exclude_missing(series, na_values)
    
    # invalid date formats
    # empty string converted to NaT
    date_series = pd.to_datetime(series, format='%Y-%m-%d', errors='coerce')
    if date_series.isna().any():
        logging.error('invalid dates in {!r}: {}'.format(col, 
                                                         series[date_series.isna()].unique()))
    valid_date_series = date_series[~date_series.isna()]
    
    # dates in future
    future_dates = (valid_date_series > datetime.datetime.today())
    if future_dates.any():
        logging.error('future dates in {!r}: {}'.format(col,
            valid_date_series[future_dates].to_list()))
        
    # dates too old
    old_dates = (valid_date_series < datetime.datetime.strptime('1900-01-01', '%Y-%m-%d'))
    if old_dates.any():
        logging.error("pre-1900 dates in {!r}: {}".format(col,
            valid_date_series[old_dates].to_list())) 
    
    return valid_date_series
df.loc[1,'DATE_OF_COLLECTION'] = 'NOT_COLLECTED'
validate_date('DATE_OF_COLLECTION', df)

[INFO] validating date column 'DATE_OF_COLLECTION'
[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_COLLECTED']


SERIES
2    1905-05-26
3    1905-05-26
4    1905-05-26
5    1905-05-26
6    1905-05-26
        ...    
91   1905-05-14
92   1905-05-16
93   1905-05-16
94   1905-05-10
95   1905-05-14
Name: DATE_OF_COLLECTION, Length: 94, dtype: datetime64[ns]

In [23]:
series = pd.Series(['2000'])
pd.to_datetime(series, format='%Y-%m-%d', errors='coerce')

0   2000-01-01
dtype: datetime64[ns]

In [24]:
def validate_time(col, df, na_values=['NOT_COLLECTED']):
    
    logging.info('validating time column {!r}'.format(col))
    
    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    series = df[col]
    series = exclude_missing(series, na_values)
        
    # invalid time formats
    # NB empty string converted to NaT
    time_series = pd.to_datetime(series, format='%H:%M:%S', errors='coerce')
    if time_series.isna().any():
        logging.error('invalid times in {!r}: {}'.format(col, 
                                                         series[time_series.isna()].unique()))
    valid_time_series = time_series[~time_series.isna()]
    
    return valid_time_series
# df.loc[1,'TIME_OF_COLLECTION'] = '23'
validate_time('TIME_OF_COLLECTION', df)

[INFO] validating time column 'TIME_OF_COLLECTION'
[ERROR] invalid times in 'TIME_OF_COLLECTION': ['']


Series([], Name: TIME_OF_COLLECTION, dtype: datetime64[ns])

In [25]:
def validate_time_period(col, df, na_values=['NOT_COLLECTED']):
    
    logging.info('validating time period column {!r}'.format(col))
    
    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    series = df[col]
    series = exclude_missing(series, na_values)

    # conversion with modifications for proper parsing 
    # by pd.Timedelta (does not accept missing data, e.g. 'PT1H')
    # note - will not work for weeks and months
    def convert_iso_duration(s):
        if s == np.nan:
            return np.nan
        if not s.startswith('P') or 'T' not in s:
            return np.nan
        # add days
        if s.startswith('PT'):
            s = s.replace('PT','P0DT')
        # add trailing minutes and seconds
        if s.endswith('H'):
            s += '0M0S'
        elif s.endswith('M'):
            s += '0S'
        try:
            return pd.Timedelta(s)
        except:
            return np.nan
    time_period_series = series.apply(convert_iso_duration)
    if time_period_series.isna().any():
        logging.error('invalid times in {!r}: {}'.format(col, 
            series[time_period_series.isna()].unique()))
    valid_time_period_series = time_period_series[~time_period_series.isna()]
    return valid_time_period_series

# df.loc[1,'DURATION_OF_COLLECTION'] = 'PVT1H'
validate_time_period('DURATION_OF_COLLECTION', df);
# df['DURATION_OF_COLLECTION']

[INFO] validating time period column 'DURATION_OF_COLLECTION'
[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['']


In [26]:
def check_location(df, fn):
    
    logging.info('validating country with coordinates')
    
    loc_col, lat_col, lon_col = 'COLLECTION_LOCATION', 'DECIMAL_LATITUDE', 'DECIMAL_LONGITUDE'

    try:
        loc_df_complete = df[[loc_col, lat_col, lon_col]].copy()
    except:
        logging.error('One of {!r} {!r} {!r} columns not found in manifest'.format(loc_col, lat_col, lon_col))
        return
#     loc_df_isna = (loc_df.isin(na_values)).all(axis=1)
#     if loc_df_isna.any():
#         logging.info('removing {} {!r} samples with missing data from coordinate analysis'.format(
#                 loc_df_isna.sum(), na_values))
#     loc_df_complete = loc_df[~loc_df_isna].copy()
    
    # coordinates in geopy format
    loc_df_complete['coord'] = loc_df_complete.apply(lambda x: '{}, {}'.format(
            x[lat_col], x[lon_col]), axis=1)
    
    # get location data for coordinates
    # use local copy of web query results for re-runs
    # this 
    loc_fn = fn+'_loc.pkl'
    if os.path.isfile(loc_fn):
        locations = pickle.load(open(loc_fn, "rb"))
    else:
        # web map server - openstreetmaps
        logging.info('querying coordinates')
        locator = Nominatim(user_agent='myGeocoder')
        rgeocode = RateLimiter(locator.reverse, min_delay_seconds=1)

        locations = dict()
        for c in loc_df_complete.coord.unique():
            # pre-fill with unknown country
            locations[c] = {'address':{'country':'UNKNOWN'}}
            # check coordniate correctness
            try:
                lat, lon = c.split(', ')
                lat, lon = float(lat), float(lon)
            except:
                logging.error('problem parsing coordinates {!r}'.format(c))
                continue
            if abs(lat) > 90:
                logging.error('invalid latitude {}, should be in [-90,90]'.format(lat))
                continue
            if abs(lon) > 180:
                logging.error('invalid longitude {}, should be in [-180,180]'.format(lon))
                continue
            # web query
            location = rgeocode(c, language='en-gb')
            # rgeocode returns empty location outside of counries and in some other situations
            if location is not None:
                locations[c] = location.raw

        # save locations to file
        pickle.dump(locations, open(loc_fn, "wb"))
        
    # parse country from partner input
    loc_df_complete['partner_country'] = loc_df_complete[loc_col].apply(lambda x: x.split('|')[0].strip().upper())
    
    # extract countries from location data
    loc_countries = dict()
    for coord in locations.keys():
        coord_country = locations[coord]['address']['country'].upper()
        loc_countries[coord] = coord_country
        
        partner_countries = loc_df_complete.loc[loc_df_complete.coord == coord, 'partner_country']
        if partner_countries.nunique() > 1:
            logging.error('multiple partner countries for coordinates {!r}: {}'
                          'skipping coordinate validation'.format(
                                coord, partner_countries.unique()))
            continue
        if partner_countries.shape[0] == 0:
            logging.error('no partner location found for coordinates {!r}'.format(coord))
            continue
        partner_country = partner_countries.iloc[0]
        if coord_country == 'UNKNOWN':
            logging.warning('could not locate country for coordinates {!r}, partner country {!r}'.format(
                    coord, partner_country))
        elif partner_country != coord_country:
            logging.error('country mismatch for coordinates {!r}, partner country {!r}, '
                          'coordinate country {!r}'.format(coord, partner_country, coord_country))
    
    # countries based on coordinates
    loc_df_complete['coord_country'] = loc_df_complete['coord'].replace(loc_countries)
    country_mismatch = (loc_df_complete.coord_country != loc_df_complete.partner_country)

#     if country_mismatch.any():
#         logging.error('coordinates do not match country for SERIES: {}'.format(
#                 country_mismatch[country_mismatch].index.to_list()))
    
    # location data can be re-used, e.g. as an additional field
    return loc_df_complete
# df.loc[2,'DECIMAL_LATITUDE'] = '65'
loc_test = check_location(df, fn)
loc_test

[INFO] validating country with coordinates
[ERROR] country mismatch for coordinates '-3.889444, 14.452778', partner country 'REPUBLIC OF THE CONGO', coordinate country 'CONGO-BRAZZAVILLE'
[ERROR] country mismatch for coordinates '9.593611, -5.197222', partner country "COTE D'IVOIRE", coordinate country "CÔTE D'IVOIRE"
[ERROR] country mismatch for coordinates '5.340816, -4.133226', partner country "COTE D'IVOIRE", coordinate country "CÔTE D'IVOIRE"
[ERROR] country mismatch for coordinates '6.6545, -4.710111', partner country "COTE D'IVOIRE", coordinate country "CÔTE D'IVOIRE"
[ERROR] country mismatch for coordinates '6.729722, -3.496389', partner country "COTE D'IVOIRE", coordinate country "CÔTE D'IVOIRE"
[ERROR] no partner location found for coordinates ', '


Unnamed: 0_level_0,COLLECTION_LOCATION,DECIMAL_LATITUDE,DECIMAL_LONGITUDE,coord,partner_country,coord_country
SERIES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Republic of the Congo | Louhoulou (forêt de Ba...,-3.889444,14.452778,"-3.889444, 14.452778",REPUBLIC OF THE CONGO,CONGO-BRAZZAVILLE
2,Republic of the Congo | Louhoulou (forêt de Ba...,-3.889444,14.452778,"-3.889444, 14.452778",REPUBLIC OF THE CONGO,CONGO-BRAZZAVILLE
3,Republic of the Congo | Louhoulou (forêt de Ba...,-3.889444,14.452778,"-3.889444, 14.452778",REPUBLIC OF THE CONGO,CONGO-BRAZZAVILLE
4,Republic of the Congo | Louhoulou (forêt de Ba...,-3.889444,14.452778,"-3.889444, 14.452778",REPUBLIC OF THE CONGO,CONGO-BRAZZAVILLE
5,Republic of the Congo | Louhoulou (forêt de Ba...,-3.889444,14.452778,"-3.889444, 14.452778",REPUBLIC OF THE CONGO,CONGO-BRAZZAVILLE
...,...,...,...,...,...,...
91,Cameroon | Yaoundé Awaé,3.901848,11.880727,"3.901848, 11.880727",CAMEROON,CAMEROON
92,Mauritania | Kaedi,16.153722,-13.503694,"16.153722, -13.503694",MAURITANIA,MAURITANIA
93,Burkina Faso | Volta Blanche (au sud route Oua...,12.233479,-1.083331,"12.233479, -1.083331",BURKINA FASO,BURKINA FASO
94,Cote d'Ivoire | Adiopodoumé,5.340816,-4.133226,"5.340816, -4.133226",COTE D'IVOIRE,CÔTE D'IVOIRE


In [27]:
l = pd.DataFrame(loc_dict).T
pd.DataFrame(l['address'].to_list())

NameError: name 'loc_dict' is not defined

In [28]:

# valid_spp = list()
valid_spp = pd.read_csv(spp_fn, header=None)[0].to_list()
len(valid_spp)

496

In [29]:
def validate_scientific_names(col, df, spp_fn, na_values = ['NOT_COLLECTED']):
    
    logging.info('validating species names against {!r}'.format(spp_fn))

    # read species names from file
    valid_spp = pd.read_csv(spp_fn, header=None)[0].to_list()
    
    if col not in df.columns:
        logging.error('{!r} column not found in manifest'.format(col))
        return
    series = df[col]
    series = exclude_missing(series, na_values)
    
    correct_genus = series.str.startswith('Anopheles')
    if not correct_genus.all():
        logging.error('expected Anopheles as genus in {!r} column, found: {}'.format(
                      col, series[~correct_genus].unique()))
    
    species = series.str.split(' ').str.get(1)
    correct_species = species.isin(valid_spp)
    
    if not correct_species.all():
        logging.error('species not in {!r} found in {!r} column: {}'.format(
                      spp_fn, col, series[~correct_species].unique()))
# df.loc[1, 'SCIENTIFIC_NAME'] = 'Anapherla askfnvjn'
validate_scientific_names('SCIENTIFIC_NAME', df, spp_fn)

[INFO] validating species names against '../../analysis/0_partner/data/harbach_spp_201910.txt'


In [30]:
def check_undeleted_example(df):
    
    logging.info('checking if example row was not deleted')
    
    if (df.index == 'example').any() | \
       (df.RACK_OR_PLATE_ID == 'PLATE_MMMM2222').any() | \
       (df.CATCH_ID == 'ignore this field').any():
        logging.error('example row was not deleted')

In [33]:
def validate(fn, template_fn, spp_fn, verbose=False, version='2.0'):
    '''
    Validation follows the order of columns order in data entry sheet
    '''

    setup_logging(verbose=verbose)

    logging.info('# started validate_partner_manifest_v.{}'.format(version))

    # read data
    df = get_data(fn)
    
    # prepare validation
    template_df = get_data(template_fn)
    check_columns(df, template_df)
    valid_dict = get_valid_dict(template_fn)

    # exclude blanks
    df = check_exclude_blanks(df)
    
    
    validate_values('TUBE_OR_WELL_ID', df, valid_dict)
    validate_values('ORGANISM_PART', df, valid_dict, sep='|')

    validate_values('ORDER', df, valid_dict)
    validate_values('FAMILY', df, valid_dict)
    validate_values('GENUS', df, valid_dict)
    
    validate_scientific_names('SCIENTIFIC_NAME', df, spp_fn, na_values=['NOT_COLLECTED'])
        
    validate_values('LIFESTAGE', df, valid_dict)
    validate_values('SEX', df, valid_dict)
    
    validate_values('BLOOD_MEAL', df, valid_dict, na_values=[''])
    validate_values('GRAVIDITY', df, valid_dict, na_values=[''])
    
    validate_date('DATE_OF_COLLECTION', df)

    check_location(df, fn)
    
    #validate_format('ELEVATION', dtype=int)
    
    validate_time('TIME_OF_COLLECTION', df)
    validate_time_period('DURATION_OF_COLLECTION', df)

    validate_values('COLLECTION_METHOD', df, valid_dict, level='w')
    validate_values('OUTDOORS_INDOORS', df, valid_dict)
    validate_values('PRESERVATIVE_SOLUTION', df, valid_dict, level='w')
    
    #validate_format('TIME_ELAPSED_FROM_COLLECTION_TO_PRESERVATION', dtype=int)
    validate_date('DATE_OF_PRESERVATION', df)
    #compare_dates(before='DATE_OF_COLLECTION', after='DATE_OF_PRESERVATION')
    
    validate_values('HAZARD_GROUP', df, valid_dict)
    validate_values('REGULATORY_COMPLIANCE', df, valid_dict)
    
    check_undeleted_example(df)
    
    logging.info('# ended validate_partner_manifest_v.{}'.format(version))

    return df

fn = '../../results/partner_manifests/IRD-Neandersquito_T222Amplicon_Manifest_V2.0.xlsx'
df = validate(fn, template_fn, spp_fn, verbose=False)

[ERROR] for blanks, ORGANISM_PART expected to be BLANK_SAMPLE, found {'CONTROL_WELL'}
[ERROR] country mismatch for coordinates '-3.889444, 14.452778', partner country 'REPUBLIC OF THE CONGO', coordinate country 'CONGO-BRAZZAVILLE'
[ERROR] country mismatch for coordinates '9.593611, -5.197222', partner country "COTE D'IVOIRE", coordinate country "CÔTE D'IVOIRE"
[ERROR] country mismatch for coordinates '5.340816, -4.133226', partner country "COTE D'IVOIRE", coordinate country "CÔTE D'IVOIRE"
[ERROR] country mismatch for coordinates '6.6545, -4.710111', partner country "COTE D'IVOIRE", coordinate country "CÔTE D'IVOIRE"
[ERROR] country mismatch for coordinates '6.729722, -3.496389', partner country "COTE D'IVOIRE", coordinate country "CÔTE D'IVOIRE"
[ERROR] no partner location found for coordinates ', '
[ERROR] invalid times in 'TIME_OF_COLLECTION': ['']
[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['']
[ERROR] invalid values in 'OUTDOORS_INDOORS': {''}
[ERROR] invalid dates in 'DAT

In [21]:
fn = '../../results/partner_manifests/1264_PAMCA_16Apr2021_T222Amplicon_Manifest_V1.1.xlsx'
df = validate(fn, template_fn, spp_fn, verbose=False)

[ERROR] last well H12 is not blank at SERIES [97, 193, 289, 385, 481, 668, 764]
[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['']
[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['NOT_COLLECTED']
[ERROR] example row was not deleted


In [22]:
fn = '../../results/partner_manifests/1272_PAMCA_16_Apr2021_T222Amplicon_Manifest_V1.1.xlsx'
df = validate(fn, template_fn, spp_fn,  verbose=False)

[ERROR] template columns missing from filled manifest: {'MISC_METADATA'}
[ERROR] for blanks, ORGANISM_PART expected to be BLANK_SAMPLE, found {''}
[ERROR] invalid values in 'TUBE_OR_WELL_ID': {'6A'}
[ERROR] invalid values in 'ORGANISM_PART': {''}
[ERROR] expected Anopheles as genus in 'SCIENTIFIC_NAME' column, found: ['unknown']
[ERROR] species not in 'data/harbach_spp_201910.txt' found in 'SCIENTIFIC_NAME' column: ['unknown']
[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['NOT_COLLECTED' 'NOT_APPLICABLE']
[ERROR] example row was not deleted


In [23]:
fn = '../../results/partner_manifests/AYALA_20210426_T222Amplicon_Manifest_V1.1.xlsx'
validate(fn, template_fn, spp_fn,  verbose=False)

FileNotFoundError: [Errno 2] No such file or directory: '../../results/partner_manifests/AYALA_20210426_T222Amplicon_Manifest_V1.1.xlsx'

In [None]:
test_loc_fn = 'data/WALTON_09_Mar2021_T222Amplicon_Manifest_V1.0.xlsx_loc.pkl'
locations = pickle.load(open(test_loc_fn, "rb"))