# This is the code we are using to re-create chase's tool for the infauna data 

First we import the necessary packages

In [85]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

Then read in the raw data and prepare it

In [86]:
initial = pd.read_excel("data/Infauna_QC_B13_OCSD.xlsm", sheet_name = 'Original_Data')
qc = pd.read_excel("data/Infauna_QC_B13_OCSD.xlsm", sheet_name = 'QC_Data')
qc.rename(columns={" SITE": "SITE"}, inplace=True)

Here we build the dataframe that will become the discrepancy report by merging the initial and QC dataframes

In [87]:
output = pd.merge(initial, qc, left_on = ['SITE', 'ORIGINAL SPECIES'], right_on = ['SITE', 'QC SPECIES'], how = 'outer')
output = output[['SITE', 'ORIGINAL SPECIES', 'ORIGINAL ABUNDANCE', 'ORIGINAL VOUCHER', 'QC SPECIES', 'QC ABUNDANCE']]

The next two lines add the columns stating whether or not there is a match, and the type of discrepancy

In [88]:
output['Match/Not Match'] = output.apply(lambda x: 'Match' if ((x['ORIGINAL SPECIES'] != x['QC SPECIES']) & (x['ORIGINAL ABUNDANCE'] == x['QC ABUNDANCE'])) else 'No Match', axis=1)

In [89]:
output['Type'] = output.apply(lambda x: 'ID' if (x['ORIGINAL SPECIES'] != x['QC SPECIES']) else 'Count' if (x['ORIGINAL ABUNDANCE'] != x['QC ABUNDANCE']) else np.nan, axis = 1)

This next cell is purely for cosmetic purposes

In [90]:
output.sort_values(['SITE', 'Match/Not Match', 'ORIGINAL SPECIES', 'QC SPECIES'], inplace=True)
output.reset_index(inplace=True)
output.drop("index", axis=1, inplace=True)

And here are the dataframes, the initial, the QC data, and the output

In [91]:
initial.head()

Unnamed: 0,ORIGINAL SPECIES,ORIGINAL ABUNDANCE,ORIGINAL VOUCHER,SITE
0,Theora lubrica,7,,B13-8328
1,Psammotreta obesa,1,,B13-8328
2,Leukoma staminea,5,,B13-8328
3,Chione californiensis,2,,B13-8328
4,Caecum californicum,1,,B13-8328
5,Bulla gouldiana,1,,B13-8328
6,Acteocina carinata,4,,B13-8328
7,Nicolea sp A,1,1.0,B13-8328
8,Diplocirrus sp SD1,1,,B13-8328
9,Tubulanus sp SD1,1,1.0,B13-8328


In [92]:
qc.head()

Unnamed: 0,QC SPECIES,QC ABUNDANCE,SITE
0,Protomima imitatrix,42,B13-8328
1,Scolelepis (Parascolelepis) texana,1,B13-8328
2,Rudilemboides stenopropodus,18,B13-8328
3,Psammotreta obesa,1,B13-8328
4,Pseudopolydora paucibranchiata,11,B13-8328
5,Prionospio (Prionospio) heterobranchia,4,B13-8328
6,Polycirrus sp,1,B13-8328
7,Prionospio (Prionospio) sp,2,B13-8328
8,Sphaerosyllis californiensis,2,B13-8328
9,Scoletoma sp B,1,B13-8328


Below is the final output from our re-creation of Chase's tool. You can view the whole thing by clicking on DiscrepancyReport.csv

In [93]:
output.head()

Unnamed: 0,SITE,ORIGINAL SPECIES,ORIGINAL ABUNDANCE,ORIGINAL VOUCHER,QC SPECIES,QC ABUNDANCE,Match/Not Match,Type
0,B13-8328,ARTHROPODA,2.0,,,,No Match,ID
1,B13-8328,Acteocina carinata,4.0,,,,No Match,ID
2,B13-8328,Acuminodeutopus heteruropus,4.0,,,,No Match,ID
3,B13-8328,Ampelisca brachycladus,1.0,,,,No Match,ID
4,B13-8328,Ampharete labrops,2.0,,,,No Match,ID
5,B13-8328,Amphicteis scaphobranchiata,1.0,,Amphicteis scaphobranchiata,1.0,No Match,
6,B13-8328,Amphideutopus oculatus,101.0,,Amphideutopus oculatus,105.0,No Match,Count
7,B13-8328,Amphiodia digitata,2.0,,Amphiodia digitata,1.0,No Match,Count
8,B13-8328,Amphipholis squamata,6.0,,Amphipholis squamata,6.0,No Match,
9,B13-8328,Anoplodactylus erectus,1.0,,Anoplodactylus erectus,1.0,No Match,
