<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Setup</a></span></li><li><span><a href="#Ingest-the-Master-Taxonomy" data-toc-modified-id="Ingest-the-Master-Taxonomy-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Ingest the Master Taxonomy</a></span><ul class="toc-item"><li><span><a href="#Ingest-the-Table-Itself" data-toc-modified-id="Ingest-the-Table-Itself-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Ingest the Table Itself</a></span></li><li><span><a href="#Map-Sample-IDs-to-Scientific-Names" data-toc-modified-id="Map-Sample-IDs-to-Scientific-Names-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Map Sample IDs to Scientific Names</a></span></li></ul></li><li><span><a href="#Ingest-the-Sample-Plates" data-toc-modified-id="Ingest-the-Sample-Plates-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Ingest the Sample Plates</a></span></li><li><span><a href="#Ingest-the-Picogreen-Sheet" data-toc-modified-id="Ingest-the-Picogreen-Sheet-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Ingest the Picogreen Sheet</a></span></li><li><span><a href="#Ingest-the-Genbank-Loci-Sheet" data-toc-modified-id="Ingest-the-Genbank-Loci-Sheet-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Ingest the Genbank Loci Sheet</a></span></li><li><span><a href="#Ingest-Nitfix-1" data-toc-modified-id="Ingest-Nitfix-1-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Ingest Nitfix 1</a></span></li></ul></div>

# Setup

In [1]:
import os
import re
from pathlib import Path

import pandas as pd
import dropbox
from dotenv import load_dotenv, find_dotenv

import lib.db as db
import lib.util as util
import lib.google as google

In [2]:
# Locate the project's .env file, then load its variables into the process
# environment so the os.getenv lookup for 'DROPBOX' further down can see them.
env_file = find_dotenv()
load_dotenv(env_file)

True

In [3]:
# Database connection (opened via lib.db) that every `to_sql` call in the
# ingest cells of this notebook writes to.
CXN = db.connect()

In [4]:
# Data directories, relative to the notebook's working directory.
# INTERIM_DATA receives the raw downloads; PROCESSED_DATA receives the
# cleaned CSVs written at the end of each ingest cell.
DATA_DIR = Path('..') / 'data'
INTERIM_DATA = DATA_DIR / 'interim'
PROCESSED_DATA = DATA_DIR / 'processed'

In [5]:
# The Dropbox credential is read from the environment (populated from .env
# by load_dotenv above) so that it never appears in the notebook itself.
DROPBOX = os.environ.get('DROPBOX')

# Shared Dropbox client used by the download in the Nitfix 1 cell.
DBX = dropbox.Dropbox(DROPBOX)

# Ingest the Master Taxonomy

## Ingest the Table Itself

In [6]:
csv_name = 'taxonomy.csv'

# Column names imposed on the sheet; they replace the sheet's own header row.
taxonomy_columns = [
    'taxon_key',
    'family',
    'scientific_name',
    'authority',
    'synonyms',
    'sample_ids',
    'provider_acronym',
    'provider_id',
    'quality_notes',
]

# Download the Google sheet as a raw CSV into the interim directory.
csv_path = INTERIM_DATA / csv_name
with csv_path.open('wb') as csv_out:
    google.export_sheet_csv('NitFixMasterTaxonomy', csv_out)

taxonomy = pd.read_csv(csv_path, header=0, names=taxonomy_columns)

# The genus is the first whitespace-separated word of the scientific name.
taxonomy['genus'] = taxonomy['scientific_name'].str.split().str.get(0)

# Persist to the database and to the processed directory.
taxonomy.to_sql(name='taxons', con=CXN, if_exists='replace')

csv_path = PROCESSED_DATA / csv_name
taxonomy.to_csv(csv_path, index=False)

taxonomy.head()

Unnamed: 0,taxon_key,family,scientific_name,authority,synonyms,sample_ids,provider_acronym,provider_id,quality_notes,genus
0,kew-2640275,Anisophylleaceae,Anisophyllea apetala,Scort. ex King,,,,,,Anisophyllea
1,kew-2640276,Anisophylleaceae,Anisophyllea beccariana,Baill.,,,,,,Anisophyllea
2,kew-2640277,Anisophylleaceae,Anisophyllea boehmii,Engl.,"Anisophyllea exellii, Anisophyllea gossweileri",,,,,Anisophyllea
3,kew-2640279,Anisophylleaceae,Anisophyllea buchneri,Engl. & Brehmer,,,,,,Anisophyllea
4,kew-2640280,Anisophylleaceae,Anisophyllea buettneri,Engl.,Anisophyllea brachystila,,,,,Anisophyllea


## Map Sample IDs to Scientific Names

In [7]:
csv_name = 'samples.csv'

# One record per sample ID: a taxonomy row whose sample_ids field yields
# several IDs (as split by util.split_uuids) produces several records, each
# carrying that row's names.
taxons = [
    {
        'sample_id': guid,
        'scientific_name': taxon.scientific_name,
        'family': taxon.family,
        'genus': taxon.genus,
    }
    for _, taxon in taxonomy.iterrows()
    for guid in util.split_uuids(taxon.sample_ids)
]

df = pd.DataFrame(taxons)

# Persist to the database and to the processed directory.
df.to_sql(name='samples', con=CXN, if_exists='replace')

csv_path = PROCESSED_DATA / csv_name
df.to_csv(csv_path, index=False)

df.head()

Unnamed: 0,family,genus,sample_id,scientific_name
0,Begoniaceae,Begonia,a44674eb-9a44-469e-a63f-8d062d9e1c60,Begonia acerifolia
1,Begoniaceae,Begonia,4360e849-b172-4ed9-973f-c5899b17842c,Begonia acetosella
2,Begoniaceae,Begonia,45ed57c1-fcbc-4aca-adaa-1c3938c9cf24,Begonia almedana
3,Begoniaceae,Begonia,45e91b4e-fca1-485e-9dce-bb030900a807,Begonia angustiloba
4,Begoniaceae,Begonia,45e3b5f1-9f85-4b15-b96c-ca03aca2148d,Begonia aptera


# Ingest the Sample Plates

Get the entered data from the sample_plates Google sheet.

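There is a fixed format to the plates. Each plate is a block of 14 rows: six rows of plate-level metadata followed by eight rows (A–H), each holding the twelve wells of one plate row: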
```
                        Plate column 1  ...     Plate column 12
plate_id:UUID
entry_date:ISO_Date
local_id:Text
protocol:Text
notes:Text
results:Text
Plate row A                UUID?          ...     UUID?
    .                        .            ...       .
    .                        .            ...       .
    .                        .            ...       .
Plate row H                UUID?          ...     UUID?
```

In [8]:
csv_name = 'sample_plates.csv'

csv_path = INTERIM_DATA / csv_name

# Layout of one plate block in the sheet once blank rows are removed:
# one row per plate-level field, then one row per plate row (A-H) whose
# columns 1-12 hold the well contents.
plate_fields = [
    'plate_id',
    'entry_date',
    'local_id',
    'protocol',
    'notes',
    'results',
]
rows = 'ABCDEFGH'
plate_cols = range(1, 13)
row_start = len(plate_fields)       # 6: first well row inside a block
step = row_start + len(rows)        # 14: sheet rows per plate block

# Download the Google sheet as a raw CSV into the interim directory.
with open(csv_path, 'wb') as csv_out:
    google.export_sheet_csv('sample_plates', csv_out)

df = pd.read_csv(csv_path)

# Drop the rows with nothing in the first column so that every plate
# occupies exactly `step` consecutive rows.
df = df[df['Plate ID'].notna()].reset_index(drop=True)

# Get all of the per plate information into a data frame: row i of every
# block (first sheet column) becomes column i, one line per plate.
plates = pd.concat(
    [df.iloc[i::step, [0]].reset_index(drop=True) for i in range(row_start)],
    axis=1,
    ignore_index=True)

# Append per well information with the per plate information for each well.
# reset_index() returns a fresh frame, so adding columns to it does not write
# into a slice of `df`.
wells = []
for row_offset, row_label in enumerate(rows):
    for col in plate_cols:
        well = df.iloc[row_start + row_offset::step, [col]]
        well = well.reset_index(drop=True)
        well['row'] = row_label
        well['col'] = col
        wells.append(pd.concat([plates, well], axis=1, ignore_index=True))

# After ignore_index the columns are numbered 0..8: the plate fields first,
# then the well's sample ID, row letter and column number.
wells = pd.concat(wells, axis=0, ignore_index=True)
wells = wells.rename(
    columns=dict(enumerate(plate_fields + ['sample_id', 'row', 'col'])))

wells.to_sql('plates', CXN, if_exists='replace')

# Write the reshaped per-well table (not the raw sheet) so the processed CSV
# matches what was stored in the 'plates' table.
csv_path = PROCESSED_DATA / csv_name
wells.to_csv(csv_path, index=False)

wells.head()

Unnamed: 0,plate_id,entry_date,local_id,protocol,notes,results,sample_id,row,col
0,00f5f483-3657-40de-8aad-7639c6b8e74a,2018-01-11,Local identifier: NITFIX_1,Protocol: Protocol_NitFix_1,"Notes: OSU SAMPLES. Failed grinding,low yield,...",Quantification NA,x,A,1
1,02b81f81-0fc3-45de-9ad4-0d85eb8d5c94,2018-01-17,Local identifier: NITFIX_2,Protocol,"Notes: OSU SAMPLES. Tube and cap failure, samp...",Quantification NA,x,A,1
2,031fc196-3587-477d-8bd2-4a9f5167be4d,2018-01-18,Local identifier:NITFIX_3,Protocol,Notes: OSU SAMPLES,Quantification 3/5,ade73b3b-79de-407d-b9d2-6c4f850309bc,A,1
3,037a4923-94f1-4134-b6dc-b36478e37bcc,2018-01-19,Local identifier: NITFIX_4,Protocol,"Notes:Contamination, Samples Discarded",Quantification NA,x,A,1
4,04a4aca9-a339-40f6-b2f0-047b1513e4de,2018-01-23,Local identifier: NITFIX_5,Protocol,Notes: OSU SAMPLES.,Quantification 3/5,b5bc9a61-1be8-4d9c-9722-6ebe5fa0f244,A,1



# Ingest the Picogreen Sheet

In [9]:
csv_name = 'picogreen.csv'

# Column names imposed on the sheet; they replace the sheet's own header row.
picogreen_columns = [
    'picogreen_id',
    'well',
    'rfu',
    'ng_microliter',
    'ng_microliter_mean',
    'quant_method',
    'quant_date',
    'sample_id',
]

# Download the Google sheet as a raw CSV into the interim directory.
csv_path = INTERIM_DATA / csv_name
with csv_path.open('wb') as csv_out:
    google.export_sheet_csv('picogreen_2_14_2_15', csv_out)

df = pd.read_csv(csv_path, header=0, names=picogreen_columns)

# Persist to the database and to the processed directory.
df.to_sql(name='picogreen', con=CXN, if_exists='replace')

csv_path = PROCESSED_DATA / csv_name
df.to_csv(csv_path, index=False)

df.head()

Unnamed: 0,picogreen_id,well,rfu,ng_microliter,ng_microliter_mean,quant_method,quant_date,sample_id
0,13_01,A1,195.286,9.645493,10.189264,picogreen,2_15_18,c84c6871-887f-479e-bf1e-ff1c68b1c490
1,,B1,217.399,10.733035,,picogreen,2_15_18,
2,13_02,A2,1149.629,56.581164,65.824389,picogreen,2_15_18,e58072c6-ce0f-4029-9246-9756c391d944
3,,B2,1525.514,75.067614,,picogreen,2_15_18,
4,13_03,A3,337.331,16.631427,14.516884,picogreen,2_15_18,d07a86d9-dab9-45c9-a547-36b15ccc1dd7


# Ingest the Genbank Loci Sheet

In [10]:
csv_name = 'genbank_loci.csv'

# Column names imposed on the sheet; they replace the sheet's own header row.
loci_columns = ['scientific_name', 'its', 'atpb', 'matk', 'matr', 'rbcl']

# Download the Google sheet as a raw CSV into the interim directory.
csv_path = INTERIM_DATA / csv_name
with csv_path.open('wb') as csv_out:
    google.export_sheet_csv('genbank_loci', csv_out)

df = pd.read_csv(csv_path, header=0, names=loci_columns)

# Persist to the database and to the processed directory.
df.to_sql(name='loci', con=CXN, if_exists='replace')

csv_path = PROCESSED_DATA / csv_name
df.to_csv(csv_path, index=False)

df.head()

Unnamed: 0,scientific_name,its,atpb,matk,matr,rbcl
0,Abarema abbottii,0,0,0,0,0
1,Abarema acreana,0,0,0,0,0
2,Abarema adenophora,0,0,0,0,0
3,Abarema adenophorum,0,0,0,0,0
4,Abarema agropecuaria,0,0,0,0,0


# Ingest Nitfix 1

In [11]:
file_name = 'nitfix01.csv'

# The local destination is handed to the Dropbox client as a plain string.
csv_path = str(INTERIM_DATA / file_name)
dbx_path = 'id:zSBrtnqOfSAAAAAAAAAAKw/5657_Nit_Fix_I.reconcile.4.2.csv'

file_metadata = DBX.files_download_to_file(csv_path, dbx_path)

df = pd.read_csv(csv_path)


def normalize_column(name):
    """Turn a raw reconciled-CSV header into a snake_case column name.

    The header is lower-cased, the coordinate marks are spelled out
    (``⁰`` -> ``deg``, ``''`` -> ``sec``, ``'`` -> ``min``), every run of
    characters outside ``[a-z0-9_]`` becomes a single underscore, and one
    leading/trailing underscore is stripped. The QR-code column is then
    renamed to ``sample_id``.
    """
    name = name.lower()
    name = name.replace('⁰', 'deg')
    # The double quote mark must be handled before the single one, otherwise
    # "''" would be rewritten as two 'min' markers.
    name = name.replace("''", 'sec')
    name = name.replace("'", 'min')
    name = re.sub(r'[^a-z0-9_]+', '_', name)
    name = re.sub(r'^_|_$', '', name)
    # Compare against the *normalized* name so the rename still applies when
    # the raw header differs in case or punctuation.
    return 'sample_id' if name == 'subject_qr_code' else name


columns = {old: normalize_column(old) for old in df.columns}

df = df.rename(columns=columns)

# Persist to the database and to the processed directory.
df.to_sql('reconciled', CXN, if_exists='replace')

csv_path = PROCESSED_DATA / file_name
df.to_csv(csv_path, index=False)

df.head()

Unnamed: 0,subject_id,country,state_province,county,location,minimum_elevation,maximum_elevation,main_dropdown,latitude_deg,latitude_min,...,month_1,day_1,year_1,month_2,day_2,year_2,subject_image_name,subject_nybg_bar_code,subject_resolved_name,sample_id
0,16192935,United States of America,North Carolina,Wayne,"Town of Fremont, along NC Rt.222, east of Evan...",,,feet,35,32,...,9 - September,25,2011,,,,R0001220.JPG,NYBG 3196996,Senna obtusifolia,8e37959f-dfa6-44b6-a201-b94215340016
1,16192937,United States of America,Arizona,Maricopa,"Salt Rivr at 35th Avenue bridge in Phoenix, ju...",1022.0,,feet,33.411913,,...,1 - January,21,2012,,,,R0001205.JPG,NYBG 3196995,Senna artemisioides,90a9d5ee-a1c6-4dd3-b6b1-6932ea796abd
2,16192938,Gabon,Ogooué-Lolo,,"Makande surroundings, c. 65 km SSW of Booué. I...",,,,- 0,41 S,...,2 - February,11,1999,,,,R0001202.JPG,NYBG 3196994,Scorodophloeus zenkeri,90f68e06-c5cb-48dc-9de1-5c0512314486
3,16192939,"Tanzania, United Republic of",Tanga,,"Mkaramo Parish, Mkwaja Subchiefeom, Mwera Chie...",150.0,,feet,,,...,7 - July,10,1957,,,,R0001201.JPG,NYBG 3196992,Scorodophloeus fischeri,90fb8362-a4ed-407d-a8b1-32dc56506101
4,16192941,Congo (Democratic Republic of the),Kasaï-Central,,Babadi - Kasai,,,unknown,,,...,12 - December,Not Shown,1934,,,,R0001199.JPG,NYBG 3196991,Leonardoxa romii,911525c9-04f7-4213-8781-a9842216c2d8
