In [1]:
from pathlib import Path
from datetime import datetime

import pandas as pd
from jinja2 import Environment, FileSystemLoader

import lib.db as db
import lib.util as util

In [2]:
# Database connection handed to every query helper in this notebook.
CXN = db.connect()

# Captured once so the report body and the file name carry the same timestamp.
NOW = datetime.now()

# Jinja2's FileSystemLoader is given a plain string search path.
TEMPLATE_DIR = str(Path('.', 'reports'))
OUTPUT_DIR = Path('..', 'output')

# The template is fixed; the rendered report is stamped with today's date.
REPORT_TEMPLATE = 'sample_plates_report.html'
REPORT_NAME = f'sample_plates_report_{NOW:%Y-%m-%d}.html'
REPORT_PATH = OUTPUT_DIR.joinpath(REPORT_NAME)

In [3]:
def print_report(cxn):
    """Render the sample plates report and write it to REPORT_PATH.

    Loads REPORT_TEMPLATE from TEMPLATE_DIR, gathers the plate wells, the
    plate summaries, the genus coverage table and the list of plate samples
    missing from the samples table, and renders them into one HTML file.

    Args:
        cxn: Database connection accepted by ``pandas.read_sql``.
    """
    env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))
    template = env.get_template(REPORT_TEMPLATE)

    wells = get_wells(cxn)
    samples = pd.read_sql('SELECT * FROM samples', cxn)

    report = template.render(
        now=NOW,
        wells=get_plate_wells(wells),
        plates=get_plates(wells).to_dict(orient='records'),
        genera=get_genus_coverage(cxn, samples).to_dict(orient='records'),
        missing=get_missing(cxn, wells, samples).to_dict(orient='records'))

    # A fresh checkout may not have the output directory yet.
    REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)

    # Jinja2 reads templates as UTF-8 by default; write the result the same
    # way instead of relying on the platform's default encoding.
    with open(REPORT_PATH, 'w', encoding='utf-8') as out_file:
        out_file.write(report)

In [4]:
def get_plates(wells):
    """Return one row per distinct plate, taken from the per-well data.

    Only the plate-level columns are kept; rows that are identical across
    those columns are collapsed into the first occurrence.
    """
    plate_columns = ['plate_id', 'entry_date', 'local_id', 'protocol', 'notes']
    return wells[plate_columns].drop_duplicates()

In [5]:
def get_plate_wells(wells):
    """Map each plate ID to the list of its well records for the report.

    Wells are grouped by the 'group' column. Each group is keyed by the
    'plate_id' of its first row, and its rows are turned into dicts with
    missing values replaced by empty strings.
    """
    return {
        rows['plate_id'].iloc[0]: rows.fillna('').to_dict(orient='records')
        for _, rows in wells.groupby('group')
    }

In [6]:
def get_wells(cxn):
    """Read every plate well and order the wells by plate number and position.

    A numeric 'group' column is derived from 'local_id' by stripping all
    non-digit characters, so plates sort numerically rather than
    lexically (plate 2 before plate 10).

    Args:
        cxn: Database connection accepted by ``pandas.read_sql``.

    Returns:
        DataFrame of the 'plates' table plus the 'group' column, sorted by
        'group', 'row' and 'col'. Original row labels are kept.

    Raises:
        ValueError: If a 'local_id' contains no digits or is missing, since
            it then cannot be converted to an integer.
    """
    wells = pd.read_sql('SELECT * FROM plates', cxn)
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would strip nothing.
    digits_only = wells.local_id.str.replace(r'\D+', '', regex=True)
    wells['group'] = digits_only.astype('int')
    return wells.sort_values(['group', 'row', 'col'])

In [7]:
def get_missing(cxn, wells, samples):
    """Return the plate wells whose sample is absent from the samples table.

    A well is reported when its 'sample_id' does not occur in
    ``samples.sample_id`` and ``util.split_uuids`` maps that ID to a truthy
    value. ``cxn`` is accepted but not used here.
    """
    well_ids = wells.sample_id
    known = well_ids.isin(samples.sample_id)
    # NOTE(review): presumably split_uuids yields a truthy result only for
    # UUID-like IDs -- confirm against lib.util.
    uuid_like = well_ids.map(util.split_uuids)
    return wells[~known & uuid_like]

In [8]:
def get_genus_coverage(cxn, samples):
    """Summarise how many taxa have an imaged sample, per family and genus.

    A taxon counts as imaged when its 'scientific_name' matches a sample
    whose 'sample_id' appears in the 'images' table.

    Args:
        cxn: Database connection accepted by ``pandas.read_sql``.
        samples: DataFrame with at least 'sample_id' and 'scientific_name'.

    Returns:
        DataFrame indexed 0..n-1 with columns 'total', 'imaged', 'genus',
        'family' and 'percent'. Each family contributes a subtotal row
        (genus '') followed by its genus rows (family blanked for display),
        and a final '~Total~' row covers all taxa.
    """
    taxons = pd.read_sql(
        'SELECT family, genus, scientific_name FROM taxons', cxn)
    images = pd.read_sql('SELECT * FROM images', cxn)

    sample_has_image = samples.sample_id.isin(images.sample_id)
    sampled_image = samples[sample_has_image]
    is_imaged = taxons.scientific_name.isin(sampled_image.scientific_name)

    # One row per taxon: 'total' counts it, 'imaged' flags it.
    taxons = taxons.assign(total=1, imaged=is_imaged.astype(int))

    # String aggregator names; passing the builtin sum is deprecated.
    aggs = {'total': 'sum', 'imaged': 'sum'}

    families = taxons.groupby('family').agg(aggs).reset_index()
    families['genus'] = ''

    genera = taxons.groupby(['family', 'genus']).agg(aggs).reset_index()

    total = pd.DataFrame([{
        'family': '~Total~',
        'genus': '',
        'imaged': int(taxons.imaged.sum()),
        'total': int(taxons.total.sum())}])

    # DataFrame.append was removed in pandas 2.0; concatenate all parts once.
    coverage = pd.concat([families, genera, total], ignore_index=True)

    # Family subtotals (genus '') sort ahead of their genera, and the
    # '~Total~' label sorts after alphabetic family names.
    coverage = coverage.sort_values(['family', 'genus'])
    coverage = coverage.reset_index(drop=True)

    # Show the family name only on its subtotal row.
    has_genus = coverage.genus != ''
    coverage.loc[has_genus, 'family'] = ''

    coverage['percent'] = coverage['imaged'] / coverage['total'] * 100.0
    return coverage[['total', 'imaged', 'genus', 'family', 'percent']]

In [9]:
# Build the report and write it to REPORT_PATH using the shared connection.
print_report(CXN)