In [3]:
import glob 
import os
import numpy as np
import pandas as pd
from IPython.core.pylabtools import figsize
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size for the plots in this notebook: 12 wide by 6 tall.
figsize(12, 6)

## Summarize coverage files

In [5]:
# Paths of all *.bam.cov files under data/batch_4/coverage.
cov_files = glob.glob(
    os.path.join('data', 'batch_4', 'coverage', '*.bam.cov')
)

In [19]:
# Scratch check of the name parsing: dropping the directory and both trailing
# extensions (.bam.cov) from the third path should leave just the identifier.
os.path.splitext(os.path.splitext( os.path.basename(cov_files[2]))[0])[0]

'CMSTILL10_0036'

In [10]:
def parse_cov_file(filename):
    """Total the coverage column of one coverage file.

    The file is read as tab-separated text with no header row, and the
    values in its fourth column (position 3) are added together.

    Parameters
    ----------
    filename : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    The sum of the fourth column.
    """
    table = pd.read_csv(filename, sep='\t', header=None)
    fourth_column = table.iloc[:, 3]
    return fourth_column.sum()

In [20]:
def parse_name(filename):
    """Return the base name of ``filename`` with its last two extensions removed.

    Example: 'data/batch_4/coverage/CMSTILL10_0036.bam.cov' -> 'CMSTILL10_0036'.
    """
    stem = os.path.basename(filename)
    # Two passes: one per trailing extension (e.g. '.cov', then '.bam').
    for _ in range(2):
        stem = os.path.splitext(stem)[0]
    return stem

In [30]:
# One row per coverage file: the identifier parsed from the file name and the
# summed coverage read from the file. Column order is fixed as IID, cov.
cov_df = pd.DataFrame(
    {
        'IID': list(map(parse_name, cov_files)),
        'cov': list(map(parse_cov_file, cov_files)),
    },
    columns=['IID', 'cov'],
)
cov_df.head()

Unnamed: 0,IID,cov
0,CMSHERW94S_0009,2831500
1,CMUW10_0007,440163
2,CMSTILL10_0036,257778
3,CMKALA03_0049,864623
4,CMKALA03_0008,3210738


In [25]:
# Save the coverage totals to paper/tables/coverage.txt as a tab-separated
# table, without the row index.
cov_df.to_csv(
    os.path.join('paper', 'tables', 'coverage.txt'),
    sep='\t',
    index=False,
)

In [31]:
# Load the tab-separated per-individual table from paper/tables/hetmiss.txt
# and preview its first rows.
hetmiss = pd.read_csv(
    os.path.join('paper', 'tables', 'hetmiss.txt'),
    sep='\t',
)
hetmiss.head()

Unnamed: 0,FID,CLUSTER,POPNAME,YEAR,TIMING,REGION,IID,F_MISS,O_het,F
0,1,1,Hamma Hamma,2010,Summer,Hood Canal,CMHAMM10_0002,0.1057,0.289566,0.1323
1,1,1,Hamma Hamma,2010,Summer,Hood Canal,CMHAMM10_0005,0.2353,0.270723,0.1892
2,1,1,Hamma Hamma,2010,Summer,Hood Canal,CMHAMM10_0008,0.1362,0.304202,0.08892
3,1,1,Hamma Hamma,2010,Summer,Hood Canal,CMHAMM10_0011,0.2086,0.277489,0.1679
4,1,1,Hamma Hamma,2010,Summer,Hood Canal,CMHAMM10_0012,0.1541,0.298913,0.1055


In [34]:
# Join the coverage totals to the per-individual table on the individual ID.
# The key and join type are spelled out (they match what the implicit merge
# did, since IID is the only column the two frames share) so that a column
# added to either table later cannot silently change the join key.
# The inner join keeps only individuals present in both tables.
table_1_raw = pd.merge(cov_df, hetmiss, on='IID', how='inner')
table_1_raw.head()

Unnamed: 0,IID,cov,FID,CLUSTER,POPNAME,YEAR,TIMING,REGION,F_MISS,O_het,F
0,CMSHERW94S_0009,2831500,5,5,Sherwood River Summer,1994,Summer,South Puget Sound,0.0267,0.333692,0.004884
1,CMUW10_0007,440163,10,10,Hoodsport Hatchery,2010,Fall,Hood Canal,0.1674,0.297075,0.1054
2,CMKALA03_0049,864623,2,2,Nisqually Kalama Creek,2003,Winter,South Puget Sound,0.1226,0.243313,0.2707
3,CMKALA03_0008,3210738,2,2,Nisqually Kalama Creek,2003,Winter,South Puget Sound,0.03081,0.33644,-0.003084
4,CMSHERW94S_0087,2313801,5,5,Sherwood River Summer,1994,Summer,South Puget Sound,0.07597,0.32766,0.02336


### Calculate percent called genotypes
1 - F_MISS

In [None]:
# Genotyping rate is the complement of the missing fraction: 1 - F_MISS.
table_1_raw['Genotyping rate'] = table_1_raw['F_MISS'].rsub(1)

In [90]:
# Relabel the coverage column with the heading used in the final table.
table_1_raw = table_1_raw.rename(
    columns={'cov': 'Aligned sequences'},
)

In [91]:
# Group individuals by collection name.
table1_grouped = table_1_raw.groupby('POPNAME')
# Select the two numeric columns *before* averaging. Aggregating the whole
# frame would also push the string columns (IID, TIMING, REGION) through
# np.average, which newer pandas versions raise on instead of silently
# dropping them.
table1_grouped[['Aligned sequences', 'Genotyping rate']].aggregate(np.average)


Unnamed: 0_level_0,Aligned sequences,Genotyping rate
POPNAME,Unnamed: 1_level_1,Unnamed: 2_level_1
Hamma Hamma,1419541,0.868075
Hoodsport Hatchery,509422,0.848213
Lilliwaup Creek,2760125,0.983305
Nisqually Kalama Creek,2270022,0.96262
Sherwood River Fall,3235188,0.957814
Sherwood River Summer,2504974,0.910072
Skookum Creek,1644932,0.954064
Snohomish River,1135085,0.942091
Squakum Creek,999084,0.86379
Stillaguamish River,710538,0.907182


In [106]:
# NOTE: this attempt fails -- a DataFrameGroupBy does not support item
# assignment (see the TypeError in the output). The per-group count is instead
# added to the summary table with table1.insert(...) in a later cell.
table1_grouped['n'] = table1_grouped.count()['FID']

TypeError: 'DataFrameGroupBy' object does not support item assignment

In [116]:
# Per-collection mean and standard deviation of the two reported metrics.
# String aliases select pandas' own mean/std, which is what passing
# np.mean/np.std resolved to, without relying on that translation.
table1 = table1_grouped[['Aligned sequences', 'Genotyping rate']].agg(['mean', 'std'])
# Truncate the read-count standard deviation to whole reads. This replaces
# np.int, which was only an alias of the builtin int and has been removed from
# NumPy. Assigning the column directly (rather than through .loc) lets it take
# the integer dtype instead of being written back into the float column.
table1[('Aligned sequences', 'std')] = table1[('Aligned sequences', 'std')].astype('int64')
# Number of individuals per collection, placed as the first column.
table1.insert(0, 'n', table1_grouped.count()['FID'])

In [117]:
# Per-column cell formatters for the LaTeX export, keyed by the
# (top level, sub level) column labels of table1.
formatters = {
    # Read counts: whole numbers with thousands separators. The '.0f' keeps
    # the output free of decimals even when the values arrive as floats.
    ('Aligned sequences', 'std'): '{:,.0f}'.format,
    ('Aligned sequences', 'mean'): '{:,.0f}'.format,
    # Genotyping rates: two decimal places.
    ('Genotyping rate', 'mean'): lambda x: '%10.2f' % x,
    ('Genotyping rate', 'std'): lambda x: '%10.2f' % x,
}

# The parenthesised form is valid both as a Python 2 print statement and as a
# Python 3 print() call; the bare `print expr` form is a SyntaxError on Python 3.
print(table1.to_latex(formatters=formatters, index_names=False))

\begin{tabular}{lrrrrr}
\toprule
{} &   n & Aligned sequences &           & Genotyping rate &            \\
{} &     &              mean &       std &            mean &        std \\
\midrule
Hamma Hamma            &  20 &         1,419,541 & 1,427,760 &            0.87 &       0.08 \\
Hoodsport Hatchery     &   8 &           509,422 &   148,391 &            0.85 &       0.08 \\
Lilliwaup Creek        &  20 &         2,760,125 &   999,141 &            0.98 &       0.01 \\
Nisqually Kalama Creek &  17 &         2,270,022 & 1,432,866 &            0.96 &       0.03 \\
Sherwood River Fall    &  32 &         3,235,188 &   966,091 &            0.96 &       0.04 \\
Sherwood River Summer  &  31 &         2,504,974 & 1,183,089 &            0.91 &       0.07 \\
Skookum Creek          &  11 &         1,644,932 &   637,844 &            0.95 &       0.09 \\
Snohomish River        &  14 &         1,135,085 &   495,888 &            0.94 &       0.07 \\
Squakum Creek          &   8 &           999,084

## Replace the two header rows of the generated table with:
{} & {} & \multicolumn{2}{c}{Aligned sequences} & \multicolumn{2}{c}{Genotyping rate}\\
Collection &  n   &            mean &        std &              mean &       std \\

\caption{\label{tab:table-name}Sample sizes, sequencing and genotyping rates}
