In [1]:
import os
import errno
import gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# inputs

In [2]:
# directory the generated tables are written to, and directory holding the input tables
work_dir = './test_read_processing/'
data_dir = './test_data/'
# count table; no directory prefix, so it is resolved against the current working directory
COUNTS_PATH = 'mapped_counts_with_umi.tsv'
# sample information and pool assignment tables live in data_dir
SAMPLE_TABLE_PATH = f'{data_dir}TEST_SAMPLE_INFO.tsv'
POOL_TABLE_PATH = f'{data_dir}pool_assignment.xlsx'

# run parameters: pool, conditions and pool design

In [3]:
# pool to process and the column of the pool assignment table that lists its wells
POOL, POOL_DESIGN = 'P1', 'ergosterol'
# conditions to process for this pool
CONDS = ['SCM']

# load pool well numbers

In [4]:
# pool assignment table as {column name: list of column values}
pool_dict = pd.read_excel(POOL_TABLE_PATH).to_dict(orient='list')
# keep the positive entries of the POOL_DESIGN column, then always add well 192
# NOTE(review): 192 is presumably a control well shared by every pool - confirm
well_list = list(filter(lambda well: well > 0, pool_dict[POOL_DESIGN])) + [192]

# generate paired samplesheet and counts file for deseq2

In [5]:
def deseq_input(COUNTS_PATH,SAMPLE_TABLE_PATH,POOL,COND):
    """Write the DESeq2 counts tables and sample sheet for one pool/condition.

    Three tab-separated files are written into the notebook-level ``work_dir``
    (created if it does not exist yet):

    * ``DESEQ2_COUNTS_<POOL>_<COND>.tsv``: counts per row of the input table
      (UMI level), restricted to the wells in the notebook-level ``well_list``.
    * ``DESEQ2_STACKED_COUNTS_<POOL>_<COND>.tsv``: the same counts summed over
      all rows that share the 2nd and 3rd '_'-separated field of ``oligo_name``
      (oligo level).
    * ``DESEQ2_SAMPLES_<POOL>_<COND>.tsv``: the sample sheet, indexed by
      ``POOL_COND_TIMEPOINT_FLASK``.

    Parameters
    ----------
    COUNTS_PATH : str
        Tab-separated counts table with an ``oligo_name`` column; the 2nd
        '_'-separated field of ``oligo_name`` must be the integer well number.
        Only columns whose name contains ``<POOL>_<COND>`` are used.
    SAMPLE_TABLE_PATH : str
        Tab-separated sample table; its first five columns are kept and must
        include the string columns ``POOL``, ``COND``, ``TIMEPOINT``, ``FLASK``.
    POOL : str
        Pool identifier; matched literally (not as a regular expression).
    COND : str
        Condition identifier; matched literally (not as a regular expression).

    Returns
    -------
    None
    """
    sample_tag = POOL + '_' + COND
    counts_suffix = '_counts'

    # the output directory must exist before to_csv is called
    if work_dir:
        os.makedirs(work_dir, exist_ok=True)

    # output files
    DESEQ2_COUNTS = work_dir + 'DESEQ2_COUNTS_' + sample_tag + '.tsv'
    DESEQ2_COUNTS_STACKED = work_dir + 'DESEQ2_STACKED_COUNTS_' + sample_tag + '.tsv'
    DESEQ2_SAMPLES = work_dir + 'DESEQ2_SAMPLES_' + sample_tag + '.tsv'

    # load counts table, keeping only the columns of this pool/condition;
    # `like` is a plain substring match, so POOL/COND cannot act as regex syntax
    all_counts = pd.read_csv(COUNTS_PATH, sep='\t', index_col='oligo_name').filter(like=sample_tag)
    # drop the trailing '_counts' so column names equal the sample sheet index;
    # str.rstrip('_counts') must not be used here: it strips any trailing run of
    # the characters '_', 'c', 'o', 'u', 'n', 't', 's' and can eat the sample name
    all_counts.columns = [
        name[:-len(counts_suffix)] if name.endswith(counts_suffix) else name
        for name in all_counts.columns
    ]

    # filter oligos by wells in pool (well number = 2nd field of the oligo name)
    wells = all_counts.index.str.split('_').str[1].astype('int')
    all_counts = all_counts[wells.isin(well_list)]

    # write counts at UMI level
    all_counts.to_csv(DESEQ2_COUNTS, sep='\t')

    # collapse barcodes to oligo level: rename every row to '<field 2>_<field 3>'
    # of its oligo name and sum the rows that end up with the same name
    stacked_counts = all_counts.reset_index()
    name_fields = stacked_counts['oligo_name'].str.split('_', expand=True, n=3)
    stacked_counts['oligo_name'] = name_fields.iloc[:, 1:3].agg('_'.join, axis=1)
    stacked_counts = stacked_counts.groupby('oligo_name', sort=False).sum()
    # write counts at oligo level
    stacked_counts.to_csv(DESEQ2_COUNTS_STACKED, sep='\t')

    # load sample generation time table (first five columns only)
    sample_table = pd.read_csv(SAMPLE_TABLE_PATH, sep='\t').iloc[:, :5]
    # keep the samples of this pool AND condition, mirroring the column
    # selection of the counts table so both outputs describe the same samples
    wanted = (sample_table['POOL'] == POOL) & (sample_table['COND'] == COND)
    sample_table = sample_table[wanted]
    sample_table.index = sample_table[['POOL','COND','TIMEPOINT','FLASK']].agg('_'.join, axis=1)
    # write sample information
    sample_table.to_csv(DESEQ2_SAMPLES, sep='\t')

In [6]:
# write the DESeq2 input files for POOL once per condition listed in CONDS
for COND in CONDS:
    deseq_input(COUNTS_PATH,SAMPLE_TABLE_PATH,POOL,COND)