In [1]:
import os
import pandas as pd
import numpy as np
from functools import reduce
import matplotlib.pyplot as plt

# input/output

In [2]:
# input
# directory of count files to combine; per the original note these are reads
# processed by the nextflow pipeline "ReadProcessing_Nextflow.config"
counts_dir = './test_read_processing/counts_withUMI/'
# barcode design table with the SphI RE site and UMIs appended (ergosterol pool only)
barcode_table = './test_data/barcode_table_with_6umi.tsv'
# conditions to process, always given as a list (one or more entries);
# each entry is used to select sample columns by name
conditions = ['SCM']
# output
# combined counts table for all reads (tab-separated)
counts_table = 'all_counts_umi.tsv'
# combined counts table for all MAPPED reads (tab-separated)
mapped_counts = './mapped_counts_with_umi.tsv'

In [3]:
# generate pandas DataFrame and export combined counts table
# One (barcode, <sample>) frame is read per count file found in counts_dir.
counts_df_list = []
for file_name in sorted(os.listdir(counts_dir)):
    counts_path = os.path.join(counts_dir, file_name)
    # skip sub-directories and hidden entries (e.g. .ipynb_checkpoints, .DS_Store)
    if file_name.startswith('.') or not os.path.isfile(counts_path):
        continue
    # the sample column is named after the file, minus its extension
    sample_name = os.path.splitext(file_name)[0]
    # header=0 together with names= replaces the header row found in the file;
    # the first column (labelled 'index' here) is then dropped by .iloc[:, 1:]
    counts_file = pd.read_csv(counts_path, sep='\t', header=0,
                              names=['index', 'barcode', sample_name]).iloc[:, 1:]
    counts_df_list.append(counts_file)
if not counts_df_list:
    # fail with a clear message instead of reduce()'s "empty sequence" TypeError
    raise FileNotFoundError('no count files found in ' + counts_dir)
# outer-join all samples on barcode; a barcode missing from a sample gets a count of 0
all_counts_df = reduce(lambda x, y: pd.merge(x, y, on='barcode', how='outer'), counts_df_list).fillna(0)
# filter for conditions for analysis: keep 'barcode' plus every sample column
# whose name matches one of the entries in conditions (entries are used as regex)
all_counts_df = all_counts_df.filter(regex='(barcode|' + '|'.join(conditions) + ')', axis=1)
all_counts_df.to_csv(counts_table, sep='\t')

In [4]:
# per-sample read totals: sum every column after the leading 'barcode' column
sample_totals = all_counts_df.iloc[:, 1:].sum()
sample_totals

P1_SCM_T1_F4_counts    38503.0
P1_SCM_T1_F5_counts    26891.0
P1_SCM_T1_F6_counts    24749.0
P1_SCM_T2_F4_counts    38789.0
P1_SCM_T2_F5_counts    29706.0
P1_SCM_T2_F6_counts    25393.0
P1_SCM_T3_F4_counts    37532.0
P1_SCM_T3_F5_counts    26969.0
P1_SCM_T3_F6_counts    24598.0
P1_SCM_T4_F4_counts    38754.0
P1_SCM_T4_F5_counts    24152.0
P1_SCM_T4_F6_counts    23773.0
P1_SCM_T5_F4_counts    40565.0
P1_SCM_T5_F5_counts    23916.0
P1_SCM_T5_F6_counts    24611.0
P1_SCM_T6_F4_counts    35319.0
P1_SCM_T6_F5_counts    29732.0
P1_SCM_T6_F6_counts    27268.0
dtype: float64

In [5]:
# number of rows (one per barcode sequence) in the combined counts table
all_counts_df.shape[0]

29233

# map perfect barcodes to barcode table

In [6]:
# load the barcode design table used to map reads that exactly match a designed
# barcode and UMI; only the last two columns of the file are kept
# (the chain is kept as one expression so no reference to the full table lingers)
barcode_table_df = (
    pd.read_csv(barcode_table, sep='\t')
    .iloc[:, -2:]
)
barcode_table_df.columns = ['oligo_name', 'barcode']
def reverse_complement(dna):
    """Return the reverse complement of a DNA sequence.

    The sequence is upper-cased before lookup, so lower- or mixed-case input
    yields an upper-case result. The ambiguity code 'N' is complemented to 'N'.

    Parameters
    ----------
    dna : str
        Sequence made of the characters A, C, G, T or N (any case).

    Returns
    -------
    str
        Upper-case reverse complement; '' for an empty sequence.

    Raises
    ------
    KeyError
        If the sequence contains any other character.
    """
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    # reversed() walks the string directly; no intermediate list is needed
    return ''.join(complement[base] for base in reversed(dna.upper()))
# replace each designed barcode by its reverse complement
# NOTE(review): presumably so the table matches the orientation of the counted reads - confirm
barcode_table_df['barcode'] = barcode_table_df['barcode'].map(reverse_complement)
# barcode_id joins the 2nd and 3rd '_'-separated fields of the oligo name,
# e.g. 'ergosterol_001_001_UMI_1' -> '001_001'
name_fields = barcode_table_df['oligo_name'].str.split('_', expand=True)
barcode_table_df['barcode_id'] = name_fields[1] + '_' + name_fields[2]

In [7]:
# preview the first 20 rows of the imported barcode table
barcode_table_df.iloc[:20]

Unnamed: 0,oligo_name,barcode,barcode_id
0,ergosterol_001_001_UMI_1,GAGCCACATGCATGCTGTCAATGCGAT,001_001
1,ergosterol_001_001_UMI_2,TTGCCACAGGCATGCTGTCAATGCGAT,001_001
2,ergosterol_001_001_UMI_4,GATCCTCTGGCATGCTGTCAATGCGAT,001_001
3,ergosterol_001_001_UMI_6,AGTCCTCTAGCATGCTGTCAATGCGAT,001_001
4,ergosterol_001_001_UMI_13,GAATCACACGCATGCTGTCAATGCGAT,001_001
5,ergosterol_001_001_UMI_14,TTCACGCGTGCATGCTGTCAATGCGAT,001_001
6,ergosterol_001_002_UMI_1,GAGCCACATGCATGCGTTGTGTAGATT,001_002
7,ergosterol_001_002_UMI_2,TTGCCACAGGCATGCGTTGTGTAGATT,001_002
8,ergosterol_001_002_UMI_4,GATCCTCTGGCATGCGTTGTGTAGATT,001_002
9,ergosterol_001_002_UMI_6,AGTCCTCTAGCATGCGTTGTGTAGATT,001_002


In [8]:
# attach the sample counts to the barcode/UMI table with a left join on 'barcode':
# every row of the barcode table is kept, and all missing values in the result
# (e.g. designed barcodes absent from the counts) are replaced by 0
perfect_read_count = pd.merge(barcode_table_df, all_counts_df, on='barcode', how='left')
perfect_read_count = perfect_read_count.fillna(0)
perfect_read_count.to_csv(mapped_counts, sep='\t')

In [9]:
# grand total over all sample columns, i.e. everything after the three
# annotation columns oligo_name, barcode and barcode_id
mapped_total = perfect_read_count.iloc[:, 3:].sum().sum()
print("total reads mapped:")
print(int(mapped_total))

total reads mapped:
495002
