# Note
This document provides information about how to process recording data from fastq files.\
Briefly, barcodes were extracted and counted from the raw fastq files by regular-expression pattern matching. \
There are two types of tape used in the paper: the endogenous HEK3 locus (HEK3-Tape) in HEK293T and K562 cells, and a synthetic HEK3 locus integrated into the genome via piggyBac (synHEK3-Tape). \
Depending on the sequencing structure, the barcodes can be extracted with one of two patterns (see below).\
Insertions of 3 bp, 5 bp and 6 bp were used in different cases. The pattern length should cover all barcodes, including unedited Tapes.

In [1]:
import gzip
import subprocess
import os,sys,csv,re
from optparse import OptionParser,OptionGroup
import pandas as pd
# Count Tape barcodes found between the CATCA and CGTGC flanks.
# HEK3 R2 (HEK293, K562) or synHEK R1 (mESC,K562)
path = '../Figure'  # data dir here (works with or without a trailing slash)

# Read whose file name tag selects the fastq files to process.
# 'R1' is for synHEK3-TAPE; set this to 'R2' for HEK3-TAPE.
read_tag = 'R1'
samples = [s for s in os.listdir(path) if 'fastq.gz' in s and read_tag in s]

# Shell pipeline applied to each decompressed fastq stream:
#   1. keep the 2nd line (the sequence) of every 4-line fastq record,
#   2. print the 0-6 nt located between the CATCA and CGTGC flanks
#      (RSTART+5 skips the 5-nt left flank, RLENGTH-10 drops both flanks),
#   3. collapse identical barcodes into counts, highest count first,
#   4. emit "barcode<TAB>count<TAB>percent of all matched reads".
barcode_pipeline = (
    " | awk '{if(NR%4==2) print $0} '"
    " | awk ' match($1,/CATCA([ATCG]{0,6})CGTGC/)"
    " {print substr($1, RSTART+5,RLENGTH-10)}'"
    " | sort -k1n | uniq -c | sort -k1nr"
    " | awk '{a[NR]=$2;x+=(b[NR]=$1)}"
    "END{while(++i<=NR) print a[i]\"\\t\"b[i]\"\\t\"100*b[i]/x}'"
)

# Start one pipeline per sample so that they run concurrently.
jobs = []
for s in samples:
    # os.path.join inserts the directory separator that plain string
    # concatenation with '../Figure' would leave out.
    fastq = os.path.join(path, s)
    out_tsv = os.path.join(path, s.split('_S')[0] + '_bc_count.tsv')
    cmd = "zcat " + fastq + barcode_pipeline + " > " + out_tsv
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    jobs.append((s, p))

# The command no longer ends in '&', so wait() really blocks until each
# output table has been written.
for s, p in jobs:
    # The shell reports the exit status of the last pipeline stage only.
    if p.wait() != 0:
        print('WARNING: barcode counting failed for ' + s)

In [20]:
import gzip
import subprocess
import os,sys,csv,re
from optparse import OptionParser,OptionGroup
import pandas as pd
# Count Tape barcodes found between the GCACG and TGATG flanks.
# HEK3 R1 (HEK293, K562) or synHEK R2 (mESC,K562)
path = '../Figure'  # data dir here (works with or without a trailing slash)

# Read whose file name tag selects the fastq files to process.
# 'R1' is for HEK3; set this to 'R2' for synHEK.
read_tag = 'R1'
samples = [s for s in os.listdir(path) if 'fastq.gz' in s and read_tag in s]

# Shell pipeline applied to each decompressed fastq stream:
#   1. keep the 2nd line (the sequence) of every 4-line fastq record,
#   2. print the 0-6 nt located between the GCACG and TGATG flanks
#      (RSTART+5 skips the 5-nt left flank, RLENGTH-10 drops both flanks),
#   3. collapse identical barcodes into counts, highest count first,
#   4. emit "barcode<TAB>count<TAB>percent of all matched reads".
barcode_pipeline = (
    " | awk '{if(NR%4==2) print $0} '"
    " | awk ' match($1,/GCACG([ATCG]{0,6})TGATG/)"
    " {print substr($1, RSTART+5,RLENGTH-10)}'"
    " | sort -k1n | uniq -c | sort -k1nr"
    " | awk '{a[NR]=$2;x+=(b[NR]=$1)}"
    "END{while(++i<=NR) print a[i]\"\\t\"b[i]\"\\t\"100*b[i]/x}'"
)

# Start one pipeline per sample so that they run concurrently.
jobs = []
for s in samples:
    # os.path.join inserts the directory separator that plain string
    # concatenation with '../Figure' would leave out.
    fastq = os.path.join(path, s)
    out_tsv = os.path.join(path, s.split('_S')[0] + '_bc_count.tsv')
    cmd = "zcat " + fastq + barcode_pipeline + " > " + out_tsv
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    jobs.append((s, p))

# The command no longer ends in '&', so wait() really blocks until each
# output table has been written.
for s, p in jobs:
    # The shell reports the exit status of the last pipeline stage only.
    if p.wait() != 0:
        print('WARNING: barcode counting failed for ' + s)

## Processing DNA Typewriter data
DTT data were first aligned to the reference and then analyzed with a custom Python script to extract and count the barcodes.

In [None]:
# Align the R1 reads of every sample to DTT.fasta and keep, for reads
# whose reference name is 5X_TAPE, columns 1, 3 and 10 of the SAM record
# (read name, reference name, read sequence) as a tab-separated .txt file.
path = '../Figure4/DTT/' # or pulse
samples = [s for s in os.listdir(path) if '.gz' in s and 'R1' in s]
# don't forget to bwa index DTT.fasta

# Start one alignment pipeline per sample so that they run concurrently.
jobs = []
for s in samples:
    reads = os.path.join(path, s)
    out_txt = os.path.join(path, s.split('_S')[0] + '.txt')
    # -F 0x904 drops unmapped (0x4), secondary (0x100) and supplementary
    # (0x800) alignments before the awk filter sees them.
    cmd = ('bwa mem ../analysis/DTT.fasta ' + reads
           + ' | samtools view -F 0x904'
           + ' | awk \'$3 == "5X_TAPE" {{print $1"\\t"$3"\\t"$10; next}}\''
           + ' > ' + out_txt)
    # stderr is folded into the discarded stdout, so tool logs are silenced.
    p = subprocess.Popen(cmd, stdout=subprocess.DEVNULL,
                         stderr=subprocess.STDOUT, shell=True)
    jobs.append((s, p))

# The command no longer ends in '&', so wait() really blocks until every
# .txt file is complete and safe to read.
for s, p in jobs:
    # The shell reports the exit status of the last pipeline stage only.
    if p.wait() != 0:
        print('WARNING: alignment pipeline failed for ' + s)

In [None]:
# Print one background analysis command per pulse sample.
path = '../Figure4/pulse/'

# A sample name is a '.txt' file name in `path` cut at its first '.txt';
# files whose name contains 'dox' are left out.
samples = []
for fname in os.listdir(path):
    if 'dox' in fname or '.txt' not in fname:
        continue
    samples.append(fname.split('.txt')[0])
samples.sort()

pipeline = ''
script = 'python ../analysis/PE_analysis_5XTAPE_wnt_dox.py'
for sample in samples:
    # The command is only printed here; nothing is executed by this cell.
    cmd = ' '.join((script, sample, path, '&'))
    print(cmd)