In [1]:
import os
import numpy as np
import pandas as pd
from difflib import SequenceMatcher
import gzip
import importlib.util
import reverse_complement as rc

In [2]:
''' Barcodes generated using barcode_design_rk_2
For barcode generation:
    - Length: 10nt
    - Edit distance >= 3
    
Example: 
        python barcode_design_rk_2.py -l 10 -e 3 -NB MaxTenBarcodes -n 8700 -i BlankBarcodes.txt
where BlankBarcodes is an empty .txt file.  Followed by:
        python barcode_design_rk_2.py -l 10 -e 3 -NB 9kTenBarcodes -n 300 -i MaxTenBarcodes.txt     
'''

# Read in barcode .txt files
df_txt1 = pd.read_fwf('/Users/rchin/PycharmProjects/IPS_Primer_Generator/MaxTenBarcodes.txt', 
                      header=None).rename(columns={0: "barcode"})
df_txt2 = pd.read_fwf('/Users/rchin/PycharmProjects/IPS_Primer_Generator/9kTenBarcodes.txt', 
                      header=None).rename(columns={0: "barcode"})
#repeat for more barcodes files

df_barcode = pd.concat([df_txt1,df_txt2]).reset_index(drop=True)


In [3]:
# QC zone
display (df_txt2.tail())
display(df_barcode.tail())
print(len(df_barcode))

Unnamed: 0,barcode
295,TCTCGTACGT
296,TTATGGCTAA
297,GCAAAGATGT
298,CGGACAATGC
299,AGAATCCGCC


Unnamed: 0,barcode
8995,TCTCGTACGT
8996,TTATGGCTAA
8997,GCAAAGATGT
8998,CGGACAATGC
8999,AGAATCCGCC


9000


In [4]:
''' Filtering Rules

Post-generation filter:
    - Must have at least 3 out of 4 nucleotide types.  {nt_type}
    - Must not have more than 2 repeating nucleotides.
    - GC% must be between 0.125 and 0.875%.
    - Must not start with GG.
'''

def nt_type(df):
    bases = ["A", "C", "G", "T"]

    for b in bases:
        df[b] = df['barcode'].str.count(b)
        
    # Add flag for nt_type
    df['nt_flag'] = [True if x > 2 else False for x in np.sum(df[bases].values == 0,1)]
        
    return df

def threepeat(df):
    # Add threepeat_flag = True if a nucleotide is repeated at least 3 times
#     triplets = ['AAA', 'CCC', 'GGG', 'TTT']
    triplets = 'AAA|CCC|GGG|TTT'

    df['threepeat_flag'] = df['barcode'].str.contains(triplets, regex=True)

    return df
        

def GCpct(df):
    # Add GCpct_flag = True if GC content is between 0.125 and 0.875
    bases = ["A", "C", "G", "T"]
    GC = ["C", "G"]
#     df['GC_pct'] = np.sum(df[GC].values) / np.sum(df[bases].values)
    df['GC_pct'] = (df['C'] + df['G'])/(df['A']+df['C']+df['G']+df['T'])
    
#     df['GCpct_flag'] = [True if (x > 0.125 or x < 0.875) else False for x in df['GC_pct']]
    df['GCpct_flag'] = df['GC_pct'].between(0.125,0.875,inclusive=True)
    
    return df

def no_GG(df):
    # Add GG_flag if barcode starts with GG
    
    df['GG_flag'] = df['barcode'].str[:2] == 'GG'
    
    return df
    
df_nt = nt_type(df_barcode)
df_tp = threepeat(df_nt)
df_gcpct = GCpct(df_tp)
df_GG = no_GG(df_gcpct)


# display(df_tp.sort_values(by="barcode").head(20)
# display(df_nt['barcode'].head(10).str.count('A'))
# display(df_GG.sort_values(by='barcode', ascending=False).head(20))
display(df_GG.head(20))
print(len(df_GG))

Unnamed: 0,barcode,A,C,G,T,nt_flag,threepeat_flag,GC_pct,GCpct_flag,GG_flag
0,CCAGCTAACT,3,4,1,2,False,False,0.5,True,False
1,GGTGCAGCTA,2,2,4,2,False,False,0.6,True,True
2,TGTCGCAGAT,2,2,3,3,False,False,0.5,True,False
3,AAGAGGGCCA,4,2,4,0,False,True,0.6,True,False
4,AATAAAAGCC,6,2,1,1,False,True,0.3,True,False
5,TAATCGGCGC,2,3,3,2,False,False,0.6,True,False
6,TATGCGCGGA,2,2,4,2,False,False,0.6,True,False
7,TAGATCGTTC,2,2,2,4,False,False,0.4,True,False
8,ACCCCTAATC,3,5,0,2,False,True,0.5,True,False
9,CCGCGGGGAG,1,3,6,0,False,True,0.9,False,False


9000


In [5]:
def all_filter(df):
    df_nt = nt_type(df)
    df_tp = threepeat(df_nt)
    df_gcpct = GCpct(df_tp)
    df_GG = no_GG(df_gcpct)
    
    filters = ['nt_flag','threepeat_flag','GG_flag']
    
#     df_GG['pass_filter'] = [True if x == False else False for x in [df_GG[['nt_flag','threepeat_flag','GG_flag']]]]
    df_GG['pass_filter'] = [False if x != 0 else True for x in 
                            np.sum(df_GG[filters] == True,1)]

#     print (df_GG.head(10))
    df_filtered = df_GG.copy()
    return df_filtered

In [7]:
df_filtered = all_filter(df_barcode)

df_pf = df_filtered[df_filtered.pass_filter == True]
print (len(df_pf))
display (df_pf.head(20))
# display(df_filtered.head(20))

df_pf.to_csv = ('/Users/rchin/PycharmProjects/IPS_Primer_Generator/Filtered_10nt_Barcodes.csv')


5337


Unnamed: 0,barcode,A,C,G,T,nt_flag,threepeat_flag,GC_pct,GCpct_flag,GG_flag,pass_filter
0,CCAGCTAACT,3,4,1,2,False,False,0.5,True,False,True
2,TGTCGCAGAT,2,2,3,3,False,False,0.5,True,False,True
5,TAATCGGCGC,2,3,3,2,False,False,0.6,True,False,True
6,TATGCGCGGA,2,2,4,2,False,False,0.6,True,False,True
7,TAGATCGTTC,2,2,2,4,False,False,0.4,True,False,True
10,TCGTGGAACC,2,3,3,2,False,False,0.6,True,False,True
11,ACTCAGGTGA,3,2,3,2,False,False,0.5,True,False,True
12,CGTGTATGCC,1,3,3,3,False,False,0.6,True,False,True
13,AATCAGCGTC,3,3,2,2,False,False,0.5,True,False,True
14,GTTGTAGTGT,1,0,4,5,False,False,0.4,True,False,True


In [119]:
word = "ATGCTCAGGGCATAT"
# print (word.find('CGG'))
# print ('CGG' in word)
s_word = pd.Series(['AGTAGCATCGGGATAG'])
# print(s_word)

any(g in word for g in ['GGG', 'AAA'])
# if any(ext in url_string for ext in extensionsToCheck)

True

In [136]:
s_nt = pd.DataFrame({'a':    [ 0,  29, 5],
                     'b': [1, 170, 3],
                     'c': [31, 115, 2],
                    'd': [3, 5, 7]})
[True if x > 1 else False for x in np.sum(s_nt.values == 0,1)]

[False, False, False]

In [8]:
df_pf.to_csv = ('/Users/rchin/Downloads/Filtered_10nt_Barcodes.csv')

In [15]:
os.getcwd()

'/Users/rchin/PycharmProjects/IPS_Primer_Generator'