### import libraries and paths

In [1]:
import re
import os
import gzip
import pandas as pd

# Restrict the listing to per-round files; anything with "ZeroCycle" in its name is left out here
data_dir = "/n/holyscratch01/wadduwage_lab/ramith/SELEX/SELEX/"

file_names = sorted(
    entry
    for entry in os.listdir(data_dir)
    if '.gz' in entry and "ZeroCycle" not in entry
)

In [2]:
# Sanity check: the listing is expected to hold exactly 2069 non-ZeroCycle .gz files
assert len(file_names) == 2069

In [3]:
# Preview the first file names. A slice is used instead of indexing 0..9 so the
# cell does not raise IndexError when fewer than 10 files are present.
print(f"found {len(file_names)} number of files:")
for name in file_names[:10]:
    print(name)

found 2069 number of files:
ALX3_ESAE_TGCAAG20NGA_1.txt.gz
ALX3_ESAE_TGCAAG20NGA_2.txt.gz
ALX3_ESAE_TGCAAG20NGA_3.txt.gz
ALX3_ESAE_TGCAAG20NGA_4.txt.gz
ALX3_ESZ_TGTAAA20NAAG_1.txt.gz
ALX3_ESZ_TGTAAA20NAAG_2.txt.gz
ALX3_ESZ_TGTAAA20NAAG_3.txt.gz
ALX3_ESZ_TGTAAA20NAAG_4.txt.gz
ALX4_ESW_TGTGTC20NGA_1.txt.gz
ALX4_ESW_TGTGTC20NGA_2.txt.gz


In [4]:
def get_regex_matches(string, round_id, verbose = False):
    """
    Split a file name into its regex groups, for one particular round only.

    Only names ending in `_{round_id}.txt.gz` match, so files that belong to
    other rounds are filtered out by returning None.

    Parameters
    ----------
    string : str
        File name, e.g. "ALX3_ESAE_TGCAAG20NGA_1.txt.gz".
    round_id : int or str
        Round number that must appear right before ".txt.gz".
    verbose : bool, optional
        When True, print the captured groups on success, or a failure note
        when the name does not match.

    Returns
    -------
    tuple of str or None
        The five captured groups of the first match, e.g.
        ("ALX3", "ESAE", "TGCAAG", "20N", "GA"), or None when nothing matches.
        The last group is "" when the optional trailing letters are absent.
    """

    pattern = rf"([A-Za-z0-9-]+)_([A-Za-z]+)_([A-Za-z]+)(\d+N)([A-Z]+)?_{round_id}\.txt\.gz"
    matches = re.findall(pattern, string)

    if not matches:
        if verbose:
            print(f"Regex failed at {string}")
        return None

    first_match = matches[0]

    if verbose:
        # The pattern has exactly five capture groups. Unpacking into six names
        # raised ValueError on every successful match with verbose=True.
        code1, code2, code3, code4, code5 = first_match
        print(code1)  # e.g. ALX3
        print(code2)  # e.g. ESAE
        print(code3)  # e.g. TGCAAG
        print(code4)  # e.g. 20N
        print(code5)  # e.g. GA

    return first_match
    
def additional_check(file_name, regex_out):
    """
    Cross-check the regex output against a plain underscore split of the file name.

    The first two entries of `regex_out` must equal the first two "_"-separated
    fields of `file_name`; otherwise an AssertionError is raised. Returns None.
    """
    name_fields = file_name.split("_")

    # first field, with a message that shows both values on mismatch
    assert regex_out[0]  == name_fields[0], f"{file_name} --> {regex_out[0]}<>{name_fields[0]}"
    # second field
    assert regex_out[1]  == name_fields[1]
    # the last field (round suffix) is not compared; that check is disabled

In [5]:
## Per-round accumulators, keyed by the round id as a string ('1' .. '5')

rp_dict    = {str(r): [] for r in range(1, 6)}  # proteins in each round_id
rf_dict    = {str(r): [] for r in range(1, 6)}  # file_name in each round_id
regex_dict = {str(r): [] for r in range(1, 6)}  # the whole tuple of regex groups per file

### Loop through the file names to gather per-round summary statistics

In [6]:
# Open the log once for the whole run. Re-opening it with mode 'w' inside the
# round loop truncated it on every round, so only the last round's lines survived
# even though each line carries its round id.
with open('temp.txt', 'w') as file:

    for round_id in range(1,6):
        round_key = str(round_id)

        for string in file_names:

            # get file matches for round_id
            matches = get_regex_matches(string, round_id)

            if matches is None: # not something we are interested in
                continue

            # sanity check first, so a bad name never ends up in the dictionaries
            additional_check(string, matches)

            ## valid match
            rp_dict[round_key].append(matches[0])
            rf_dict[round_key].append(string)
            regex_dict[round_key].append(matches)

            file.write(str(matches + (round_id,))+'\n')

        ### Some stats
        print(f"Round {round_id}")
        print(rf_dict[round_key][:3])
        print(len(rp_dict[round_key])) #number of files (one protein entry each) in round

        u_protiens = set(rp_dict[round_key]) #unique proteins in round
        ordered_proteins = sorted(u_protiens)  # sort once, reuse for first/last
        print(ordered_proteins[0], ordered_proteins[-1])
        print(len(u_protiens))
        print("__________")

Round 1
['ALX3_ESAE_TGCAAG20NGA_1.txt.gz', 'ALX3_ESZ_TGTAAA20NAAG_1.txt.gz', 'ALX4_ESW_TGTGTC20NGA_1.txt.gz']
513
ALX3 Zic3
441
__________
Round 2
['ALX3_ESAE_TGCAAG20NGA_2.txt.gz', 'ALX3_ESZ_TGTAAA20NAAG_2.txt.gz', 'ALX4_ESW_TGTGTC20NGA_2.txt.gz']
516
ALX3 Zic3
443
__________
Round 3
['ALX3_ESAE_TGCAAG20NGA_3.txt.gz', 'ALX3_ESZ_TGTAAA20NAAG_3.txt.gz', 'ALX4_ESW_TGTGTC20NGA_3.txt.gz']
516
ALX3 Zic3
441
__________
Round 4
['ALX3_ESAE_TGCAAG20NGA_4.txt.gz', 'ALX3_ESZ_TGTAAA20NAAG_4.txt.gz', 'ALX4_ESW_TGTGTC20NGA_4.txt.gz']
518
ALX3 Zic3
444
__________
Round 5
['Foxk1_ESQ_TGGGTA20NCG_5.txt.gz', 'Gbx1_ESQ_TCGGTT20NCG_5.txt.gz', 'Hlf_ESP_TCAATT20NTA_5.txt.gz']
6
Foxk1 Sox10
6
__________


### Check whether the set of proteins differs between rounds

In [7]:
# Do rounds 1 and 3 cover exactly the same set of proteins?
set(rp_dict['1']) == set(rp_dict['3'])

False

In [8]:
# proteins with a round-4 file but no round-1 file
set(rp_dict['4']) - set(rp_dict['1'])

{'DRGX', 'E2F1', 'HES5', 'TFEB'}

In [9]:
# proteins with a round-4 file but no round-2 file
set(rp_dict['4']) - set(rp_dict['2'])

{'E2F1', 'ZSCAN4'}

In [10]:
# proteins with a round-4 file but no round-3 file
set(rp_dict['4']) - set(rp_dict['3'])

{'BHLHB2', 'FIGLA', 'SCRT2', 'ZNF232'}

In [11]:
# Spot check: is there any round-2 file for protein 'E2F1'?
'E2F1' in rp_dict['2']

False

In [12]:
# proteins with a round-1 file but no round-4 file
set(rp_dict['1']) - set(rp_dict['4'])

{'MEF2D'}

In [13]:
# proteins with a round-1 file but no round-3 file
set(rp_dict['1']) - set(rp_dict['3'])

{'BHLHB2', 'FIGLA', 'SCRT2', 'ZNF232'}

### Read protein_info.csv to see how many unique proteins there are

In [14]:
# Load the protein table and collect the distinct values of its 'HNGC-name' column
df = pd.read_csv('protein_info.csv')
all_proteins = set(df['HNGC-name'].tolist())

In [15]:
# number of distinct names in the 'HNGC-name' column
len(all_proteins)

463

In [16]:
# names listed in the protein table that have no round-1 file
all_proteins - set(rp_dict['1'])

{'CTCF',
 'DRGX',
 'E2F1',
 'EHF',
 'ELF4',
 'ELK4',
 'ETV2',
 'Egr1_E410D_FARSDERtoFARSDDR',
 'GLIS1',
 'GLIS3',
 'HES5',
 'MEIS1',
 'NFKB2',
 'NRL',
 'RFX4',
 'SNAI2',
 'SP1',
 'SP3',
 'SPIB',
 'TFEB',
 'TP63',
 'Trp53',
 'Trp73'}

### Reading round files to count number of DNA fragments (read HISEQ2 data to retrieve DNA data)

In [17]:
def is_valid_dna(seq):
    """Return True when every character of `seq` is one of 'A', 'C', 'G', 'T'.

    The check is case-sensitive, and an empty sequence counts as valid.
    """
    return set('ACGT').issuperset(seq)

In [None]:
## Containers for the sequence data of the selected round
ambiguous_dna = 0   # running count of sequences that fail is_valid_dna

r_idx = 4
round_key = str(r_idx)

r_dna_seq_len_dict = {round_key: [] }   # length of every sequence read
r_dna_seq_dict     = {round_key: [] }   # every sequence read, in file order

file_names = rf_dict[round_key] #retrieve file names

protein_dna_matched_dict = {}   # protein name -> list of (sequence, is_valid) tuples

print(f" Number of files we have = {len(file_names)}")

for idx, file in enumerate(file_names): ## loop through files in round_idx

    # lightweight progress indicator: every 10th file index
    if idx % 10 == 0:
        print(idx, end = ",")

    file_name = file
    protein_name = file_name.split("_")[0]

    ## Open file
    with gzip.open(data_dir + file_name, 'rt') as f:

        assert protein_name == regex_dict[round_key][idx][0] #make sure that the protein name matches

        # creates the empty list on the first file of a protein, reuses it afterwards
        per_protein = protein_dna_matched_dict.setdefault(protein_name, [])

        for line_no, line in enumerate(f):
            # keep only every fourth line, starting with the second line (index 1)
            if line_no % 4 != 1:
                continue

            sequence = line.strip()

            r_dna_seq_dict[round_key].append(sequence)
            r_dna_seq_len_dict[round_key].append(len(sequence)) ## keep track of lengths

            is_valid = is_valid_dna(sequence)
            if not is_valid:
                ambiguous_dna += 1

            ## Add to matched dictionary as well !
            per_protein.append((sequence, is_valid))

 Number of files we have = 518
0,10,20,30,40,50,60,70,80,

In [None]:
# number of sequences read so far that failed is_valid_dna
ambiguous_dna

In [None]:
# first regex group (protein name) of the first file in round r_idx
regex_dict[str(r_idx)][0][0]

In [None]:
# first file name of the selected round (file_names was rebound to rf_dict[str(r_idx)])
file_names[0]

In [None]:
# One length entry per stored sequence. Keyed by r_idx because the dictionaries
# only contain str(r_idx); a hard-coded '4' breaks as soon as r_idx changes.
assert len(r_dna_seq_len_dict[str(r_idx)])==len(r_dna_seq_dict[str(r_idx)])

In [None]:
# total number of sequences read for round r_idx (keyed by r_idx, not a hard-coded '4')
len(r_dna_seq_dict[str(r_idx)])

In [None]:
import numpy as np

In [None]:
# `dna_sequence_lengths` was never defined before this cell, so the original
# self-referencing assignment raised NameError on a fresh kernel. Build the array
# from the per-sequence lengths collected while reading the round files.
dna_sequence_lengths = np.array(r_dna_seq_len_dict[str(r_idx)])

In [None]:
# longest sequence length in the array
dna_sequence_lengths.max()

In [None]:
# shortest sequence length in the array
dna_sequence_lengths.min()

### Load Zero Cycles

In [None]:
import re
import os

# This time keep only the .gz files whose name contains "ZeroCycle"
file_names = sorted([x for x in os.listdir(data_dir) if ('.gz' in x and "ZeroCycle" in x)])

# A bare expression in the middle of a cell is evaluated and thrown away,
# so the preview has to be printed explicitly to show up.
print(file_names[:5], len(file_names))

zero_cycle_dna_seq = []       # every ZeroCycle sequence, in read order
issue_pos = []                # indices into zero_cycle_dna_seq that fail is_valid_dna
zero_cycle_dna_seq_len = []   # length of each ZeroCycle sequence

count = 0   # index of the sequence currently being appended

for idx, file in enumerate(file_names):
    # progress indicator: every 10th file index
    if idx % 10 == 0:
        print(idx, end = ",")

    with gzip.open(data_dir + file, 'rt') as f:
        for i, line in enumerate(f):
            if i % 4 == 1:  # Every fourth line, starting with the second line (index 1)
                temp = line.strip()
                zero_cycle_dna_seq.append(temp)
                zero_cycle_dna_seq_len.append(len(temp))

                # remember where invalid sequences sit so they can be skipped later
                if not is_valid_dna(temp):
                    issue_pos.append(count)
                count += 1

In [None]:
# number of distinct proteins that have sequences loaded
len(protein_dna_matched_dict)

In [None]:
# one length entry must exist per stored ZeroCycle sequence
assert len(zero_cycle_dna_seq) == len(zero_cycle_dna_seq_len)

In [None]:
# total number of ZeroCycle sequences read
len(zero_cycle_dna_seq_len)

In [None]:
# look at one flagged position and the sequence stored there
issue_no = 0
issue_pos[issue_no], zero_cycle_dna_seq[issue_pos[issue_no]]

In [None]:
# `protein_id` is only bound by the export loop further down, so on a fresh
# top-to-bottom run this cell raised NameError. Pick a protein explicitly
# (the first one loaded) for the preview.
protein_id = next(iter(protein_dna_matched_dict))
protein_dna_matched_dict[protein_id][:10]

In [None]:
search_sequence = 'TAAGCTCACAGCAGGGGGTT'

# NOTE(review): `exists` is not assigned anywhere in the visible notebook, so this
# line raises NameError on a fresh kernel. Presumably a membership test for
# `search_sequence` was meant to be computed here. Confirm against which
# collection, then define it.
exists

In [None]:
# Bind `protein_id` here instead of relying on a value leaked from another cell
# (the former bare `protein_id` line displayed nothing, being mid-cell).
protein_id = next(iter(protein_dna_matched_dict))
# distinct sequences among the first 100 entries recorded for this protein
matches_set = set(item[0] for item in protein_dna_matched_dict[protein_id][:100])
len(matches_set)

In [None]:
import csv

# Header row for the per-protein CSV files; kept as a list of rows because it is written with writerows
headers = [["Protein_Name", "DNA_Fragment", "Label"]]

In [None]:
# Set membership is constant-time. Testing `idx in issue_pos` against the list
# rescanned it for every fragment of every protein.
invalid_positions = set(issue_pos)

for protein_id in protein_dna_matched_dict: #loop through each protein

    with open(f"{protein_id}_negatives.csv", 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(headers)

        # every sequence recorded for this protein in the loaded round
        matches_set = set(item[0] for item in protein_dna_matched_dict[protein_id])

        ## loop through all zero cycle fragments
        for idx, dna_fragment in enumerate(zero_cycle_dna_seq):

            if idx in invalid_positions: #if invalid continue
                continue

            # a fragment that is not among this protein's sequences is a negative (label 0)
            if dna_fragment not in matches_set:
                writer.writerow([protein_id, dna_fragment, 0])

        # keys are already strings, so no str() conversion is needed for the lookup
        print(protein_id, len(protein_dna_matched_dict[protein_id]))


In [None]:
# NOTE(review): bare literal with no effect beyond echoing 6; looks like leftover scratch. Confirm and remove.
6