### import libraries and paths

In [1]:
import re
import os
import gzip
import pandas as pd

# only look at rounds files; not ZeroCycle
# Two candidate locations are listed; the second assignment overrides the first,
# so only the second path is actually used below.
data_dir = "/n/holyscratch01/wadduwage_lab/ramith/SELEX/SELEX/"
data_dir = "/home/ruh/SELEX_Cluster/"

# alphabetically sorted names of every entry that contains '.gz' and is not a ZeroCycle file
file_names = sorted(
    entry
    for entry in os.listdir(data_dir)
    if '.gz' in entry and "ZeroCycle" not in entry
)

In [2]:
# sanity check: the listing above is expected to contain exactly 2069 files;
# the message reports what was actually found so a failure is diagnosable
assert len(file_names) == 2069, f"expected 2069 files in {data_dir}, found {len(file_names)}"

In [3]:
# preview: print the total count and the first (up to) ten file names;
# slicing avoids an IndexError when fewer than ten files are present
print(f"found {len(file_names)} number of files:")
for name in file_names[:10]:
    print(name)

found 2069 number of files:
ALX3_ESAE_TGCAAG20NGA_1.txt.gz
ALX3_ESAE_TGCAAG20NGA_2.txt.gz
ALX3_ESAE_TGCAAG20NGA_3.txt.gz
ALX3_ESAE_TGCAAG20NGA_4.txt.gz
ALX3_ESZ_TGTAAA20NAAG_1.txt.gz
ALX3_ESZ_TGTAAA20NAAG_2.txt.gz
ALX3_ESZ_TGTAAA20NAAG_3.txt.gz
ALX3_ESZ_TGTAAA20NAAG_4.txt.gz
ALX4_ESW_TGTGTC20NGA_1.txt.gz
ALX4_ESW_TGTGTC20NGA_2.txt.gz


In [4]:
def get_regex_matches(string, round_id, verbose = False):
    """
        Split a file name into its components, but only if it belongs to round {round_id}.

        The name is searched (not anchored) for the layout
            <code1>_<code2>_<code3><code4><code5>_<round_id>.txt.gz
        e.g. ALX3_ESAE_TGCAAG20NGA_4.txt.gz, where
            code1 : letters, digits or '-'   (used elsewhere in this notebook as the protein name)
            code2 : letters
            code3 : letters
            code4 : digits followed by 'N'
            code5 : optional upper-case letters

        Parameters
        ----------
        string   : file name to inspect
        round_id : round number the file name must end with (interpolated into the pattern as-is)
        verbose  : when True, print the five components on a match, or a failure note otherwise

        Returns
        -------
        tuple of five strings (code1..code5; code5 is '' when absent), or None when
        the name does not match the pattern for this round.
    """

    pattern = rf"([A-Za-z0-9-]+)_([A-Za-z]+)_([A-Za-z]+)(\d+N)([A-Z]+)?_{round_id}\.txt\.gz"
    found = re.search(pattern, string)

    if found is None:
        if verbose:
            print(f"Regex failed at {string}")
        return None

    # default="" keeps an absent optional group as an empty string,
    # which is what the first element of re.findall(pattern, string) would contain
    codes = found.groups(default="")

    if verbose:
        # the pattern has exactly five capture groups, so unpack exactly five names
        code1, code2, code3, code4, code5 = codes
        print(code1)  # Output: Alx1
        print(code2)  # Output: ESZ
        print(code3)  # Output: TAAAGC
        print(code4)  # Output: 20N
        print(code5)  # Output: CG

    return codes
    
def additional_check(file_name, regex_out):
    """
        Cross-check the regex output against a plain underscore split of the file name.

        The first two entries of `regex_out` must equal the first two "_"-separated
        fields of `file_name`; an AssertionError is raised otherwise.
        Returns None.
    """
    fields = file_name.split("_")

    # first field (used elsewhere in this notebook as the protein name)
    regex_first, split_first = regex_out[0], fields[0]
    assert regex_first == split_first, f"{file_name} --> {regex_first}<>{split_first}"

    # second field
    regex_second, split_second = regex_out[1], fields[1]
    assert regex_second == split_second

    # check on the trailing field is disabled (kept for reference):
    #assert regex_out[-1] == codes[-1][:-len('.txt.gz')], f"{file_name} --> {regex_out[-1]}<>{codes[-1]}"

    return None

In [5]:
## Per-round containers, keyed by the round id as a string ('1' .. '5');
## every key starts with its own empty list

rp_dict    = {str(r): [] for r in range(1, 6)} #proteins in each round_id
rf_dict    = {str(r): [] for r in range(1, 6)} #file_name in each round_id
regex_dict = {str(r): [] for r in range(1, 6)} #the whole segmented file keys

### Loop through the file names to gather a per-round summary of the data

In [6]:
# For every round 1..5: collect the matching file names, their first regex field
# (the protein name) and the full regex tuple, dump each tuple to temp.txt, and
# print a short summary.
#
# temp.txt is opened once, outside the round loop: opening it with 'w' inside the
# loop truncated it on every round, so only the last round's records survived.
with open('temp.txt', 'w') as file:

    for round_id in range(1,6):
        round_key = str(round_id)

        # start from empty lists so re-running this cell does not duplicate entries
        rp_dict[round_key].clear()
        rf_dict[round_key].clear()
        regex_dict[round_key].clear()

        for string in file_names:

            # get file matches for round_id
            matches = get_regex_matches(string, round_id)

            if matches is None: # not something we are interested in
                continue

            ## valid match
            rp_dict[round_key].append(matches[0])
            rf_dict[round_key].append(string)
            regex_dict[round_key].append(matches)

            # one line per file: the regex tuple with the round id appended
            file.write(str(matches + (round_id,))+'\n')

            # sanity check
            additional_check(string, matches)

        ### Some stats
        print(f"Round {round_id}")
        print(rf_dict[round_key][:3])
        print(len(rp_dict[round_key])) #number of proteins in round

        u_protiens = set(rp_dict[round_key]) #number of unique proteins in round
        ordered_proteins = sorted(u_protiens)  # sort once, reuse for first and last
        if ordered_proteins:  # a round without any file has no first/last name to show
            print(ordered_proteins[0], ordered_proteins[-1])
        print(len(u_protiens))
        print("__________")

Round 1
['ALX3_ESAE_TGCAAG20NGA_1.txt.gz', 'ALX3_ESZ_TGTAAA20NAAG_1.txt.gz', 'ALX4_ESW_TGTGTC20NGA_1.txt.gz']
513
ALX3 Zic3
441
__________
Round 2
['ALX3_ESAE_TGCAAG20NGA_2.txt.gz', 'ALX3_ESZ_TGTAAA20NAAG_2.txt.gz', 'ALX4_ESW_TGTGTC20NGA_2.txt.gz']
516
ALX3 Zic3
443
__________
Round 3
['ALX3_ESAE_TGCAAG20NGA_3.txt.gz', 'ALX3_ESZ_TGTAAA20NAAG_3.txt.gz', 'ALX4_ESW_TGTGTC20NGA_3.txt.gz']
516
ALX3 Zic3
441
__________
Round 4
['ALX3_ESAE_TGCAAG20NGA_4.txt.gz', 'ALX3_ESZ_TGTAAA20NAAG_4.txt.gz', 'ALX4_ESW_TGTGTC20NGA_4.txt.gz']
518
ALX3 Zic3
444
__________
Round 5
['Foxk1_ESQ_TGGGTA20NCG_5.txt.gz', 'Gbx1_ESQ_TCGGTT20NCG_5.txt.gz', 'Hlf_ESP_TCAATT20NTA_5.txt.gz']
6
Foxk1 Sox10
6
__________


### Reading round 4 files to save positives 🤓

In [7]:
def is_valid_dna(seq):
    """
        Return True when every character of `seq` is one of A, C, G, T
        (upper case only); an empty sequence is considered valid.
    """
    allowed = set('ACGT')
    for base in seq:
        if base not in allowed:
            return False
    return True

In [8]:
## state for reading the files of a single round
r_idx = 4           # round whose files are processed below
ambiguous_dna = 0   # counter for fragments that fail is_valid_dna

## dictionaries in which we will keep our sequence data, keyed by the round id
r_dna_seq_dict     = {str(r_idx): list()}
r_dna_seq_len_dict = {str(r_idx): list()}

# NOTE: `file_names` is rebound here; from now on it holds only the
# file names collected for round r_idx, not the full directory listing
file_names = rf_dict[str(r_idx)]

print(f" Number of files we have = {len(file_names)}")

 Number of files we have = 518


In [9]:
import csv

In [10]:
# Write every unambiguous DNA fragment of the selected round (r_idx) to
# positives.csv, one row per fragment, tagged with its protein name and label 1.
ambiguous_dna = 0  # reset here so re-running this cell does not double count

with open('positives.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['protein_id', 'dna_fragment', 'label'])  # Write header

    ## Open each round 4 protein file
    for idx, file_name in enumerate(file_names):

        ## extract out protein name (text before the first underscore)
        protein_name = file_name.split("_")[0]

        if idx % 50 == 0:
            print(f"{idx} --> {protein_name}", end = "\n")

        # make sure that the protein name matches the regex-derived one stored
        # at the same position; checked before the file is opened
        assert protein_name == regex_dict[str(r_idx)][idx][0]

        ## Open file (os.path.join works with or without a trailing slash in data_dir)
        with gzip.open(os.path.join(data_dir, file_name), 'rt') as f:

            for i, line in enumerate(f):
                # Only every fourth line, starting with the second line (index 1), is used
                # NOTE(review): presumably 4-line FASTQ-style records — confirm
                if i % 4 != 1:
                    continue

                dna_fragment = line.strip()

                if is_valid_dna(dna_fragment):
                    writer.writerow([protein_name, dna_fragment, 1])
                else:
                    # contains a character other than A/C/G/T: count it and skip
                    ambiguous_dna += 1

0 --> ALX3
50 --> Creb5
100 --> ESX1
150 --> GCM1
200 --> HSF1
250 --> MEF2B
300 --> NOTO
350 --> POU3F1
400 --> SCRT1
450 --> TCF4
500 --> ZNF143


In [11]:
# number of fragments that were not written to positives.csv because they
# contained a character other than A, C, G or T
ambiguous_dna

1789089

In [12]:
# peek at the first entry of file_names (rebound earlier to the file names of round r_idx)
file_names[0]

'ALX3_ESAE_TGCAAG20NGA_4.txt.gz'