In [1]:
import re
import os
import gzip
import pandas as pd
# Directory that is scanned for input files. Only names containing '.gz' are
# kept, and anything mentioning "ZeroCycle" is left out (rounds files only).
data_dir = "/home/ruh/SELEX_Cluster/"

file_names = sorted(
    filter(
        lambda name: '.gz' in name and "ZeroCycle" not in name,
        os.listdir(data_dir),
    )
)

In [2]:
# Sanity check on the directory listing; the message reports the actual count
# so a changed or incomplete data directory is easy to diagnose.
assert len(file_names) == 2069, f"expected 2069 files, found {len(file_names)}"

In [3]:
# Preview the first ten file names. The separator is joined outside the
# f-string because a backslash inside a replacement field is a SyntaxError on
# Python versions before 3.12; the printed text is unchanged.
preview = ' \n'.join(file_names[:10])
print(f"found {len(file_names)} number of files:\n{preview}")

found 2069 number of files:
ALX3_ESAE_TGCAAG20NGA_1.txt.gz 
ALX3_ESAE_TGCAAG20NGA_2.txt.gz 
ALX3_ESAE_TGCAAG20NGA_3.txt.gz 
ALX3_ESAE_TGCAAG20NGA_4.txt.gz 
ALX3_ESZ_TGTAAA20NAAG_1.txt.gz 
ALX3_ESZ_TGTAAA20NAAG_2.txt.gz 
ALX3_ESZ_TGTAAA20NAAG_3.txt.gz 
ALX3_ESZ_TGTAAA20NAAG_4.txt.gz 
ALX4_ESW_TGTGTC20NGA_1.txt.gz 
ALX4_ESW_TGTGTC20NGA_2.txt.gz


In [4]:
def get_regex_matches(string, round_id, verbose = False):
    """Extract the name fields of a file belonging to a given round.

    The pattern is searched for anywhere in ``string`` (it is not anchored)
    and requires the name to end in ``_{round_id}.txt.gz``.

    Parameters
    ----------
    string : str
        File name to inspect, e.g. ``"ALX3_ESAE_TGCAAG20NGA_1.txt.gz"``.
    round_id : int or str
        Round number that must appear right before ``.txt.gz``. It is
        interpolated into the regular expression as-is.
    verbose : bool, optional
        When true, print each captured field on success, or a failure
        message when nothing matches.

    Returns
    -------
    tuple of str or None
        The five captured groups of the first match, e.g.
        ``("ALX3", "ESAE", "TGCAAG", "20N", "GA")``. The last group is
        optional and is ``""`` when absent. ``None`` if there is no match.
    """
    pattern = rf"([A-Za-z0-9-]+)_([A-Za-z]+)_([A-Za-z]+)(\d+N)([A-Z]+)?_{round_id}\.txt\.gz"
    matches = re.findall(pattern, string)

    if not matches:
        if verbose:
            print(f"Regex failed at {string}")
        return None

    first_match = matches[0]

    if verbose:
        # The pattern has exactly five capture groups; iterate instead of
        # unpacking into a fixed number of names so the two cannot drift apart.
        for field in first_match:
            print(field)

    return first_match
    
def additional_check(file_name, regex_out):
    """Cross-check the regex result against a plain underscore split.

    The first two entries of ``regex_out`` must equal the first two
    underscore-separated fields of ``file_name``; otherwise an
    ``AssertionError`` is raised. Returns ``None``.
    """
    name_fields = file_name.split("_")

    first_field_ok = regex_out[0] == name_fields[0]
    assert first_field_ok, f"{file_name} --> {regex_out[0]}<>{name_fields[0]}"

    second_field_ok = regex_out[1] == name_fields[1]
    assert second_field_ok

    # The trailing field (round number) is deliberately not compared here.
    return None

In [5]:
# Per-round accumulators keyed by the round number as a string ('1'..'5'):
# rp_dict collects the first regex field of each matching file,
# rf_dict collects the matching file names themselves.
rp_dict = {str(round_number): [] for round_number in range(1, 6)}
rf_dict = {str(round_number): [] for round_number in range(1, 6)}

In [6]:
# For every round, collect the files whose name matches that round together
# with the first regex field of each (treated as the protein name below),
# then print a few summary statistics.
for round_id in range(1, 6):
    round_key = str(round_id)

    # Start every round from empty lists so that re-running this cell does not
    # append a second copy of each entry to what an earlier run collected.
    rp_dict[round_key] = []
    rf_dict[round_key] = []

    # NOTE(review): 'temp.txt' is reopened in 'w' mode on each iteration, so
    # after the loop it only contains the rows of the last round.
    with open('temp.txt', 'w') as file:

        for string in file_names:
            matches = get_regex_matches(string, round_id)

            if matches is None:  # name does not match this round
                continue

            # Validate before recording so a failed check leaves no partial entry.
            additional_check(string, matches)

            # These are the files we are interested in.
            rp_dict[round_key].append(matches[0])
            rf_dict[round_key].append(string)
            file.write(str(matches + (round_id,)) + '\n')

    ### Some stats
    print(f"Round {round_id}")
    print(rf_dict[round_key][:5])
    print(len(rp_dict[round_key]))

    u_protiens = set(rp_dict[round_key])
    ordered_proteins = sorted(u_protiens)
    if ordered_proteins:  # a round without files has no first/last name to show
        print(ordered_proteins[0], ordered_proteins[-1])
    print(len(u_protiens))
    print("__________")

Round 1
['ALX3_ESAE_TGCAAG20NGA_1.txt.gz', 'ALX3_ESZ_TGTAAA20NAAG_1.txt.gz', 'ALX4_ESW_TGTGTC20NGA_1.txt.gz', 'ARNTL_ESW_TCAAAA20NCG_1.txt.gz', 'ARX_ESZ_TGCGTT20NTGC_1.txt.gz']
513
ALX3 Zic3
441
__________
Round 2
['ALX3_ESAE_TGCAAG20NGA_2.txt.gz', 'ALX3_ESZ_TGTAAA20NAAG_2.txt.gz', 'ALX4_ESW_TGTGTC20NGA_2.txt.gz', 'ARNTL_ESW_TCAAAA20NCG_2.txt.gz', 'ARX_ESZ_TGCGTT20NTGC_2.txt.gz']
516
ALX3 Zic3
443
__________
Round 3
['ALX3_ESAE_TGCAAG20NGA_3.txt.gz', 'ALX3_ESZ_TGTAAA20NAAG_3.txt.gz', 'ALX4_ESW_TGTGTC20NGA_3.txt.gz', 'ARNTL_ESW_TCAAAA20NCG_3.txt.gz', 'ARX_ESZ_TGCGTT20NTGC_3.txt.gz']
516
ALX3 Zic3
441
__________
Round 4
['ALX3_ESAE_TGCAAG20NGA_4.txt.gz', 'ALX3_ESZ_TGTAAA20NAAG_4.txt.gz', 'ALX4_ESW_TGTGTC20NGA_4.txt.gz', 'ARNTL_ESW_TCAAAA20NCG_4.txt.gz', 'ARX_ESZ_TGCGTT20NTGC_4.txt.gz']
518
ALX3 Zic3
444
__________
Round 5
['Foxk1_ESQ_TGGGTA20NCG_5.txt.gz', 'Gbx1_ESQ_TCGGTT20NCG_5.txt.gz', 'Hlf_ESP_TCAATT20NTA_5.txt.gz', 'Pknox2_ESQ_TGCTCG20NCG_5.txt.gz', 'Rfx3_ESP_TGGGTA20NCG_5.txt.gz']


### Covered unique protein count

In [7]:
# True only when rounds 1 and 3 cover exactly the same names.
set(rp_dict['1']).symmetric_difference(rp_dict['3']) == set()

False

In [8]:
# Names present in round 4 but absent from round 1.
set(rp_dict['4']) - set(rp_dict['1'])

{'DRGX', 'E2F1', 'HES5', 'TFEB'}

In [9]:
# Names present in round 4 but absent from round 2.
set(rp_dict['4']) - set(rp_dict['2'])

{'E2F1', 'ZSCAN4'}

In [10]:
# Names present in round 4 but absent from round 3.
set(rp_dict['4']) - set(rp_dict['3'])

{'BHLHB2', 'FIGLA', 'SCRT2', 'ZNF232'}

In [11]:
# Is 'E2F1' among the names collected for round 2?
rp_dict['2'].count('E2F1') > 0

False

In [12]:
# Names present in round 1 but absent from round 4.
set(rp_dict['1']) - set(rp_dict['4'])

{'MEF2D'}

In [13]:
# Names present in round 1 but absent from round 3.
set(rp_dict['1']) - set(rp_dict['3'])

{'BHLHB2', 'FIGLA', 'SCRT2', 'ZNF232'}

In [14]:
# Load the protein table and gather the distinct values of its 'HNGC-name' column.
df = pd.read_csv('proteinInfo.csv')
all_proteins = {name for name in df['HNGC-name'].tolist()}

In [15]:
# Number of distinct values found in the 'HNGC-name' column.
len(all_proteins)

463

In [16]:
# Names listed in the protein table that have no round-1 file.
all_proteins - set(rp_dict['1'])

{'CTCF',
 'DRGX',
 'E2F1',
 'EHF',
 'ELF4',
 'ELK4',
 'ETV2',
 'Egr1_E410D_FARSDERtoFARSDDR',
 'GLIS1',
 'GLIS3',
 'HES5',
 'MEIS1',
 'NFKB2',
 'NRL',
 'RFX4',
 'SNAI2',
 'SP1',
 'SP3',
 'SPIB',
 'TFEB',
 'TP63',
 'Trp53',
 'Trp73'}

### Reading round files to count the number of DNA fragments

In [None]:
# Length of every recorded sequence line, across all round-4 files.
dna_sequence_lengths = []

# NOTE(review): this rebinds the notebook-wide `file_names` to the round-4
# subset; re-running earlier cells afterwards will see the narrowed list.
file_names = rf_dict['4']
print(f"number of files we have = {len(file_names)}")

for idx, file in enumerate(file_names):
    if idx % 10 == 0:
        print(idx, end = ",")  # lightweight progress indicator

    # os.path.join keeps the path valid whether or not data_dir ends with a separator.
    with gzip.open(os.path.join(data_dir, file), 'rt') as f:
        for i, line in enumerate(f):
            # Every fourth line, starting with the second line (index 1).
            # Presumably the sequence line of a 4-line record -- TODO confirm.
            if i % 4 == 1:
                dna_sequence_lengths.append(len(line.strip()))

number of files we have = 518
0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,

In [29]:
# Total number of line lengths collected by the reading loop.
len(dna_sequence_lengths)

807690805

In [30]:
import numpy as np

In [31]:
# Convert the collected lengths to a NumPy array. Note that this rebinds the
# name from a Python list to an ndarray.
dna_sequence_lengths = np.array(dna_sequence_lengths)

In [32]:
# Longest recorded line length.
dna_sequence_lengths.max()

40

In [33]:
# Shortest recorded line length.
dna_sequence_lengths.min()

14