In [1]:
import re
import os
import sys
import gzip
import pandas as pd

# Add the parent directory to the module search path for this kernel session.
sys.path.append("../")

In [2]:
# Directory that is listed for the gzipped read files processed below.
data_dir = "/net/dali/home/chikina/shared_data/SELEX"

# Accepted alphabet: upper-case unambiguous bases only ('N', lower case, etc. are rejected).
valid_chars = set('ACGT')

def is_valid_dna(seq):
    """Return True when `seq` is non-empty and contains only A, C, G or T.

    An empty (or otherwise falsy) sequence is rejected: `all()` over an empty
    string is vacuously True, which would let blank lines pass as valid DNA
    and be stored as the empty string.

    Parameters
    ----------
    seq : str
        Candidate sequence, already stripped of surrounding whitespace.

    Returns
    -------
    bool
        True if every character of a non-empty `seq` is in `valid_chars`.
    """
    return bool(seq) and valid_chars.issuperset(seq)

In [3]:
# Load the protein table and collect the names it lists; the round-1 and
# round-4 cells skip any file whose leading name component is not in this set.
# NOTE(review): the column key is spelled 'HNGC-name' — it must match the CSV header exactly.
df = pd.read_csv('protein_info.csv')
all_proteins = {name for name in df['HNGC-name'].tolist()}

In [4]:
# all_proteins

In [7]:
# Collect every distinct valid read found in the ZeroCycle files. Each accepted
# read is stored twice: as read, and reversed.
unique_zero_cycle_dna_seq = set()
count = 0  # accepted reads, duplicates included

zero_cycle_file_names = sorted(
    name for name in os.listdir(data_dir)
    if "ZeroCycle" in name and '.gz' in name
)

for file_no, file_name in enumerate(zero_cycle_file_names):
    # Progress marker: echo every tenth file index on one line.
    if file_no % 10 == 0:
        print(file_no, end=",")

    with gzip.open(f"{data_dir}/{file_name}", 'rt') as handle:
        for line_no, raw_line in enumerate(handle):
            # Keep only lines 1, 5, 9, ... (0-based) — presumably the sequence
            # line of each 4-line FASTQ record; confirm against the file format.
            if line_no % 4 != 1:
                continue

            read = raw_line.strip()
            if not is_valid_dna(read):
                continue

            # Add valid reads to the set of unique sequences.
            unique_zero_cycle_dna_seq.add(read)
            # NOTE(review): [::-1] reverses the string without complementing
            # the bases — confirm plain reversal (not reverse complement) is intended.
            unique_zero_cycle_dna_seq.add(read[::-1])
            count += 1

0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,

In [8]:
# Number of valid reads accepted from the ZeroCycle files (duplicates included).
# `count` is reset and reused by the round-1 and round-4 cells further down.
count

287265215

In [11]:
# Collect every distinct valid read from the round-1 files (names containing
# "_1.txt.gz"). Each accepted read is stored twice: as read, and reversed.
unique_round_one_dna_seq = set()
count = 0  # accepted reads, duplicates included

round_one_file_names = sorted(
    name for name in os.listdir(data_dir)
    if "_1.txt.gz" in name and '.gz' in name
)

for file_no, file_name in enumerate(round_one_file_names):
    # Progress marker: echo every tenth file index (emitted before the skip test).
    if file_no % 10 == 0:
        print(file_no, end=",")

    # The text before the first underscore is used as the protein identifier.
    prot_id = file_name.split("_")[0]
    if prot_id not in all_proteins:
        print(f"Skipping {prot_id}")
        continue

    with gzip.open(f"{data_dir}/{file_name}", 'rt') as handle:
        for line_no, raw_line in enumerate(handle):
            # Keep only lines 1, 5, 9, ... (0-based) — presumably the sequence
            # line of each 4-line FASTQ record; confirm against the file format.
            if line_no % 4 != 1:
                continue

            read = raw_line.strip()
            if not is_valid_dna(read):
                continue

            # Add valid reads to the set of unique sequences.
            unique_round_one_dna_seq.add(read)
            # NOTE(review): [::-1] reverses the string without complementing
            # the bases — confirm plain reversal (not reverse complement) is intended.
            unique_round_one_dna_seq.add(read[::-1])
            count += 1

count

0,10,20,30,40,50,60,70,80,90,100,Skipping Egr1
110,120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460,470,480,490,500,510,

1212165697

In [13]:
# Collect every distinct valid read from the round-4 files (names containing
# "_4.txt.gz"). Each accepted read is stored twice: as read, and reversed.
unique_round_four_dna_seq = set()
count = 0  # accepted reads, duplicates included

round_four_file_names = sorted(
    name for name in os.listdir(data_dir)
    if "_4.txt.gz" in name and '.gz' in name
)

for file_no, file_name in enumerate(round_four_file_names):
    # Progress marker: echo every tenth file index (emitted before the skip test).
    if file_no % 10 == 0:
        print(file_no, end=",")

    # The text before the first underscore is used as the protein identifier.
    prot_id = file_name.split("_")[0]
    if prot_id not in all_proteins:
        print(f"Skipping {prot_id}")
        continue

    with gzip.open(f"{data_dir}/{file_name}", 'rt') as handle:
        for line_no, raw_line in enumerate(handle):
            # Keep only lines 1, 5, 9, ... (0-based) — presumably the sequence
            # line of each 4-line FASTQ record; confirm against the file format.
            if line_no % 4 != 1:
                continue

            read = raw_line.strip()
            if not is_valid_dna(read):
                continue

            # Add valid reads to the set of unique sequences.
            unique_round_four_dna_seq.add(read)
            # NOTE(review): [::-1] reverses the string without complementing
            # the bases — confirm plain reversal (not reverse complement) is intended.
            unique_round_four_dna_seq.add(read[::-1])
            count += 1

count

0,10,20,30,40,50,60,70,80,90,100,110,Skipping Egr1
120,130,140,150,160,170,180,190,200,210,220,230,240,250,260,270,280,290,300,310,320,330,340,350,360,370,380,390,400,410,420,430,440,450,460,470,480,490,500,510,

802906198

## Summary of unique sequence counts (each set holds both the forward and the reversed form of every read)

In [14]:
# Size of the ZeroCycle set. Every accepted read was added both as read and
# reversed, so this counts distinct strings, not reads.
print(f"Round 0 : {len(unique_zero_cycle_dna_seq)}")

Round 0 : 573818279


In [15]:
# Size of the round-1 set. Every accepted read was added both as read and
# reversed, so this counts distinct strings, not reads.
print(f"Round 1 : {len(unique_round_one_dna_seq)}")

Round 1 : 2094201073


In [16]:
# Size of the round-4 set. Every accepted read was added both as read and
# reversed, so this counts distinct strings, not reads.
print(f"Round 4 : {len(unique_round_four_dna_seq)}")

Round 4 : 783078213


### Looking at intersections:

In [17]:
# Sequences present in both the ZeroCycle set and the round-4 set.
intersection_0_4 = unique_zero_cycle_dna_seq & unique_round_four_dna_seq

In [27]:
# Report how many sequences the ZeroCycle and round-4 sets share.
print(f"Intersection of 0 & 4: {len(intersection_0_4)}")

Intersection of 0 & 4: 973875


In [19]:
# Sequences present in both the round-1 set and the round-4 set.
intersection_1_4 = unique_round_one_dna_seq & unique_round_four_dna_seq

In [28]:
# Report how many sequences the round-1 and round-4 sets share.
print(f"Intersection of 1 & 4: {len(intersection_1_4)}")

Intersection of 1 & 4: 47750344


In [21]:
# Sequences present in both the ZeroCycle set and the round-1 set.
intersection_0_1 = unique_zero_cycle_dna_seq & unique_round_one_dna_seq

In [29]:
# Report how many sequences the ZeroCycle and round-1 sets share.
print(f"Intersection of 0 & 1: {len(intersection_0_1)}")

Intersection of 0 & 1: 2089325


In [23]:
# Sequences present in all three sets (ZeroCycle, round 1 and round 4).
intersection_0_1_4 = (
    unique_zero_cycle_dna_seq
    & unique_round_one_dna_seq
    & unique_round_four_dna_seq
)

In [30]:
# Report how many sequences occur in all three sets.
print(f"Intersection of 0 & 1 & 4: {len(intersection_0_1_4)}")

Intersection of 0 & 1 & 4: 137254


### Set union of 0,1,4

In [31]:
# All sequences seen in any of the three sets: start from a copy of the
# ZeroCycle set, then fold the round-1 and round-4 sets into it.
union_set = set(unique_zero_cycle_dna_seq)
union_set.update(unique_round_one_dna_seq, unique_round_four_dna_seq)

In [32]:
# Report the number of distinct sequences across all three sets combined.
print(f"Union of 0 & 1 & 4: {len(union_set)}")

Union of 0 & 1 & 4: 3400421275


In [33]:
# Output directory for the per-round files of unique sequences written below.
save_dir = "/net/dali/home/mscbio/rah304/shared_data/SELEX/dna_fragments"

In [34]:
def write_set_to_file(large_set, filename):
    """Write each element of `large_set` to `filename`, one element per line.

    The file is opened in text write mode, so an existing file is truncated.
    A 1 MiB buffer is requested to cut down on the number of write calls.
    Lines appear in the iteration order of `large_set`, which for a set is
    arbitrary.

    Parameters
    ----------
    large_set : iterable
        Items to write; each is formatted with an f-string and followed by a newline.
    filename : str
        Path of the output file.
    """
    with open(filename, 'w', buffering=1024*1024) as out:
        out.writelines(f"{item}\n" for item in large_set)

In [35]:
# Persist the ZeroCycle set (forward and reversed strings), one sequence per line.
write_set_to_file(unique_zero_cycle_dna_seq, f"{save_dir}/ZeroCycle_unique_fragments.txt")

In [None]:
# Persist the round-1 set (forward and reversed strings), one sequence per line.
# NOTE(review): this cell was saved without an execution count — confirm the file was actually written.
write_set_to_file(unique_round_one_dna_seq, f"{save_dir}/round_1_unique_fragments.txt")

In [None]:
# Persist the round-4 set (forward and reversed strings), one sequence per line.
# NOTE(review): this cell was saved without an execution count — confirm the file was actually written.
write_set_to_file(unique_round_four_dna_seq, f"{save_dir}/round_4_unique_fragments.txt")