In [12]:
import numpy as np
import pandas as pd
import os 
import glob
from more_itertools import windowed

In [13]:
## Define a function to reverse-complement a sequence. 

def rc(sequence):
    """
    Return the reverse-complement of a DNA sequence given as a string.

    Only the uppercase bases A, C, G and T are complemented. Every other
    character (for example 'N', or a lowercase base) is copied through
    unchanged, although it still ends up in its reversed position.
    """
    swap_table = str.maketrans('ACGT', 'TGCA')
    return sequence[::-1].translate(swap_table)


In [14]:
## Define a function that extracts all subsequences (k-mers) of a given length, advancing by a given step size.

def get_kmers(record: str, k: int, s: int):
    """
    Return every complete k-mer of `record`, taking one window every `s` positions.

    Parameters
    ----------
    record : the sequence to tile; it is converted with str() before slicing.
    k      : window (k-mer) length, must be >= 0.
    s      : distance between consecutive window starts, must be >= 1.

    Returns
    -------
    A list of strings: the windows starting at offsets 0, s, 2*s, ...
    Only windows that fit entirely inside the record are reported, so a
    trailing partial window is dropped and a record shorter than k gives [].

    Raises
    ------
    ValueError if k is negative or s is smaller than 1.
    """
    if k < 0:
        raise ValueError('n must be >= 0')
    if s < 1:
        raise ValueError('step must be >= 1')

    text = str(record)
    if k == 0:
        # A zero-length window degenerates to a single empty k-mer.
        return [""]

    # Slice the windows directly; the last valid start offset is len(text) - k.
    return [text[start:start + k] for start in range(0, len(text) - k + 1, s)]


In [15]:
## Define a function to flatten a list. 

def flatten_extend(matrix):
    """
    Flatten one level of nesting: return a single list holding the items
    of every row of `matrix`, in order. Deeper nesting is left as-is.

    Technique reference: https://realpython.com/python-flatten-list/
    """
    return [element for row in matrix for element in row]

In [16]:
## Given a list of input sequences, get all of their 27-mers (tiled with a step of 1) together with the reverse-complements of those 27-mers.

def get_anchors(sequence_list, k=27, step=1):
    """
    Tile every input sequence into k-mers and collect them with their
    reverse-complements.

    Parameters
    ----------
    sequence_list : iterable of sequences to tile.
    k             : k-mer length (default 27).
    step          : distance between consecutive k-mer starts (default 1).

    Returns
    -------
    A 2-tuple:
      [0] a sorted numpy array of the distinct strings found among the
          k-mers and their reverse-complements;
      [1] the list of reverse-complements, one per tiled k-mer, in tiling
          order (duplicates kept).
    """
    seqs = flatten_extend([get_kmers(sequence, k, step) for sequence in sequence_list])
    rcs = [rc(kmer) for kmer in seqs]
    return np.unique(list(set(rcs) | set(seqs))), rcs

In [17]:
## First, take a set of input sequences. These were shared by Peter on 2/7/24.  

inputt = """>K:18+19
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCAACGCCCTGGGTATAATAGCAGTCAGCAAGGCGCCCTGTAAGGGCAAAAGCCTTATAAAATCCTGACTCTTTGAAGCTTTTGCTTGGAGTTTTAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGTCCGAATTTACCCAGGGCGTTGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:2+3
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAATCAACAAGACGCCCTGTAAGGGCAAAAGCTTTGTTTATTGCCTGTTATTTAAAAGCTTTTGCCCTTACAGGGCGACGGGTTTGCGTTCATAATTACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:8+9
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTATGTTTTGTCTGGAGTTTTAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGTCCATAACAACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:4
ACGCCCTGAAAGGGCAGAAACTCCTAGCCCAGGGCAGCGCCCTGGGTATAATAGCAATAAACAAGACGCCCTGTAAGGGCAAAAGCTTCCTATATACAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGTATGTAATTACCCAGGGTGCTACCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:11
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGTAACACCCTGGGGATATGATGATTTTAACAACCTCGCCCTGTAAGGGCAAAAGCTTTGATTATGTGAAGCTTTTGCCCTTACAGGGCGACATTTTTGTTTGTATGTTAACCCAGGGTGTTACCCTGGGCTAGGAGTTTCTGCCCTTTCAGGGCGT
>K:13+14
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAATCAGCAAGACGCCCTGTAAGGGCAAAAGCTTTATGTTTTGCCTGGAGTTTTAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGTGTCCGTAATTACCCAGGGCGTTGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:16+17
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAATCAGCAAGACGCCCTGTAAGGGCAAAAGCTTTGTATATTGCCAGGTATTTGAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGTGTCCGTAATTACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:20
ACGCCCTGAAAGGGCAGAAGCCACTAGCCCAGGGCATCGCCCTGGGTATAATAGCAATCAGCAAGACGCCCTGTAAGGGCAAAAGCTTTGTATACTGCCAGGTATTTGAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGTGTCCGTAATTACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:21+22
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATGACAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTATTAATGGTCTGGTATTTTAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGACCGTAATAACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:23+24
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTGTATATTGCCAGGTATTTTAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGCCCATAAACACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:25
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATGGCAATCAGTAAGGCGCCCTGTAAGGGCAAAAGCTTTATGTTTTGTCTGGAGTTTTAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGTCCGTAATTACCCAGGGCGCTGCCCTGGGCTAGGGGCTTCTGCCCTTTCAGGGCGT
>K:26+27
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTGTATGTTGCCAGGTATTTTAAAGCTTTTGCCCTTACAGGGCGACAGGGTTGCGTCCATAATAACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:12
ACGGGCTGAAAGCCCAAAAGCACCTAGCCCAGGGCAGCGCCCTGGGTATACTAGCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTGTTTATTGTCTAATATTTGAAAGCTTTTGCCCTTACAGGGCGACAGGCTTGCGTCCATAAATACCCAGGGCGCTGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:6
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATGGCAATCAGCAAGCCGCCCTGTAAGGGCAAAAGCTTTGTTAATTGCCTGATATTTAAAAGCTTTTGCCCTTACAGGGCGACAGGGTTGCGTCCATAAAAACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:5
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTATGTATTGCCTGGAGTTTTAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGCCCGTAATAACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:10
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATGGCAATCAACAAGACGCCCTGTAAGGGCAAAAGCTTTAATATACCCGGCAATATACAAAGCTTTTGCCCTTACAGGGCGACAGGTTTTGCGTCCGTAATTACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:15
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATGGCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTGTATATTGCCAGGTATTTGAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGCCCATAAACACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>K:1
ACGGGCTGAAAGCCCAAAAGCTCCTAGCCCAGGGCATCGCCCTGGGTAATGTTGGTAAAAAGCAAATCGCCCTGAAAGGGCAAAAGCTTTCAAATTCAAAGCTTTTGCCCTTTCAGGGCGACAGGTTTACGTCTATAATAACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>L:5
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGCATCGCCCTGGGTAATATAGAGATTATGCAATGCGCCCTGTAAGGGCAAAAGCTTCGTCAAAATATTGGCGTTTTAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGTGTCCGTAATTACCCAGGGCGATGCCCTGGGCTAGGAGTTTCTGCCCTTTCAGGGCGT
>L:6
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGCAACGCCCTGGGTATAATTGCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTGTTAATTGCCTGGTATTTGAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGTCCGTAATTACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>L:7
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAGTCAGCAAGGCGCCCTGTAAGGGCAAAAGCCTTATATATTGCCCGGAGTTTTAAAGCTTTTGCCCTTACAGGGCGACAGGGTTGCGCCCGTAATAACCCAGGGCATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>L:8
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATGATGGCAATCGGTAAGGCGCCCTGTAAGGGCAAAAGCTTTGTAAATTGCCCAGAGCTTTTTAAAGCTTTTGCCCTTACAGGGCGATAGGCTAGCGTCCGAATTAACCCAGGGCGTTGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>L:9
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGCAGCGCCCTGGGTATAATAGCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCCTTGTAAATAGCCCGGTTCTTTAAAGCTTTTGCCCTTACAGGGCGACAGGGTTGCGCCCATAAGAACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>L:10
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATGGCAATCTGTAAGGCGCCCTGTAAGGGCAAAAGCTTTATATATTGCCCGGTATTTGGAAGCTTTTGCCCTTACAGGGCGACGGTTTGTGTCCGTTATTACCCAGGGCGACGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>H:5
ACGCCCTGAAAGGGCAGAAGCTCTTAGCCCAGGGCATCGCCCTGGGTATAGTAGCAGTCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTATAAATTGCCTCGTGTTTTGAAGCTTTTGCCCTTACAGGGCGACAGGCTTGCGTCCATGATTACCCAGGGTGCTGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>H:24+33
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAACAATCAGCAAAGCGCCCTGTAAGGGCAAAAGCTTTGTATATTGCTGGTATTTAAAAGCTTTTGCCCTTACAGGGCGACAGGCTTGCTTCCATCATTACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>H:8+29
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCAACGCCCTGGGTAAATACGCAGACACAACAACGCCCTGTAAGGGCAAAAGCTTCTCATATTCAAAGCTTTTGCCCTTACAGGGCGACTTCGTAGAGATGGGCTTAACCCAGAGCGTTGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>H:11+34
ACGCCCTGAAAGGGCAGAAGCTCTTAGCCCAGGGTAACACCCTGGGTATTATGGACATATACAATACCGCCCTGTAAGGGCAAAAGCTTTCTTATATTTTTAAAGCTTTTGCCCTTACAGGGCGGCATTGTTGATTGCGGCTTTTAACCCAGGGTGTTGCCCTGGGCTGGGAGCTTCTGCCCTTTCAGGGCGT
>J:12
ACGCCCTGAAAGGGCAGAAGCTCTTAGCCCAGGGCAGCGCCCTGGGTAATGTTGACAAAAAACAAAGTCGCCCTGAAAGGGCAAAAGCTTTGAAATAGAAGGCTTTTGCCCTTTCAGGGCGACTTTGTTGTATTTATTCTAAAACCCAGGGCGCTGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>H:1+3
ACGGGCTGAAAGCCCAGAAGCTCCTAGCCCAGGGCAACACCCTGGGTTTACAAGGTATATAGCAAAATCGCCCTGTAAGGGCAAAAGCTTCTCATACACAAAGCTTTTGCCCTTACAGGGCGACAAACTTGCGCTCATGATTACCCAGGGTGTTGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>H:6+12
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGTAACACCCTGGGTATAATAGCAATCAGCAAGGCGCCCTGAAAGGGCAAAAGCTTTGTATATTGCCTGGAATTTTAAAGCTTTTGCCCTTACAGGGCGACAGGCTTGCGGCCATGATAACCCAGGGTGTTACCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>H:17+26
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAGTCAACAAGGCGCCCTGTAAGGGCAAAAGCTTCGTATATTGCCTGGTACTTGAAAGCTTTTGCCCTTACAGGGCGACAAACTTGCGTCAATGATTACCCAGGGCGATGCCCTGGGCTAGGAGCTTTTGGGCCTTCAGCCCGT
>M:61+62
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAATCAACTAGTCGCCCTGAAAGGGCAAAAGCTTTGTTAATTGCCTGGTATTTGAAAGCTTTTGCCCTTACAGGGCGACAGGCTTGCGACCATAATTACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>M:57+58
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCAGCGCCCTGGGTATAATGTCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTGTATGTTACCCGGTATTTTAAAGCTTTTGCCCTTACAGGGCGAGAGGTTTGTGTTCGTGATTACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>M:52+53
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTCAAATACCTGGCAATATACAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGTCCATAAACACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>M:50+51
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTGTAATAGCAATCAGCAAGTCGCCCTGTAAGGGCAAAAGCTTTATTAATTCCGGTATTTTAAAGCTTTCGCCCTTACAGGGCGACAGGTTTGCGTCCGTAATTACCCAGGGCGTTGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>M:48+49
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTGTAAATTGCCTGGTACTTGAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGTCCGTAATAACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>M:46+47
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATGGCAATCAGCAAGTCGCCCTGTAAGGGCAAAAGCTTTCAAATACCGGTCAAGATATAAAGCTTTTGCCCTTACAGGGCGACGGGTTTGCGTTTATAATTACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>M:60
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAGCTAGCAAGGCGCCCTGTAAGGGCAAAAGCCTTATAAATTGCCTATTTTTTAAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGCCCGTAATAACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>M:56
ACGGGCTGAAAGCCCAAAAGCTCCTAGCCCAGGGCAACGCCCTGGGTATAATAGTAATCAGCAAGACGCCCTGTAAGGGCAAAAGCTTTATTATTGCTAAGTATTTTAAAGCTTTTGCCCTTACAGGGCGAGAGGTTTGCGTTCATGATTACCCAGGGTGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>M:55
ACGGGCTGAAAGCCCAAAAGCTCCTAGCCCAGGGCAACGCCCTGGGTATAATAGTAATCAGCAAGACGCCCTGTAAGGGCAAAAGCTTTATTATTGCTAAGTATTTTAAAGCTTTTGCCCTTACAGGGCGAGAGGTTTGCGTTCATGATTACCCAGGGTGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>N:3
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATAGCAATCAGCAAGACGCCCTGTAAGGGCAAAAGCTTTATGTATTATCTGTGAGTTTTTTAAAGCTTTTGCCCTTATAGGGCGACAGGTTTGCATCCGTAATAACCCAGGGCGATGCCCTGGGCTAGGAGCTTTTGGGCCTTCAGCCCGT
>N:6
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCAGCACCCTGGGTTTTAGAATAAACTCAGCAAAGTCGCCCTGAAAGGGCAAAAGCTTTGAAATTGAAAGCTTTTGCCCTTTCAGGGCGACCTTGTTTTTTGCCAACATTACCCAGGGCGCTGCCCTGGGCTAGGAGCTTTTGCCCTTTCAGGGCGT
>N:10
ACGCCCTGAAAGGGCAGAAGCTCCTAGCCCAGGGCAACGCCCTGGGTATAATAGCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTGTAAATTGCCTGGTACTTGAAAGCTTTTGCCCTTACAGGGCGACAGGTTTGCGTCCGTAATAACCCAGGGCGATGCCCTGGGCTAAGAGCTTCTGCCCTTTCAGGGCGT
>N:5
ACGGGCTGAAGGCCCAAAAGCTCCTAGCCCAGGGCATCGCCCTGGGTATAATGGCAATCAGCAAGGCGCCCTGTAAGGGCAAAAGCTTTGTATATTGCCAGGTATTTGAAAGTTTTTGCCCTTACAGGGCGACAGGTTTGCGCCCATAAACACCCAGGGCGATGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT
>N:7
ACGGGCTGAAGGCCCAAAAGCTCTTAGCCCAGGGCATCGCCCTGGGTATAATGGCAGCGCAACAATGCGCCCTGTAAGGGCAAAAGCTTTGTAAATTGCCTGGTACTTGAAAGCTTTTGCCCTTTCAGGGCGACAAACTTGCGTCCATGATTACCCAGGGCGTTGCCCTGGGCTAGGAGCTTCTGCCCTTTCAGGGCGT""".split('\n')

## Keep only the sequence lines, upper-cased: skip the '>' header lines and
## any blank line (indexing the first character of an empty line would raise IndexError).
stripped_lines = (line.strip() for line in inputt)
seqs = [line.upper() for line in stripped_lines if line and not line.startswith('>')]

## Get anchors and their reverse-complements for these sequences.
## get_anchors returns a tuple; element 0 holds the distinct anchors.
anchors = get_anchors(seqs)


In [18]:
## Put the distinct anchors (tiled k-mers plus reverse-complements) into a one-column table.
## The table is built from `seqs` directly rather than from the current value of
## `anchors`, so this cell can be re-run: once `anchors` is a DataFrame,
## `anchors[0]` would raise a KeyError.
anchors = pd.DataFrame({'anchor': get_anchors(seqs)[0]}).drop_duplicates().reset_index(drop=True)
print(anchors.shape)

(8320, 1)


In [8]:
## Get the list of downloaded R1 FASTQs. 

## Load accession lists: read every matching file exactly once and stack them.
## (Seeding from index 1 and then looping over [1:] would skip the file at
## index 0 and read the file at index 1 twice.)
accession_list = glob.glob('/oak/stanford/groups/horence/george/fastq_download/segatella_copri_02062024/*/accession_list_1.txt')
accessions = pd.concat([pd.read_csv(i, sep='\t', header=None) for i in accession_list])
accessions = accessions.rename(columns={0:'name'})

## From download locations of FASTQs, get their accession names. 
## The accession is the part of the file name before the first underscore.
downloads = glob.glob('/scratch/groups/horence/george/fqd_01292024/*')
r1_paths = [i for i in downloads if '1.fastq' in i]
d = pd.DataFrame({'path': r1_paths})
d['name'] = [i.split('/')[-1].split('_')[0] for i in d['path']]

## Consolidate the accession lists and downloaded FASTQs. 
## The merge joins on the columns the two tables share.
downloaded_fastqs = d.merge(accessions).drop_duplicates().reset_index(drop=True)
## NOTE(review): 218 is the hard-coded expected number of rows; confirm it still
## holds now that every accession list is read.
assert downloaded_fastqs.shape[0] == 218, downloaded_fastqs.shape


In [37]:

## Run compactors on the tiled anchors and the downloaded FASTQs (each FASTQ separately). 
compactor_command = '/oak/stanford/groups/horence/george/glm_anchors/bin/compactors_unobscured_single_sample_ne10.sh'

## Remember where we started so the working directory is always restored,
## even if writing a file or submitting a job raises part-way through.
launch_dir = os.getcwd()

for name in downloaded_fastqs['name'].unique():
    
    ## Make a directory for this FASTQ and move there.
    ## os.mkdir raises FileExistsError if the directory already exists, which
    ## also stops an accidental re-run from submitting the same job twice.
    run_dir = name+'_num_extended_10_02202024'
    os.mkdir(run_dir)
    os.chdir(run_dir)
    try:
        ## Write the FASTQ path as a samplesheet.
        downloaded_fastqs[downloaded_fastqs['name']==name][['path']].to_csv('samplesheet.txt',header=None,index=None,sep='\t')
        
        ## Write the anchor list and submit compactors.
        anchors.to_csv('anchor_list.tsv',sep='\t',header=None,index=None)
        status = os.system('sbatch '+compactor_command+' 1 samplesheet.txt')
        if status != 0:
            ## Report a failed submission instead of silently ignoring it.
            print('sbatch failed for '+name+' (status '+str(status)+')')
    finally:
        os.chdir(launch_dir)


Submitted batch job 41582407
Submitted batch job 41582408
Submitted batch job 41582409
Submitted batch job 41582410
Submitted batch job 41582411
Submitted batch job 41582412
Submitted batch job 41582414
Submitted batch job 41582415
Submitted batch job 41582416
Submitted batch job 41582417
Submitted batch job 41582419
Submitted batch job 41582420
Submitted batch job 41582422
Submitted batch job 41582423
Submitted batch job 41582425
Submitted batch job 41582426
Submitted batch job 41582427
Submitted batch job 41582428
Submitted batch job 41582429
Submitted batch job 41582430
Submitted batch job 41582431
Submitted batch job 41582432
Submitted batch job 41582433
Submitted batch job 41582434
Submitted batch job 41582435
Submitted batch job 41582436
Submitted batch job 41582437
Submitted batch job 41582438
Submitted batch job 41582439
Submitted batch job 41582440
Submitted batch job 41582442
Submitted batch job 41582443
Submitted batch job 41582445
Submitted batch job 41582446
Submitted batc

In [None]:
## Submit 'round 2' compactors. 
compactor_command = '/oak/stanford/groups/horence/george/glm_anchors/bin/compactors_unobscured_single_sample_ne10.sh'

## Get the list of compactor outputs. 
## The 'name' column is the directory that holds each output file.
generated_compactors = pd.DataFrame(glob.glob('*_num_extended_10_02202024/compactors_out_1.tsv'))
generated_compactors['name'] = [i.split('/')[0] for i in generated_compactors[0]]

## Remember where we started so the working directory is always restored,
## even if writing a file or submitting a job raises part-way through.
launch_dir = os.getcwd()

## For each FASTQ's compactor outputs: 
for name in generated_compactors['name'].unique():
    
    ## The part of the directory name before the first underscore.
    name2 = name.split('_')[0]
    
    ## Load the result of compactor generation and filter for read support. 
    compactor_result = pd.read_csv(list(generated_compactors[generated_compactors['name']==name][0])[0],sep='\t')
    compactor_result = compactor_result[compactor_result['exact_support']>0]
    
    ## Make a directory corresponding to the 2nd round of compactor generation for this FASTQ.
    ## The directory is created even when no compactor passes the filter.
    round2_dir = name2+'_num_extended_10_02202024_round2'
    os.mkdir(round2_dir)
    os.chdir(round2_dir)
    try:
        if compactor_result.shape[0] > 0: 
            
            ## Rewrite the appropriate samplesheet.
            downloaded_fastqs[downloaded_fastqs['name']==name2][['path']].to_csv('samplesheet.txt',header=None,index=None,sep='\t')
            
            ## Get the tiles from each compactor generated; use these tiles for compactor generation. 
            pd.DataFrame(get_anchors(compactor_result['compactor'].tolist())[0])[0].to_csv('anchor_list.tsv',sep='\t',index=None,header=None)
            status = os.system('sbatch '+compactor_command+' 1 samplesheet.txt')
            if status != 0:
                ## Report a failed submission instead of silently ignoring it.
                print('sbatch failed for '+name2+' (status '+str(status)+')')
    finally:
        os.chdir(launch_dir)


Submitted batch job 41584167
Submitted batch job 41584168
Submitted batch job 41584169
Submitted batch job 41584177
Submitted batch job 41584178
Submitted batch job 41584180
Submitted batch job 41584184
Submitted batch job 41584186
Submitted batch job 41584187
Submitted batch job 41584189
Submitted batch job 41584191
Submitted batch job 41584192
Submitted batch job 41584193
Submitted batch job 41584195
Submitted batch job 41584196
Submitted batch job 41584197
Submitted batch job 41584198
Submitted batch job 41584199
Submitted batch job 41584200
Submitted batch job 41584201
Submitted batch job 41584214
Submitted batch job 41584220
Submitted batch job 41584221
Submitted batch job 41584222
Submitted batch job 41584226
Submitted batch job 41584227
Submitted batch job 41584228
Submitted batch job 41584229
Submitted batch job 41584240
Submitted batch job 41584243


In [13]:
## Check on the first batch of 'round 2' submissions. 
## @ Aaron and Matt, this is an internal/personal-use code block where I'm checking that all jobs completed successfully. 
compactor_command = '/oak/stanford/groups/horence/george/glm_anchors/bin/compactors_unobscured_single_sample_ne10.sh'

## Round-1 outputs, each labelled with the directory that contains it.
round1_outputs = glob.glob('*_num_extended_10_02202024/compactors_out_1.tsv')
generated_compactors = pd.DataFrame(round1_outputs)
generated_compactors['name'] = [path.split('/')[0] for path in generated_compactors[0]]

## One entry per slurm* file found in a round-2 directory, reduced to the
## text before the first underscore; 150 entries are expected.
slurm_files = glob.glob('*_num_extended_10_02202024_round2/slurm*')
names_already_done = [path.partition('_')[0] for path in slurm_files]
assert len(names_already_done) == 150

In [34]:
## Load all resulting compactors. 
## The 'name' column is the directory that holds each output file.
generated_compactors = pd.DataFrame(glob.glob('*num_extended_10_02202024*/compactors_out_1.tsv'))
generated_compactors['name'] = [i.split('/')[0] for i in generated_compactors[0]]

## Read one output per directory name, tag its rows with that name, and
## concatenate everything in a single call (concatenating inside the loop
## re-copies the accumulated table on every iteration).
first_per_name = generated_compactors.drop_duplicates(subset='name')
per_dataset_results = []
for path, name in zip(first_per_name[0], first_per_name['name']):
    compactor_result_i = pd.read_csv(path, sep='\t')
    compactor_result_i['dataset'] = name
    per_dataset_results.append(compactor_result_i)

compactor_result = pd.concat(per_dataset_results)

In [35]:
# Prepare the combined compactor table (the final write to file is commented out below). 
## Reduce each dataset label to the text before its first underscore.
dataset_labels = compactor_result['dataset'].tolist()
compactor_result['dataset'] = [label.partition('_')[0] for label in dataset_labels]

## Keep only rows with read support. 
has_support = compactor_result['exact_support'] > 0
compactor_result = compactor_result.loc[has_support]

## Drop duplicates. This refers to compactors generated both on passes 1 and 2 in a single FASTQ. 
compactor_result = compactor_result.drop_duplicates().reset_index(drop=True)

## Count the distinct datasets in which each compactor appears, then attach the count to every row.
compactor_c = (
    compactor_result
    .groupby(['compactor'])['dataset']
    .nunique()
    .reset_index()
    .rename(columns={'dataset':'dataset_count'})
)
compactor_result = compactor_result.merge(compactor_c, how='left')
#compactor_result.to_csv('compactors_num_extended_10_with_dataset_count.tsv',sep='\t',index=None)

In [None]:
## This code block was spliced into this notebook. 
## For every 27mer (and its RC) it records the lowest 1-based coordinate at which 
## the 27mer or its RC starts in any compactor of a round 1 output file. 
## The dictionary has 2 levels of keys: dataset, then anchor. 
## Intended use afterwards: a single pass through the dataset, looking up 
## (dataset, anchor) to get the earliest coordinate of that anchor or its RC 
## in the 'round 1' compactor run for that FASTQ. 

pos_dict = dict()
for tsv_path in glob.glob('tile_compactors/*_num_extended_10/compactors_out_1.tsv'):
    ## The dataset key is the directory name up to its first underscore.
    dataset = tsv_path.split('/')[1].split('_')[0]
    table = pd.read_csv(tsv_path, sep='\t')
    earliest = pos_dict[dataset] = dict()
    for compactor in table['compactor'].unique():
        for offset in range(len(compactor) - 26):
            kmer = compactor[offset:offset + 27]
            position = offset + 1
            ## Skip if a strictly earlier position is already recorded.
            if kmer in earliest and earliest[kmer] < position:
                continue
            ## A 27mer and its RC are always written together with the same value.
            earliest[kmer] = position
            earliest[rc(kmer)] = position
                