In [1]:
import sys
import os
import shutil
import gzip
import csv
import multiprocessing

import tqdm.notebook as tqnb

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import numpy as np
import pandas as pd

import boda
from boda.common.utils import KmerFilter, dna2tensor

import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr

In [2]:
def pad_tensor_200(in_tensor, length=200):
    """Right-pad a 2-D tensor with zero columns up to ``length`` columns.

    Args:
        in_tensor: 2-D tensor of shape ``(rows, width)`` with ``width <= length``.
        length: Target number of columns. Defaults to 200, the width this
            notebook pads every sequence to.

    Returns:
        A new tensor of shape ``(rows, length)``: ``in_tensor`` followed by
        zero columns allocated on ``in_tensor``'s device.

    Raises:
        ValueError: If ``in_tensor`` is not 2-D or is wider than ``length``.
    """
    if in_tensor.dim() != 2:
        raise ValueError(
            f'expected a 2-D tensor, got {in_tensor.dim()} dimension(s)'
        )
    n_rows, width = in_tensor.shape
    if width > length:
        # Fail with a readable message instead of torch.zeros' negative-dimension error.
        raise ValueError(
            f'input has {width} columns, which exceeds the padded length {length}'
        )
    # The padding takes its row count from the input rather than assuming 4 rows.
    padding = torch.zeros((n_rows, length - width), device=in_tensor.device)
    return torch.cat([in_tensor, padding], dim=1)

In [3]:
# Read the headerless, tab-separated control-sequence file into two named columns.
seq_table = pd.read_table(
    'boda_round_1_controls_20211215.seq',
    sep='\t',
    header=None,
    names=['ID', 'sequence'],
)

# Encode every sequence with dna2tensor, pad it to 200 columns, and stack the
# results along a new leading dimension.
# NOTE(review): assumes dna2tensor returns a (4, len) tensor — confirm in boda.common.utils.
seq_tensor = torch.stack(
    [pad_tensor_200(dna2tensor(sequence)) for sequence in seq_table['sequence']],
    dim=0,
)

# Serve the stacked tensor in order, 1024 sequences per batch.
seq_loader = DataLoader(TensorDataset(seq_tensor), batch_size=1024)

In [4]:
# Build a KmerFilter with argument 7 and call .cuda() on it.
# NOTE(review): presumably a torch module that matches all 7-mers in encoded DNA —
# confirm in boda.common.utils.
# The bare .cuda() call is the cell's last expression, so its return value is
# what the cell displays.
kmer_7 = KmerFilter(7)
kmer_7.cuda()

KmerFilter()

In [5]:
# Run each batch through kmer_7 on the GPU, sum the result over its last
# dimension, and bring it back to the CPU; the per-batch results are then
# concatenated along dim 0.
# NOTE(review): presumably this yields one 7-mer count vector per sequence — confirm
# against KmerFilter's output layout.
kmer_7_content = torch.cat(
    [
        kmer_7(seqs.cuda()).sum(dim=-1).cpu()
        for (seqs,) in tqnb.tqdm(seq_loader)
    ],
    dim=0,
)

  0%|          | 0/118 [00:00<?, ?it/s]

In [6]:
# Keep the full content matrix on the GPU for the pairwise pass below.
kmer_7_content = kmer_7_content.cuda()
n_seqs = kmer_7_content.shape[0]

# Row i of the strict upper triangle has (n_seqs - 1 - i) entries; flat_idxer[i]
# is the offset in the flat result where row i starts.
section_size = torch.arange(n_seqs).flip(dims=[0])
flat_idxer = torch.cat(
    [
        torch.zeros(1, dtype=torch.long),
        section_size.cumsum(dim=0, dtype=torch.long),
    ]
)

# Flat buffer with one slot per pair (i, j > i), NaN-filled so unwritten slots
# are detectable.
# NOTE(review): despite the name, no cosine is computed; each entry is the sum over
# dim 1 of max(row_i - row_j, 0).
cos_dist = torch.full((torch.arange(n_seqs).sum(),), fill_value=np.nan)

with torch.no_grad():
    for i in tqnb.tqdm(range(n_seqs - 1)):
        start, stop = flat_idxer[i], flat_idxer[i + 1]
        # Deliberately one chained expression: the GPU temporaries are not bound
        # to names, so they can be freed as soon as the row has been copied out.
        cos_dist[start:stop] = (
            (kmer_7_content[i] - kmer_7_content[i + 1:])
            .clamp(min=0)
            .sum(dim=1)
            .cpu()
        )

  0%|          | 0/120056 [00:00<?, ?it/s]

In [7]:
# Persist the flat pairwise buffer under the 'condensed_distance' key.
torch.save(
    obj={'condensed_distance': cos_dist},
    f='kmer_7__condensed_edit_matrix.pt',
)

In [8]:
# Show the number of rows in the content matrix (its size along dim 0).
len(kmer_7_content)

120057

## Redo with shuffled sequences

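Rerun this section with a fresh kernel: it repeats the pipeline above on a dinucleotide-shuffled copy of the sequences and redefines the same variable names (`seq_table`, `kmer_7_content`, `cos_dist`, …).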

In [1]:
import sys
import os
import shutil
import gzip
import csv
import multiprocessing

import tqdm.notebook as tqnb

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import numpy as np
import pandas as pd

import boda
from boda.common.utils import KmerFilter, dna2tensor

import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr

In [2]:
# Convert the tab-separated .seq file to FASTA: every input row becomes a
# ">" + column-1 header line followed by a line holding column 2.
!awk 'BEGIN {OFS="\t";FS="\t"} {print ">"$1"\n"$2}' boda_round_1_controls_20211215.seq  > boda_round_1_controls_20211215.fa
# Write a shuffled copy of that FASTA to a separate file.
# NOTE(review): presumably the MEME Suite tool, with -t the tag appended to record
# names and -s the random seed — confirm against the tool's documentation.
!fasta-dinucleotide-shuffle -f boda_round_1_controls_20211215.fa -t _shuffle -s 19890207 > boda_round_1_controls_20211215.shuffle_1.fa

In [3]:
def pad_tensor_200(in_tensor, length=200):
    """Right-pad a 2-D tensor with zero columns up to ``length`` columns.

    Args:
        in_tensor: 2-D tensor of shape ``(rows, width)`` with ``width <= length``.
        length: Target number of columns. Defaults to 200, the width this
            notebook pads every sequence to.

    Returns:
        A new tensor of shape ``(rows, length)``: ``in_tensor`` followed by
        zero columns allocated on ``in_tensor``'s device.

    Raises:
        ValueError: If ``in_tensor`` is not 2-D or is wider than ``length``.
    """
    if in_tensor.dim() != 2:
        raise ValueError(
            f'expected a 2-D tensor, got {in_tensor.dim()} dimension(s)'
        )
    n_rows, width = in_tensor.shape
    if width > length:
        # Fail with a readable message instead of torch.zeros' negative-dimension error.
        raise ValueError(
            f'input has {width} columns, which exceeds the padded length {length}'
        )
    # The padding takes its row count from the input rather than assuming 4 rows.
    padding = torch.zeros((n_rows, length - width), device=in_tensor.device)
    return torch.cat([in_tensor, padding], dim=1)

In [4]:
# Parse the shuffled FASTA into parallel lists of record IDs and sequences.
# Records are keyed on ">" header lines rather than on line parity, so wrapped
# (multi-line) sequences and blank lines no longer misalign IDs and sequences.
fasta_ids, fasta_seqs = [], []

with open('boda_round_1_controls_20211215.shuffle_1.fa', 'r') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            continue  # tolerate blank lines, e.g. at the end of the file
        if line.startswith('>'):
            fasta_ids.append(line.lstrip('>'))
            fasta_seqs.append([])
        elif fasta_seqs:
            # Continuation of the current record's sequence.
            fasta_seqs[-1].append(line)
        else:
            raise ValueError('FASTA sequence data found before the first ">" header line')

# Build the table directly as a DataFrame; seq_table is never a plain dict.
seq_table = pd.DataFrame({
    'ID': fasta_ids,
    'sequence': [''.join(chunks) for chunks in fasta_seqs],
})

# Encode every sequence with dna2tensor, pad it to 200 columns, and stack the
# results along a new leading dimension.
# NOTE(review): assumes dna2tensor returns a (4, len) tensor — confirm in boda.common.utils.
seq_tensor = torch.stack(
    [pad_tensor_200(dna2tensor(sequence)) for sequence in seq_table['sequence']],
    dim=0,
)

# Serve the stacked tensor in order, 1024 sequences per batch.
seq_loader = DataLoader(TensorDataset(seq_tensor), batch_size=1024)

In [5]:
# Build a KmerFilter with argument 7 and call .cuda() on it.
# NOTE(review): presumably a torch module that matches all 7-mers in encoded DNA —
# confirm in boda.common.utils.
# The bare .cuda() call is the cell's last expression, so its return value is
# what the cell displays.
kmer_7 = KmerFilter(7)
kmer_7.cuda()

KmerFilter()

In [6]:
# Run each batch through kmer_7 on the GPU, sum the result over its last
# dimension, and bring it back to the CPU; the per-batch results are then
# concatenated along dim 0.
# NOTE(review): presumably this yields one 7-mer count vector per sequence — confirm
# against KmerFilter's output layout.
kmer_7_content = torch.cat(
    [
        kmer_7(seqs.cuda()).sum(dim=-1).cpu()
        for (seqs,) in tqnb.tqdm(seq_loader)
    ],
    dim=0,
)

  0%|          | 0/118 [00:00<?, ?it/s]

In [7]:
# Keep the full content matrix on the GPU for the pairwise pass below.
kmer_7_content = kmer_7_content.cuda()
n_seqs = kmer_7_content.shape[0]

# Row i of the strict upper triangle has (n_seqs - 1 - i) entries; flat_idxer[i]
# is the offset in the flat result where row i starts.
section_size = torch.arange(n_seqs).flip(dims=[0])
flat_idxer = torch.cat(
    [
        torch.zeros(1, dtype=torch.long),
        section_size.cumsum(dim=0, dtype=torch.long),
    ]
)

# Flat buffer with one slot per pair (i, j > i), NaN-filled so unwritten slots
# are detectable.
# NOTE(review): despite the name, no cosine is computed; each entry is the sum over
# dim 1 of max(row_i - row_j, 0).
cos_dist = torch.full((torch.arange(n_seqs).sum(),), fill_value=np.nan)

with torch.no_grad():
    for i in tqnb.tqdm(range(n_seqs - 1)):
        start, stop = flat_idxer[i], flat_idxer[i + 1]
        # Deliberately one chained expression: the GPU temporaries are not bound
        # to names, so they can be freed as soon as the row has been copied out.
        cos_dist[start:stop] = (
            (kmer_7_content[i] - kmer_7_content[i + 1:])
            .clamp(min=0)
            .sum(dim=1)
            .cpu()
        )

  0%|          | 0/120056 [00:00<?, ?it/s]

In [8]:
# Persist the flat pairwise buffer for the shuffled sequences under the
# 'condensed_distance' key.
torch.save(
    obj={'condensed_distance': cos_dist},
    f='kmer_7__condensed_edit_matrix.shuffle_1.pt',
)

In [9]:
# Show the number of rows in the content matrix (its size along dim 0).
len(kmer_7_content)

120057