In [1]:
!apt-get install bedtools

Reading package lists... Done
Building dependency tree       
Reading state information... Done
bedtools is already the newest version (2.27.1+dfsg-4ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [19]:
import sys
import os
import shutil
import gzip
import csv
import multiprocessing

import tqdm.notebook as tqnb

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import numpy as np
import pandas as pd

import boda
from boda.common.utils import KmerFilter, dna2tensor

import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr
from scipy.spatial.distance import hamming
from sklearn.neighbors import NearestNeighbors

In [3]:
# Fetch the hg38 ncbiRefSeqCurated table from the UCSC download server.
# -nc (no-clobber) skips the download when the file is already present.
!wget -nc https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/ncbiRefSeqCurated.txt.gz

File ‘ncbiRefSeqCurated.txt.gz’ already there; not retrieving.



In [4]:
# Column names assigned to the header-less ncbiRefSeqCurated dump, in file order.
refseq_columns = [
    'bin','name','chrom','strand','txStart','txEnd','cdsStart','cdsEnd',
    'exonCount','exonStart','exonEnds','score','name2','cdsStartStat','cdsEndStat','exonFrames'
]

genes = pd.read_table(
    'ncbiRefSeqCurated.txt.gz',
    sep='\t',
    header=None,
    index_col=False,
    names=refseq_columns,
)

# Keep transcripts whose accession contains 'NM_' and whose chromosome name
# has no underscore (presumably this drops alt/random/unplaced contigs — confirm),
# then keep only the first row seen for each `name2` value.
has_nm_accession = genes['name'].str.contains('NM_')
chrom_without_underscore = ~genes['chrom'].str.contains('_')

genes_subset = genes.loc[has_nm_accession & chrom_without_underscore].drop_duplicates('name2')

In [5]:
# Promoter window = the 200 bp next to the transcript start, in forward-strand
# coordinates:
#   '+' strand: [txStart - 200, txStart]
#   '-' strand: [txEnd, txEnd + 200]
#
# The coordinates are computed directly as integer arrays. Filling the columns
# through partial `.loc` assignments creates float columns, and writing them
# back with `df.loc[:, cols] = ....astype(int)` is done in place on
# pandas >= 2.0, which would leave values such as "11873.0" in the BED file.
strand_is_plus = genes_subset['strand'] == '+'
assert genes_subset['strand'].isin(['+', '-']).all(), "unexpected strand value in genes_subset"

promoter_start = np.where(strand_is_plus, genes_subset['txStart'] - 200, genes_subset['txEnd'])
promoter_end   = np.where(strand_is_plus, genes_subset['txStart'], genes_subset['txEnd'] + 200)

# `assign` returns a new frame, so this cell is idempotent and never writes
# into a slice of `genes`. Starts are clamped at 0 because a negative start is
# not a valid BED coordinate.
genes_subset = genes_subset.assign(
    promoterStart=np.clip(promoter_start, 0, None),
    promoterEnd=promoter_end,
)

# Four-column BED (chrom, start, end, name) consumed by `bedtools getfasta`.
genes_subset.loc[:, ['chrom', 'promoterStart', 'promoterEnd', 'name2']].to_csv('simple_tss.bed', sep='\t',index=False,header=False, quoting=csv.QUOTE_NONE)

In [14]:
!wget -nc https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/@@download/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz
!gunzip GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz -y


File ‘GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz’ already there; not retrieving.

gzip: invalid option -- 'y'
Try `gzip --help' for more information.


In [7]:
!bedtools getfasta -name -tab -fi GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta -bed simple_tss.bed > tss_seq.txt
tss_seq = pd.read_table('tss_seq.txt', sep='\t', header=None, names=['ID', 'sequence'])
tss_seq['ID'] = 'TSS::' + tss_seq['ID']
tss_seq.loc[ tss_seq['sequence'].str.contains('N'), 'sequence' ] = [ x.replace('N','A') for x in tss_seq.loc[ tss_seq['sequence'].str.contains('N'), 'sequence' ]]

In [8]:
def pad_tensor_200(in_tensor):
    """Right-pad a (4, L) tensor with zeros along dim 1 up to 200 columns.

    Args:
        in_tensor: 2-D tensor with 4 rows and at most 200 columns.

    Returns:
        Tensor of shape (4, 200): `in_tensor` followed by zero columns
        allocated on the same device as `in_tensor`.
    """
    n_missing = 200 - in_tensor.shape[1]
    zero_fill = torch.zeros((4, n_missing), device=in_tensor.device)
    return torch.cat((in_tensor, zero_fill), dim=1)

In [9]:
# Encode every TSS sequence with dna2tensor, zero-pad it to 200 columns and
# stack the results along a new leading axis.
tss_encoded = [pad_tensor_200(dna2tensor(sequence)) for sequence in tss_seq['sequence']]
tss_tensor = torch.stack(tss_encoded, dim=0)

# Serve the stacked tensor in batches of 1024, in the original order.
tss_dataset = TensorDataset(tss_tensor)
tss_loader = DataLoader(tss_dataset, batch_size=1024)

In [10]:
# Header-less two-column table: sequence ID and sequence.
seq_table = pd.read_table(
    'boda_round_1_controls_20211215.seq',
    sep='\t',
    header=None,
    names=['ID','sequence'],
)

# Encode every sequence with dna2tensor, zero-pad it to 200 columns and stack
# the results along a new leading axis.
seq_encoded = [pad_tensor_200(dna2tensor(sequence)) for sequence in seq_table['sequence']]
seq_tensor = torch.stack(seq_encoded, dim=0)

# Serve the stacked tensor in batches of 1024, in the original order.
seq_dataset = TensorDataset(seq_tensor)
seq_loader = DataLoader(seq_dataset, batch_size=1024)

In [None]:
# `pd.concat([])` raises "ValueError: No objects to concatenate".
# NOTE(review): combining the two (ID, sequence) tables built above is the
# presumed intent of this unfinished cell — confirm. Order matches the
# 'boda' / 'tss' order used when the k-mer content is saved.
all_seq = pd.concat([seq_table, tss_seq], ignore_index=True)

In [11]:
# Instantiate boda's KmerFilter with argument 7 (presumably the k-mer length —
# confirm against boda.common.utils) and move it to the GPU.
kmer_7 = KmerFilter(7)
kmer_7.cuda()

KmerFilter()

In [13]:
def _kmer_profile(loader):
    """Run `kmer_7` over every batch of `loader` and concatenate the results.

    Each batch is moved to the GPU, passed through `kmer_7`, summed over its
    last axis (presumably sequence position — confirm), and moved back to the
    CPU. The per-batch results are concatenated along dim 0.
    """
    per_batch = []
    for batch in tqnb.tqdm(loader):
        filtered = kmer_7(batch[0].cuda())
        per_batch.append(filtered.sum(dim=-1).cpu())
    return torch.cat(per_batch, dim=0)

kmer_7_content = _kmer_profile(seq_loader)
kmer_7_tss     = _kmer_profile(tss_loader)

  0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

In [23]:
torch.save(
    {
        'boda': {'ID': list(seq_table['ID']), 'kmers': kmer_7_content},
        'tss': {'ID': list(tss_seq['ID']), 'kmers': kmer_7_tss},
    }, 'kmer_7__content.pt'
)
!gsutil -m cp -n ./kmer_7__content.pt gs://syrgoth/data/


Copying file://./kmer_7__content.pt [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1/1 files][  8.5 GiB/  8.5 GiB] 100% Done 142.7 MiB/s ETA 00:00:00           
Operation completed over 1 objects/8.5 GiB.                                      


In [20]:
# Use scikit-learn's built-in 'hamming' metric instead of passing the Python
# function `scipy.spatial.distance.hamming`. A Python callable is invoked once
# per pair of rows, which is why the original call had to be interrupted.
# Both definitions give the fraction of positions at which two vectors differ.
kmer_7_array = kmer_7_content.numpy()

get_knn = NearestNeighbors(n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='hamming').fit(kmer_7_array)
# Querying with the training matrix itself: each row is expected to come back
# as its own nearest neighbour at distance 0.
distances, indices = get_knn.kneighbors(kmer_7_array)

KeyboardInterrupt: 

In [6]:
# Pairwise cosine distance between all rows of kmer_7_content, stored as a
# flat vector of the upper triangle (row i against every later row j > i).
kmer_7_content = kmer_7_content.cuda()
n_seqs = kmer_7_content.shape[0]

cos = nn.CosineSimilarity(dim=1, eps=1e-6).cuda()

# section_size[i] = number of rows after row i: n_seqs-1, n_seqs-2, ..., 0.
section_size = torch.arange(n_seqs - 1, -1, -1)
# flat_idxer[i]:flat_idxer[i+1] is the slice of the flat vector owned by row i.
flat_idxer = torch.cat([
    torch.zeros(1, dtype=torch.long),
    section_size.cumsum(dim=0, dtype=torch.long),
])

# One slot per unordered pair; NaN marks slots that have not been filled yet.
cos_dist = torch.full((n_seqs * (n_seqs - 1) // 2,), fill_value=np.nan)

with torch.no_grad():
    for i in tqnb.tqdm(range(n_seqs - 1)):
        n_later = n_seqs - i - 1
        anchor = kmer_7_content[i].unsqueeze(0).expand(n_later, -1)
        similarity = cos(anchor, kmer_7_content[i + 1:])
        cos_dist[flat_idxer[i]:flat_idxer[i + 1]] = 1 - similarity.cpu()

  0%|          | 0/120056 [00:00<?, ?it/s]

In [7]:
# Persist the flat cosine-distance vector under the key 'condensed_distance'.
torch.save({'condensed_distance': cos_dist}, 'kmer_7__condensed_distance_matrix.pt')

In [8]:
# Number of rows in kmer_7_content, shown as the cell output.
kmer_7_content.shape[0]

120057

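## Redo the analysis on dinucleotide-shuffled sequences

Run this section in a fresh kernel: it repeats the imports and redefines `seq_table`, `seq_loader` and `kmer_7_content` for the shuffled sequences.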

In [1]:
import sys
import os
import shutil
import gzip
import csv
import multiprocessing

import tqdm.notebook as tqnb

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import numpy as np
import pandas as pd

import boda
from boda.common.utils import KmerFilter, dna2tensor

import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr

In [2]:
# Convert the two-column (ID, sequence) table into FASTA: each row becomes
# a ">ID" line followed by the sequence on the next line.
!awk 'BEGIN {OFS="\t";FS="\t"} {print ">"$1"\n"$2}' boda_round_1_controls_20211215.seq  > boda_round_1_controls_20211215.fa
# Shuffle the sequences with fasta-dinucleotide-shuffle, passing "_shuffle" as
# the -t tag and a fixed -s seed so the output is reproducible.
# NOTE(review): presumably the MEME Suite utility, which preserves dinucleotide
# frequencies — confirm.
!fasta-dinucleotide-shuffle -f boda_round_1_controls_20211215.fa -t _shuffle -s 19890207 > boda_round_1_controls_20211215.shuffle_1.fa

In [3]:
def pad_tensor_200(in_tensor):
    """Append zero columns to a (4, L) tensor so that it has 200 columns.

    Args:
        in_tensor: 2-D tensor with 4 rows and at most 200 columns.

    Returns:
        Tensor of shape (4, 200) whose first L columns are `in_tensor` and
        whose remaining columns are zeros on `in_tensor`'s device.
    """
    pad_width = 200 - in_tensor.shape[1]
    padding = torch.zeros((4, pad_width), device=in_tensor.device)
    pieces = [in_tensor, padding]
    return torch.cat(pieces, dim=1)

In [4]:
# Parse the shuffled FASTA by record rather than by line parity. A line that
# starts with '>' opens a new record; every other non-blank line is appended
# to the current record. For a strict two-lines-per-record file this yields
# the same table as before, but wrapped sequences or blank lines no longer
# shift IDs and sequences out of step.
fasta_ids = []
fasta_chunks = []

with open('boda_round_1_controls_20211215.shuffle_1.fa', 'r') as f:
    for raw_line in f:
        line = raw_line.rstrip()
        if not line:
            continue
        if line.startswith('>'):
            fasta_ids.append(line.lstrip('>'))
            fasta_chunks.append([])
        elif fasta_chunks:
            fasta_chunks[-1].append(line)
        else:
            raise ValueError('FASTA file has sequence data before the first ">" header')

seq_table = pd.DataFrame({
    'ID': fasta_ids,
    'sequence': [''.join(chunks) for chunks in fasta_chunks],
})

# Encode every sequence with dna2tensor, zero-pad it to 200 columns and stack
# the results along a new leading axis. Iterating over the column avoids
# building a Series per row as `iterrows` does.
seq_tensor = torch.stack(
    [ pad_tensor_200(dna2tensor(sequence)) for sequence in seq_table['sequence'] ],
    dim=0
)

seq_loader = DataLoader(TensorDataset(seq_tensor), batch_size=1024)

In [5]:
# Instantiate boda's KmerFilter with argument 7 (presumably the k-mer length —
# confirm against boda.common.utils) and move it to the GPU.
kmer_7 = KmerFilter(7)
kmer_7.cuda()

KmerFilter()

In [6]:
# Run kmer_7 on the GPU batch by batch, sum each result over its last axis
# (presumably sequence position — confirm), and gather the results on the CPU.
batch_profiles = []
for batch in tqnb.tqdm(seq_loader):
    filtered = kmer_7(batch[0].cuda())
    batch_profiles.append(filtered.sum(dim=-1).cpu())

kmer_7_content = torch.cat(batch_profiles, dim=0)

  0%|          | 0/118 [00:00<?, ?it/s]

In [7]:
torch.save(
    {
        'shuffled': {'ID': list(seq_table['ID']), 'kmers': kmer_7_content},
    }, 'kmer_7__content__shuffled.pt'
)
!gsutil -m cp -n ./kmer_7__content__shuffled.pt gs://syrgoth/data/


Copying file://./kmer_7__content__shuffled.pt [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1/1 files][  7.3 GiB/  7.3 GiB] 100% Done 145.3 MiB/s ETA 00:00:00           
Operation completed over 1 objects/7.3 GiB.                                      


In [7]:
# Cosine distance between every pair of rows of kmer_7_content, laid out as a
# flat vector: first all pairs (0, j>0), then all pairs (1, j>1), and so on.
kmer_7_content = kmer_7_content.cuda()
total_rows = kmer_7_content.shape[0]

cos = nn.CosineSimilarity(dim=1, eps=1e-6)
cos.cuda()

# Row i contributes (total_rows - 1 - i) distances; the cumulative sums give
# the start offset of each row's slice in the flat vector.
section_size = torch.flip(torch.arange(total_rows), dims=[0])
leading_zero = torch.tensor([0], dtype=torch.long)
flat_idxer   = torch.cat([leading_zero, torch.cumsum(section_size, dim=0, dtype=torch.long)])

# Pre-fill with NaN so unfilled slots are recognisable.
n_pairs = int(section_size.sum())
cos_dist = torch.full((n_pairs,), fill_value=np.nan)

with torch.no_grad():
    for i in tqnb.tqdm(range(total_rows - 1)):
        later_rows = kmer_7_content[i + 1:]
        current_row = kmer_7_content[i][None, :].expand(later_rows.shape[0], -1)
        lo, hi = flat_idxer[i], flat_idxer[i + 1]
        cos_dist[lo:hi] = 1 - cos(current_row, later_rows).cpu()

  0%|          | 0/120056 [00:00<?, ?it/s]

In [8]:
# Persist the flat cosine-distance vector of the shuffled sequences under the
# key 'condensed_distance'.
torch.save({'condensed_distance': cos_dist}, 'kmer_7__condensed_distance_matrix.shuffle_1.pt')

In [9]:
# Number of rows in kmer_7_content, shown as the cell output.
kmer_7_content.shape[0]

120057