[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mitiau/DNABERT-Z/blob/main/ZDNA-prediction.ipynb)

# ZDNA


## Install dependencies and define helper functions

In [1]:
!pip install transformers
!pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [2]:
import torch
from torch import nn
import transformers
from transformers import BertTokenizer, BertForTokenClassification
import numpy as np
from Bio import SeqIO
from io import StringIO, BytesIO
from google.colab import drive, files
from tqdm import tqdm
import pickle
import scipy
from scipy import ndimage

In [3]:
def seq2kmer(seq, k):
    """Return every overlapping window of length ``k`` taken from ``seq``.

    Windows are produced left to right with a stride of one item by slicing
    ``seq``. The result holds ``len(seq) - k + 1`` windows and is empty when
    ``seq`` is shorter than ``k``.
    """
    window_count = len(seq) + 1 - k
    windows = []
    for start in range(window_count):
        windows.append(seq[start:start + k])
    return windows

def split_seq(seq, length = 512, pad = 16):
    """Cut ``seq`` into overlapping windows of at most ``length`` items.

    Consecutive windows start ``length - pad`` items apart, so a full-size
    window shares its final ``pad`` items with the start of the next one.
    Windows that reach the end of ``seq`` are shorter.

    Args:
        seq: Any sliceable object with a length (e.g. a list of k-mers).
        length: Maximum number of items per window.
        pad: Number of items shared by neighbouring windows.

    Returns:
        The windows in order, as slices of ``seq``; an empty list when
        ``seq`` is empty.

    Raises:
        ValueError: If ``length`` is not greater than ``pad``, because the
            window start could then never advance.
    """
    step = length - pad
    if step <= 0:
        raise ValueError('length must be greater than pad')
    # The window end follows ``length`` (it used to be fixed at 512, which
    # ignored the parameter). Slicing clamps at the end of ``seq`` by itself.
    return [seq[st:st + length] for st in range(0, len(seq), step)]

def stitch_np_seq(np_seqs, pad = 16):
    """Join overlapping 1-D arrays back into a single array.

    Before each piece is appended, the last ``pad`` items collected so far
    are dropped, so values in an overlap come from the later piece.

    Args:
        np_seqs: Iterable of 1-D arrays, in order.
        pad: Number of trailing items to discard before appending a piece.

    Returns:
        The stitched 1-D array; an empty array when ``np_seqs`` is empty.
    """
    res = np.array([])
    for seq in np_seqs:
        # Use an explicit stop index: ``res[:-pad]`` would discard everything
        # when pad == 0 instead of discarding nothing.
        keep = max(len(res) - pad, 0)
        res = np.concatenate([res[:keep], seq])
    return res

## Select model and parameters

In [13]:
# Colab form fields; the trailing ``#@param`` annotations define the widgets.
# Label of the checkpoint to use; the next cell maps it to a Google Drive file id.
model = 'HG kouzine' #@param ["HG chipseq", "HG kouzine", "MM chipseq", "MM kouzine"]
# Positions whose class-1 softmax score is above this value are grouped into
# candidate regions by the prediction cell.
model_confidence_threshold = 0.25 #@param {type:"number"}
# A region is reported only when it spans strictly more positions than this.
minimum_sequence_length = 6 #@param {type:"integer"}

In [14]:
# Map the checkpoint label chosen in the form to its Google Drive file id.
# The form offers 'MM chipseq' while this cell previously tested 'MM curax',
# so that choice left ``model_id`` unset. Both labels are accepted here.
# NOTE(review): presumably both names refer to the same checkpoint - confirm.
_MODEL_IDS = {
    'HG chipseq': '1VAsp8I904y_J0PUhAQqpSlCn1IqfG0FB',
    'HG kouzine': '1dAeAt5Gu2cadwDhbc7OnenUgDLHlUvkx',
    'MM chipseq': '1W6GEgHNoitlB-xXJbLJ_jDW4BF35W1Sd',
    'MM curax': '1W6GEgHNoitlB-xXJbLJ_jDW4BF35W1Sd',
    'MM kouzine': '1dXpQFmheClKXIEoqcZ7kgCwx6hzVCv3H',
}

# ``model`` is rebound to the loaded network by a later cell, so re-running
# this cell afterwards would otherwise silently keep a stale ``model_id``.
if not isinstance(model, str):
    raise ValueError('`model` no longer holds a checkpoint label; '
                     're-run the parameter cell first')
if model not in _MODEL_IDS:
    raise ValueError('Unknown model %r; expected one of %s'
                     % (model, sorted(_MODEL_IDS)))
model_id = _MODEL_IDS[model]


In [15]:
!gdown $model_id
!gdown 10sF8Ywktd96HqAL0CwvlZZUUGj05CGk5
!gdown 16bT7HDv71aRwyh3gBUbKwign1mtyLD2d
!gdown 1EE9goZ2JRSD8UTx501q71lGCk-CK3kqG
!gdown 1gZZdtAoDnDiLQqjQfGyuwt268Pe5sXW0


!mkdir 6-new-12w-0
!mv pytorch_model.bin 6-new-12w-0/
!mv config.json 6-new-12w-0/
!mv special_tokens_map.json 6-new-12w-0/
!mv tokenizer_config.json 6-new-12w-0/
!mv vocab.txt 6-new-12w-0/

Downloading...
From (original): https://drive.google.com/uc?id=1dAeAt5Gu2cadwDhbc7OnenUgDLHlUvkx
From (redirected): https://drive.google.com/uc?id=1dAeAt5Gu2cadwDhbc7OnenUgDLHlUvkx&confirm=t&uuid=233c08aa-5464-45dd-a93f-ed7370cccbb2
To: /content/pytorch_model.bin
100% 354M/354M [00:01<00:00, 188MB/s]
Downloading...
From: https://drive.google.com/uc?id=10sF8Ywktd96HqAL0CwvlZZUUGj05CGk5
To: /content/config.json
100% 634/634 [00:00<00:00, 2.83MB/s]
Downloading...
From: https://drive.google.com/uc?id=16bT7HDv71aRwyh3gBUbKwign1mtyLD2d
To: /content/special_tokens_map.json
100% 112/112 [00:00<00:00, 656kB/s]
Downloading...
From: https://drive.google.com/uc?id=1EE9goZ2JRSD8UTx501q71lGCk-CK3kqG
To: /content/tokenizer_config.json
100% 40.0/40.0 [00:00<00:00, 257kB/s]
Downloading...
From: https://drive.google.com/uc?id=1gZZdtAoDnDiLQqjQfGyuwt268Pe5sXW0
To: /content/vocab.txt
100% 28.7k/28.7k [00:00<00:00, 83.0MB/s]
mkdir: cannot create directory ‘6-new-12w-0’: File exists


In [16]:
# Load the tokenizer and the token-classification network from the directory
# filled by the download cell. Note that ``model`` is rebound here from the
# checkpoint label (a string) to the loaded network.
checkpoint_dir = '6-new-12w-0/'
tokenizer = BertTokenizer.from_pretrained(checkpoint_dir)
model = BertForTokenClassification.from_pretrained(checkpoint_dir)
# Move the weights to the GPU; as the last expression this also shows the model.
model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4101, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

## Upload fasta files for prediction

Download the genome of the organism to analyse.

In [None]:
# Fetch the gzip-compressed genomic FASTA of NCBI assembly GCF_900002375.2
# and save it as genome.fna.gz in the working directory.
!wget -O 'genome.fna.gz' https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/002/375/GCF_900002375.2_GCA_900002375/GCF_900002375.2_GCA_900002375_genomic.fna.gz

--2024-06-15 11:35:02--  https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/900/002/375/GCF_900002375.2_GCA_900002375/GCF_900002375.2_GCA_900002375_genomic.fna.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.11, 130.14.250.12, 2607:f220:41e:250::7, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.11|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5769816 (5.5M) [application/x-gzip]
Saving to: ‘genome.fna.gz’


2024-06-15 11:35:04 (6.83 MB/s) - ‘genome.fna.gz’ saved [5769816/5769816]



In [None]:
!gzip -rd 'genome.fna.gz'

In [8]:
# Ask the user for FASTA files; ``uploaded`` maps each file name to its content.
uploaded = files.upload()
for filename, payload in uploaded.items():
    # Report what was received, one line per file.
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=filename, length=len(payload)))

Saving genome.fna to genome.fna
User uploaded file "genome.fna" with length 19013441 bytes


## Predict and save raw outputs

In [17]:
# Score every record of every uploaded FASTA file, report the regions whose
# score stays above the threshold, and save the raw per-position scores.
out = []  # lines that end up in text_predictions.txt
for key in uploaded.keys():
    print(key)
    out.append(key)
    result_dict = {}  # record name -> stitched score array for this file
    fasta_text = uploaded[key].decode('UTF-8')
    for seq_record in SeqIO.parse(StringIO(fasta_text), 'fasta'):
        # Turn the upper-cased sequence into overlapping 6-mers, then cut the
        # 6-mer list into overlapping windows for the model.
        kmer_seq = seq2kmer(str(seq_record.seq).upper(), 6)
        seq_pieces = split_seq(kmer_seq)
        print(seq_record.name)
        out.append(seq_record.name)
        preds = []
        with torch.no_grad():
            for seq_piece in tqdm(seq_pieces):
                input_ids = torch.LongTensor(tokenizer.encode(' '.join(seq_piece), add_special_tokens=False))
                # Softmax over the last axis of the model's final output;
                # keep column 1 of the single batch item.
                outputs = torch.softmax(model(input_ids.cuda().unsqueeze(0))[-1],axis = -1)[0,:,1]
                preds.append(outputs.cpu().numpy())
        result_dict[seq_record.name] = stitch_np_seq(preds)

        # Label runs of consecutive positions scoring above the threshold.
        labeled, max_label = scipy.ndimage.label(result_dict[seq_record.name]>model_confidence_threshold)
        header = '  start     end'
        print(header)
        out.append(header)
        for label in range(1, max_label+1):
            # Indices (0-based) of this run within the stitched score array.
            candidate = np.where(labeled == label)[0]
            candidate_length = candidate.shape[0]
            if candidate_length>minimum_sequence_length:
                # Build the line once so the file gets exactly what is
                # printed; the saved line used to lack the separating space,
                # fusing start and end for coordinates of 8 or more digits.
                line = '{:8} {:8}'.format(candidate[0], candidate[-1])
                print(line)
                out.append(line)

    # Raw scores for this file, keyed by record name.
    with open(key + '.preds.pkl',"wb") as fh:
        pickle.dump(result_dict, fh)
    print()

with open('text_predictions.txt',"w") as fh:
    for item in out:
        fh.write("%s\n" % item)


genome.fna
NC_036159.2


100%|██████████| 1040/1040 [00:39<00:00, 26.26it/s]


  start     end
NC_036160.2


100%|██████████| 1256/1256 [00:47<00:00, 26.69it/s]


  start     end
NC_036161.2


100%|██████████| 1330/1330 [00:51<00:00, 25.67it/s]


  start     end
NC_036162.2


100%|██████████| 1437/1437 [00:53<00:00, 26.74it/s]


  start     end
NC_036163.2


100%|██████████| 1878/1878 [01:10<00:00, 26.57it/s]


  start     end
  769829   769843
NC_036164.2


100%|██████████| 1985/1985 [01:15<00:00, 26.46it/s]


  start     end
  222856   222863
NC_036165.2


100%|██████████| 1707/1707 [01:03<00:00, 26.67it/s]


  start     end
NC_036166.2


100%|██████████| 2864/2864 [01:47<00:00, 26.59it/s]


  start     end
NC_036167.2


100%|██████████| 3294/3294 [02:03<00:00, 26.65it/s]


  start     end
NC_036168.2


100%|██████████| 3307/3307 [02:04<00:00, 26.56it/s]


  start     end
NC_036169.2


100%|██████████| 3554/3554 [02:15<00:00, 26.22it/s]


  start     end
 1711674  1711680
NC_036170.2


100%|██████████| 3644/3644 [02:17<00:00, 26.52it/s]


  start     end
  112772   112779
  112781   112787
  558741   558749
 1444226  1444245
 1637682  1637700
NC_036171.2


100%|██████████| 5085/5085 [03:13<00:00, 26.26it/s]


  start     end
  193374   193380
 1393218  1393231
 1718984  1718993
NC_036172.2


100%|██████████| 5141/5141 [03:16<00:00, 26.16it/s]


  start     end
NW_023259977.1


100%|██████████| 115/115 [00:04<00:00, 26.47it/s]


  start     end
NW_023259978.1


100%|██████████| 70/70 [00:02<00:00, 27.36it/s]


  start     end
NW_023259979.1


100%|██████████| 50/50 [00:01<00:00, 26.63it/s]


  start     end
NW_023259980.1


100%|██████████| 21/21 [00:00<00:00, 26.14it/s]


  start     end
NW_023259981.1


100%|██████████| 14/14 [00:00<00:00, 27.09it/s]


  start     end
NC_015303.1


100%|██████████| 12/12 [00:00<00:00, 23.64it/s]


  start     end
NC_030892.1


100%|██████████| 63/63 [00:02<00:00, 23.69it/s]


  start     end



In [18]:
# Send the text summary written by the prediction cell to the user's browser.
files.download('text_predictions.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
# Download the pickled raw scores saved for each uploaded FASTA file.
for fasta_name in uploaded:
    files.download(fasta_name + '.preds.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Zhunt

In [21]:
# NOTE(review): zhunt3-alan.c must already be in the working directory; no
# cell in this notebook fetches it. The executable bit set here is not
# required for compiling the source in the next cell.
!chmod a+x zhunt3-alan.c

In [22]:
# Build the zhunt3 binary; -lm links the math library. The recorded output
# shows a compiler warning about gets() in user_regret.
!gcc zhunt3-alan.c -lm -o zhunt3

[01m[Kzhunt3-alan.c:[m[K In function ‘[01m[Kuser_regret[m[K’:
  303 |       [01;35m[Kgets[m[K( tempstr );
      |       [01;35m[K^~~~[m[K
      |       [32m[Kfgets[m[K
/usr/bin/ld: /tmp/ccwBs0uf.o: in function `user_regret':


In [25]:
# Run Z-Hunt on the genome. As echoed by the program, the arguments are the
# dinucleotide count (12), the min/max pair (8 12) and the input file.
# The result is written to genome.fna.Z-SCORE; the recorded run took 4577 sec.
!./zhunt3 12 8 12 genome.fna

dinucleotides 12
min/max 8 12
min/max 8 12
operating on genome.fna
calculating zscore
opening genome.fna
inputting sequence
opening genome.fna.Z-SCORE

 run time=4577 sec
use min/max 8 12
analyzing_zscore
opening genome.fna.Z-SCORE
opening genome.fna
inputting sequence


In [26]:
# Send the Z-Hunt result file to the user's browser.
files.download('genome.fna.Z-SCORE')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
# Mount Google Drive at /content/drive. ``drive`` is already imported in the
# imports cell; the import is repeated so this cell also runs on its own.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Quadruplexes (Квадруплексы)

In [3]:
import re
from Bio import SeqIO

# Motif searched on the given strand: at least four runs of three or more G
# separated by 1-7 bases; the C version is the same motif with C runs.
pattern="(?:G{3,}[ATGC]{1,7}){3,}G{3,}"
pattern_minus = "(?:C{3,}[ATGC]{1,7}){3,}C{3,}"
strand_regexes = [re.compile(motif, re.IGNORECASE) for motif in (pattern, pattern_minus)]

# One entry per match: [record id, start, end, matched text]. For each record
# the G-run matches come first, followed by the C-run matches.
mas = []
for record in SeqIO.parse("genome.fna",'fasta'):
  sequence = str(record.seq)
  for regex in strand_regexes:
    mas.extend(
      [record.id, hit.start(), hit.end(), hit.group(0)]
      for hit in regex.finditer(sequence)
    )

# Write the matches as three tab-separated columns: id, start, end.
with open("pqs.bed", "w") as f:
  f.writelines(f"{chrom}\t{start}\t{end}\n" for chrom, start, end, _ in mas)