<a href="https://colab.research.google.com/github/simecek/PseudoDNA_Generator/blob/master/models/WholeGenomeButChr1_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:
!pip install fastai2>=0.0.11 ipywidgets matplotlib nbdev>=0.2.12 pandas scikit_learn sentencepiece biopython

In [2]:
# Mount to your Google Drive allowing lesson files will be saved to your Drive location
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
DNA_TOPLEVEL_FASTA_PATH = "/content/drive/My Drive/data/ensembl/Homo_sapiens.GRCh38.dna.toplevel.fa.gz"
DNA_TEXT_PATH = "/content/drive/My Drive/data/Homo_sapiens.GRCh38.dna.chrtext/"

In [5]:
import torch
torch.cuda.is_available(), torch.cuda.device_count(), torch.cuda.get_device_name(0)

(True, 1, 'Tesla P100-PCIE-16GB')

In [6]:
!wget https://raw.githubusercontent.com/simecek/PseudoDNA_Generator/master/models/genomic_tokenizer2.py 

--2020-06-19 22:31:32--  https://raw.githubusercontent.com/simecek/PseudoDNA_Generator/master/models/genomic_tokenizer2.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 565 [text/plain]
Saving to: ‘genomic_tokenizer2.py’


2020-06-19 22:31:33 (38.6 MB/s) - ‘genomic_tokenizer2.py’ saved [565/565]



In [7]:
from fastai2.text.all import *
from Bio import SeqIO
import pandas as pd
import gzip
from tqdm.notebook import tqdm
from genomic_tokenizer2 import tkn2
import os
import shutil
import math
import random

In [8]:
!ls "/content/drive/My Drive/data/Homo_sapiens.GRCh38.dna.chrtext/"

1   11	13  15	17  19	20  22	4  6  8  MT  Y
10  12	14  16	18  2	21  3	5  7  9  X


In [9]:
!rm -rf /content/data
!mkdir /content/data
!mkdir /content/data/train
!mkdir /content/data/valid

In [10]:
def get_chrom_sample(chrom, dest_dir, downscale_factor=20):
  chunks = os.listdir(DNA_TEXT_PATH + chrom)
  N_chunks = len(chunks)
  N_selected = math.ceil(N_chunks / downscale_factor)
  chunks_selected = random.sample(chunks, N_selected)
  os.mkdir(dest_dir + chrom + "/")
  for chunk in chunks_selected:
    shutil.copyfile(DNA_TEXT_PATH + chrom + "/" + chunk, dest_dir + chrom + "/" + chunk)
  print(chrom, N_selected) 

In [11]:
random.seed(42)

get_chrom_sample('1', '/content/data/valid/', downscale_factor=5)

1 498


In [12]:
TRAIN_CHRS = [str(i) for i in range(2,23)] + ['X', 'Y', 'MT']
for chrom in TRAIN_CHRS:
  get_chrom_sample(chrom, '/content/data/train/', 25)

2 97
3 80
4 77
5 73
6 69
7 64
8 59
9 56
10 54
11 55
12 54
13 40
14 37
15 34
16 37
17 34
18 33
19 24
20 26
21 17
22 17
X 63
Y 22
MT 1


In [13]:
!ls /content/data/train/Y/

Y_122.txt  Y_209.txt  Y_261.txt  Y_335.txt  Y_46.txt   Y_56.txt
Y_132.txt  Y_216.txt  Y_296.txt  Y_389.txt  Y_498.txt  Y_7.txt
Y_140.txt  Y_225.txt  Y_308.txt  Y_404.txt  Y_509.txt
Y_179.txt  Y_235.txt  Y_315.txt  Y_440.txt  Y_539.txt


## Data Loaders

In [14]:
dls_lm = TextDataLoaders.from_folder(Path("/content/data"), bs=2048, seed=42, 
                                   is_lm=True, 
                                   tok_tfm=tkn2, seq_len=50)

In [15]:
dls_lm.show_batch()

Unnamed: 0,text,text_
0,C A T A C T G T A C A T A A A A T A T C A A A C T A C C C A A A C T A T A T A T T A T A T A C G G T,A T A C T G T A C A T A A A A T A T C A A A C T A C C C A A A C T A T A T A T T A T A T A C G G T A
1,C T G C A A G T G G A T A T T T G G A T A G C T T T G A G G A T T T C G T T G G A A A C G G G T T A,T G C A A G T G G A T A T T T G G A T A G C T T T G A G G A T T T C G T T G G A A A C G G G T T A T
2,A T T T T C T T T A T C C A G T C T A T C A T T G A T G G G C A T T T G A G T T G A T A C C A T G T,T T T T C T T T A T C C A G T C T A T C A T T G A T G G G C A T T T G A G T T G A T A C C A T G T A
3,G T T T A T A A T A T A T T T G G T A G A A G T T A G C A G T G A A T C C A T C T A G T C C T G G G,T T T A T A A T A T A T T T G G T A G A A G T T A G C A G T G A A T C C A T C T A G T C C T G G G C
4,T G G C T T T C A C C G T G T T A G C C A G G A T G G T C T C A A A T C T C C T G A C C T C A T G A,G G C T T T C A C C G T G T T A G C C A G G A T G G T C T C A A A T C T C C T G A C C T C A T G A T
5,C C T A A T G T T A A T C T C C A A G A C C A T G G G G A A A A T G T C T C C A G G G C A T G T C A,C T A A T G T T A A T C T C C A A G A C C A T G G G G A A A A T G T C T C C A G G G C A T G T C A G
6,C A G A G T G A G A C T C T G T C T C A G A A A A A A A C A A A C A A A C A A A C A A A C A A A T A,A G A G T G A G A C T C T G T C T C A G A A A A A A A C A A A C A A A C A A A C A A A C A A A T A A
7,A A T G C C A T G C A G T C T A A A C A C A T C T G C G C A G A A C A C A G C C C A G G C C A T G C,A T G C C A T G C A G T C T A A A C A C A T C T G C G C A G A A C A C A G C C C A G G C C A T G C A
8,C T G A T A A C T T T G G A G A T G G T G A C A T G A G A A T A G A G G A A A A A A A C T T T C A G,T G A T A A C T T T G G A G A T G G T G A C A T G A G A A T A G A G G A A A A A A A C T T T C A G G


## Model and Learning

In [16]:
learn = language_model_learner(
    dls_lm, AWD_LSTM, drop_mult=0.3, pretrained=False, 
    metrics=[accuracy, Perplexity()])

In [17]:
learn.fit_one_cycle(2, 2e-2)
learn.export('/content/drive/My Drive/DNAModels/Human_v0/all_but_chr1_v2.pkl')
learn.save('/content/drive/My Drive/DNAModels/Human_v0/all_but_chr1_v2')
learn.summary()

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,1.475801,1.27889,0.341241,3.592649,5:17:22
1,1.423227,1.241679,0.37535,3.461422,5:05:33


SequentialRNN (Input shape: ['2048 x 50'])
Layer (type)         Output Shape         Param #    Trainable 
RNNDropout           2048 x 50 x 400      0          False     
________________________________________________________________
RNNDropout           2048 x 50 x 1152     0          False     
________________________________________________________________
RNNDropout           2048 x 50 x 1152     0          False     
________________________________________________________________
Linear               2048 x 50 x 8        3,208      True      
________________________________________________________________
RNNDropout           2048 x 50 x 400      0          False     
________________________________________________________________

Total params: 3,208
Total trainable params: 3,208
Total non-trainable params: 0

Optimizer used: <function Adam at 0x7f36904e2620>
Loss function: FlattenedLoss of CrossEntropyLoss()

Model unfrozen

Callbacks:
  - TrainEvalCallback
  - Recorder
  

## Testing the model

In [18]:
learn = load_learner('/content/drive/My Drive/DNAModels/Human_v0/all_but_chr1_v2.pkl')

In [19]:
TEXT = "ACTGCCTGCCTGC"
N_WORDS = 40
N_SENTENCES = 2
preds = [learn.predict(TEXT, N_WORDS, temperature=0.75)[::2] 
         for _ in range(N_SENTENCES)]
preds

['ACTGCCTGCCTGCCATTGAGTTGTTTGCATTGAGTGGTGTATTATATTCCTGA',
 'ACTGCCTGCCTGCTGTTGGGGCCCCCCTACCTTTTAAATCTTGGCCTCTTAAT']