<a href="https://colab.research.google.com/github/simecek/PseudoDNA_Generator/blob/master/colabs/models/WholeGenomeButChr1_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:
!pip install fastai2>=0.0.11 ipywidgets matplotlib nbdev>=0.2.12 pandas scikit_learn sentencepiece biopython

In [2]:
# Mount Google Drive at /content/drive so that data and models can be read from
# and saved to your Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd 'drive/My Drive/course-v4/nbs'
DNA_TOPLEVEL_FASTA_PATH = "/content/drive/My Drive/data/ensembl/Homo_sapiens.GRCh38.dna.toplevel.fa.gz"
DNA_TEXT_PATH = "/content/drive/My Drive/data/Homo_sapiens.GRCh38.dna.chrtext/"

/content/drive/My Drive/course-v4/nbs


In [27]:
# Import torch explicitly so this cell does not depend on the later
# `from fastai2.text.all import *` cell having been executed first.
import torch

(
    torch.cuda.is_available(),
    torch.cuda.device_count(),
    # get_device_name(0) raises when no CUDA device is present; show None instead.
    torch.cuda.get_device_name(0) if torch.cuda.is_available() else None,
)

(True, 1, 'Tesla P100-PCIE-16GB')

In [4]:
from fastai2.text.all import *
from Bio import SeqIO
import pandas as pd
import gzip
from tqdm.notebook import tqdm
from genomic_tokenizer import tkn2
import os
import shutil
import math
import random

In [5]:
!ls "/content/drive/My Drive/data/Homo_sapiens.GRCh38.dna.chrtext/"

1   11	13  15	17  19	20  22	4  6  8  MT  Y
10  12	14  16	18  2	21  3	5  7  9  X


In [16]:
!rm -rf /content/data
!mkdir /content/data
!mkdir /content/data/train
!mkdir /content/data/valid

In [7]:
def get_chrom_sample(chrom, dest_dir, downscale_factor=20):
  """Copy a random subset of one chromosome's chunk files into `dest_dir`.

  About one chunk in `downscale_factor` (rounded up) is drawn without
  replacement from `DNA_TEXT_PATH/<chrom>/` using the global `random` state and
  copied to `dest_dir/<chrom>/`. The chromosome name and the number of selected
  chunks are printed.

  Args:
    chrom: name of the chromosome sub-folder under `DNA_TEXT_PATH` (e.g. '1', 'X').
    dest_dir: directory under which the `<chrom>` folder is created
      (a trailing slash is accepted but not required).
    downscale_factor: positive number; roughly 1/downscale_factor of the
      chunks are kept.

  Raises:
    ValueError: if `downscale_factor` is not positive.

  Note:
    The destination folder is created if needed and reused if it already
    exists; files already present in it are left in place.
  """
  if downscale_factor <= 0:
    raise ValueError("downscale_factor must be positive, got %r" % (downscale_factor,))

  src_dir = os.path.join(DNA_TEXT_PATH, chrom)
  # os.listdir returns entries in arbitrary order; sorting makes the selection
  # reproducible for a given random.seed().
  chunks = sorted(os.listdir(src_dir))
  # Clamp so that a factor below 1 cannot request more chunks than exist.
  N_selected = min(len(chunks), math.ceil(len(chunks) / downscale_factor))
  chunks_selected = random.sample(chunks, N_selected)

  out_dir = os.path.join(dest_dir, chrom)
  # makedirs + exist_ok: tolerate a missing parent and a re-run of the cell.
  os.makedirs(out_dir, exist_ok=True)
  for chunk in chunks_selected:
    shutil.copyfile(os.path.join(src_dir, chunk), os.path.join(out_dir, chunk))
  print(chrom, N_selected)

In [17]:
# Validation set: one chunk in five from chromosome 1, sampled with a fixed seed.
random.seed(42)
get_chrom_sample(chrom='1', dest_dir='/content/data/valid/', downscale_factor=5)

1 498


In [18]:
# Training set: one chunk in 25 from every chromosome except 1, which the
# notebook samples into the validation folder.
TRAIN_CHRS = list(map(str, range(2, 23))) + ['X', 'Y', 'MT']
for chrom in TRAIN_CHRS:
  get_chrom_sample(chrom, dest_dir='/content/data/train/', downscale_factor=25)

2 97
3 80
4 77
5 73
6 69
7 64
8 59
9 56
10 54
11 55
12 54
13 40
14 37
15 34
16 37
17 34
18 33
19 24
20 26
21 17
22 17
X 63
Y 22
MT 1


In [23]:
# Spot-check which chunk files were sampled for chromosome Y.
!ls /content/data/train/Y/

Y_122.txt  Y_209.txt  Y_261.txt  Y_335.txt  Y_46.txt   Y_56.txt
Y_132.txt  Y_216.txt  Y_296.txt  Y_389.txt  Y_498.txt  Y_7.txt
Y_140.txt  Y_225.txt  Y_308.txt  Y_404.txt  Y_509.txt
Y_179.txt  Y_235.txt  Y_315.txt  Y_440.txt  Y_539.txt


## Data Loaders

In [28]:
# Language-model data loaders built from the sampled chunks under /content/data,
# tokenized with the custom genomic tokenizer `tkn2`.
dls_lm = TextDataLoaders.from_folder(
    Path("/content/data"),
    is_lm=True,
    tok_tfm=tkn2,
    seq_len=50,
    bs=2048,
    seed=42,
)

In [24]:
# Preview a few (input, target) pairs; in the recorded output the target column
# is the input shifted by one token.
dls_lm.show_batch()

Unnamed: 0,text,text_
0,xxbos T A T C C A A A G C A A T C T A C A G A T T C A A T G T A A C C C C T A T C A A A A T A C T A A T,T A T C C A A A G C A A T C T A C A G A T T C A A T G T A A C C C C T A T C A A A A T A C T A A T G
1,T T C T A T T C T T T A C T T T T A T A C T C A C T C T T A T T C T C A T T C T C A C G C C A C C C,T C T A T T C T T T A C T T T T A T A C T C A C T C T T A T T C T C A T T C T C A C G C C A C C C T
2,T G T A A T C C C A G C A C T T T G C A A A G G A G G C T G A G G T G G G C G G A T C A C C T G A G,G T A A T C C C A G C A C T T T G C A A A G G A G G C T G A G G T G G G C G G A T C A C C T G A G G
3,A C T T T G A C A T C C A T G T G A T T T A A T G C T A A G A T C T G T C C A T A A T A G A G T G A,C T T T G A C A T C C A T G T G A T T T A A T G C T A A G A T C T G T C C A T A A T A G A G T G A T
4,T C T G A A T T A C A C A T A C A G T T T G T G G C A T C T C T T T T T T G A A G C C T T A G A T T,C T G A A T T A C A C A T A C A G T T T G T G G C A T C T C T T T T T T G A A G C C T T A G A T T T
5,T A A G T G A A A G A A G C C A G A C A C A G A A G C C C A C A T A T T G T A T G A T T T C A T T T,A A G T G A A A G A A G C C A G A C A C A G A A G C C C A C A T A T T G T A T G A T T T C A T T T A
6,T A A A A A T A G G A T T T T G A G T T C T A G G C C T C C A C A G A A T C C A T T T G A A T A T G,A A A A A T A G G A T T T T G A G T T C T A G G C C T C C A C A G A A T C C A T T T G A A T A T G T
7,A A A C T C C C A G G C T C A A A T G A T C C T C T A G C C T C A C C C T T C T G A G T A G C T G G,A A C T C C C A G G C T C A A A T G A T C C T C T A G C C T C A C C C T T C T G A G T A G C T G G G
8,G T G A T G T T T G C A T T C A A C T C A C A G A G T T G A A C C T T G C T T T C A T A G T T C A G,T G A T G T T T G C A T T C A A C T C A C A G A G T T G A A C C T T G C T T T C A T A G T T C A G C


## Model and Learning

In [31]:
# AWD-LSTM language model trained from scratch (no pretrained weights),
# tracking next-token accuracy and perplexity.
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    pretrained=False,
    drop_mult=0.3,
    metrics=[accuracy, Perplexity()],
)

In [33]:
# Create the export folder *before* the multi-hour training run (each epoch
# takes about five hours), so that a missing directory cannot make the export
# fail after training has already finished.
MODEL_EXPORT_PATH = '/content/drive/My Drive/DNAModels/Human_v0/all_but_chr1_v1.pkl'
os.makedirs(os.path.dirname(MODEL_EXPORT_PATH), exist_ok=True)

# Two epochs with the one-cycle schedule at a peak learning rate of 2e-2.
learn.fit_one_cycle(2, 2e-2)
learn.export(MODEL_EXPORT_PATH)
learn.summary()

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,1.292505,1.223054,0.405564,3.397548,5:10:55
1,1.260468,1.188988,0.430451,3.283757,5:12:11


SequentialRNN (Input shape: ['2048 x 50'])
Layer (type)         Output Shape         Param #    Trainable 
RNNDropout           2048 x 50 x 400      0          False     
________________________________________________________________
RNNDropout           2048 x 50 x 1152     0          False     
________________________________________________________________
RNNDropout           2048 x 50 x 1152     0          False     
________________________________________________________________
Linear               2048 x 50 x 8        3,208      True      
________________________________________________________________
RNNDropout           2048 x 50 x 400      0          False     
________________________________________________________________

Total params: 3,208
Total trainable params: 3,208
Total non-trainable params: 0

Optimizer used: <function Adam at 0x7feefa153ae8>
Loss function: FlattenedLoss of CrossEntropyLoss()

Model unfrozen

Callbacks:
  - TrainEvalCallback
  - Recorder
  

## Testing the model

In [None]:
# Reload the exported learner, e.g. in a fresh session without retraining.
# NOTE(review): load_learner presumably unpickles the file, so only load exports
# you trust; the custom tokenizer `tkn2` likely has to be importable — confirm.
learn = load_learner('/content/drive/My Drive/DNAModels/Human_v0/all_but_chr1_v1.pkl')

In [37]:
# Generate N_SENTENCES continuations of the seed sequence, N_WORDS tokens each.
TEXT = "ACTGCCTGCCTGC"
N_WORDS = 40
N_SENTENCES = 2

preds = []
for sentence_idx in range(N_SENTENCES):
    generated = learn.predict(TEXT, N_WORDS, temperature=0.75)
    # Keep every second character — presumably this drops the separator between
    # single-letter tokens (TODO confirm against the tokenizer output).
    preds.append(generated[::2])
preds

['ACTGCCTGCCTGCAATCTTCTCTCTGTTTGCTTTTGCCACTGATCCTTAATGT',
 'ACTGCCTGCCTGCCTAGGACTTGGGAGGCAGGCTCTCATGATTTTCTACGTTG']