<a href="https://colab.research.google.com/github/simecek/PseudoDNA_Generator/blob/master/colabs/models/Intergenomic_AWD_LSTM_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [0]:
!pip install fastai2>=0.0.11 ipywidgets matplotlib nbdev>=0.2.12 pandas scikit_learn sentencepiece

In [0]:
from fastai2.text.all import *
import pandas as pd

In [0]:
# Mount your Google Drive so that files can be read from and saved to your Drive location
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Read `chr` explicitly as strings: the column mixes numeric labels with "X",
# and the split below compares against the strings "1" and "2", so it must not
# depend on how pandas happens to infer the column's dtype.
dt = pd.read_csv("/content/drive/My Drive/data/random/intergenomic_seqs_50k.csv",
                 dtype={"chr": str})
dt = dt[~dt.seq.str.contains("N")]   # just to be sure: drop sequences containing "N"

# Rows whose `chr` is "1" or "2" are held out as the test set; all other rows
# form the training set.
is_holdout = dt.chr.isin(["1", "2"])
train = dt[~is_holdout]
test = dt[is_holdout]

print(dt.shape, train.shape, test.shape)
dt.head()

(50000, 4) (42077, 4) (7923, 4)


Unnamed: 0,chr,start,end,seq
0,17,14239397,14239596,AACTGGGATTCACAGGAGCTTAATGGAGCACATGATGTTAAGTGAAGTGAGCCAGGCACAAAAAGACAACTACCACGTGATCTGACTTATGTGGAATGTAAAACAATTGAACTCATGGAAGCAGAGAGTAGAATGGAGGATACCAGGGGCTGGGAGGCAGGGGTTTGGGGAGACGGTGAAAGCGTTCTAAAGTGTAGTTA
1,X,90928374,90928573,GCTAGTTGTATGGTTAGCAGCAAGATATTTTTTCTCTCTGATCTTTAATTTTCATATTTAAATTTGGCTAAGAGTACTTGCCTCTTAAAACTGTGTTGCTGGTATTACCAGAGTGTGGTATAATTAAAATATATATTTGCTTTTTGTCACCAGTTTCTCACACAGTACATCAAAAGCCCTTGCAATTTTCTGAGTGATAA
2,3,104278717,104278916,GACTTTGTAGACTTGTGTGACCTGTGTGCCTCCCTCTCCCCCCAAAAAAAACAAAAAAAATAAAGGATCTTGGGAAAGACTATATAAAAGGCAAGACTCCTTTAATGGAGGGGATATGCTAGATTGCCTCCCATTATGGCCCATGCCAAAGTGTTTAAACTTAGAAAAATGGTTCCAGTTTACTTCTGGGCTTAAAAATC
3,4,187089054,187089253,ATGTTAACACCAAATCAGTCCATCCTAATTATCACTCAAAAATCAAACATTTTTTAGGGAGGCAAAAACTGTCATGAGAACTACAATTTGATTTGGAGACTATTTCACTTATACAGTTTCTTCACATGATGACCAGCCTTCTTTCTTTAGTAATGGTTATTACTATGGCCATTGCTGTTAATTCTGTGACTTATCACTTC
4,2,137742849,137743048,GCAGGAGCTCTATCTGTTTGGACTAGTTCAGCCCCATCTCTTTTGGGGTGACTCGGGTGATGCTAAGCTTCCCAGGGCCATTGTGTTCTGTCTTCTGCCTCTGACTTTTTCCCTGCTACCCACATGAGCTTCTGCTATGCTCTCTTCTTTCCTGTCCAGAAATCATGTAGTAAGATGCTTTTTGGCTGGAGACCCTGAAA


In [0]:
!rm -rf split_tok

!rm -rf split
!mkdir split
!mkdir split/train
!mkdir split/valid
!mkdir split/train/1/
!mkdir split/valid/1/

In [0]:
# Split the data on disk: every training sequence is written to its own text
# file under split/train/1, every test sequence under split/valid/1.
N = len(dt.seq)  # total number of sequences (not used further in this cell)

for out_dir, seqs in (("split/train/1", train.seq), ("split/valid/1", test.seq)):
    for i, s in enumerate(seqs):
        # `with` closes each handle immediately; the previous bare `open(...)`
        # never closed the tens of thousands of files it created.
        with open(out_dir + "/seq" + str(i) + ".txt", 'w') as f:
            f.write(s)

## Tokenizer

In [0]:
class LetterTokenizer():
    """Character-level tokenizer: every letter of the input becomes one token.

    Calling an instance with a single string returns that string's tokens.
    Calling it with any other iterable returns a lazy generator that yields
    the tokens of each item in turn.
    """

    def __init__(self, **kwargs):
        # Keyword arguments are accepted and ignored.
        pass

    def __call__(self, items):
        if not isinstance(items, str):
            return (self._letters(text) for text in items)
        return self._letters(items)

    def _letters(self, t):
        """Drop newlines from `t` and return 'xxbos' followed by its characters."""
        stripped = t.replace("\n", "")
        return L(['xxbos', *stripped])

    @property
    def special_toks(self):
        """Special tokens reported by this tokenizer."""
        return ['xxbos', 'xxunk']

In [0]:
# Quick sanity check of the letter tokenizer on a short sequence
tkn = LetterTokenizer()
tkn("ACCTGGCTAGCCGATCGGGACTAGCA")

(#27) ['xxbos','A','C','C','T','G','G','C','T','A'...]

In [0]:
# Tokenizer throws an error when given an empty list of rules, so a single
# no-op rule is supplied instead.
def do_nothing(x):
    """Identity rule: return `x` unchanged."""
    return x

# Wrap the letter tokenizer in `Tokenizer` with the no-op rule, then check on a
# sequence with an embedded newline that the newline does not become a token
tkn2 = Tokenizer(tkn, rules=[do_nothing])
tkn2("ACCTGGC\nTAGCCGATCGGGACTAGCA")

(#27) ['xxbos','A','C','C','T','G','G','C','T','A'...]

## Data Loaders

In [0]:
# Language-model data loaders over the ./split folder, tokenized letter by
# letter with `tkn2`, using sequence length 50 and batch size 128.
dls_lm = TextDataLoaders.from_folder(
    Path("./split"),
    is_lm=True,
    tok_tfm=tkn2,
    seq_len=50,
    bs=128,
    seed=42,
)

In [0]:
# Display a sample of input/target text pairs from the language-model loaders
dls_lm.show_batch()

Unnamed: 0,text,text_
0,xxbos A G C T G T G A G A T A T T T C T T C T T G A T T T T C A G T T C A C T A A C T T G T T T C A A C,A G C T G T G A G A T A T T T C T T C T T G A T T T T C A G T T C A C T A A C T T G T T T C A A C A
1,C G C T G T T T C A A A A A A A T A A A G G G A A A A T C A G G A A C T C A A A G A G A T A C C T G,G C T G T T T C A A A A A A A T A A A G G G A A A A T C A G G A A C T C A A A G A G A T A C C T G C
2,T T C T A A A A T G T T T A A A A A A A A T T A T A A T C C A T A C C T C T T C T A C T T C A G A A,T C T A A A A T G T T T A A A A A A A A T T A T A A T C C A T A C C T C T T C T A C T T C A G A A A
3,C T T T T A T C T T A G A T C T A G T A G T A T T A G T T T T A T A A A T C T C T G T G T T A C A G,T T T T A T C T T A G A T C T A G T A G T A T T A G T T T T A T A A A T C T C T G T G T T A C A G T
4,T C T G T G A A G A T T T C G T T G G xxbos C T T T T T T T T T T T T C T G G T T A T T A T A T A A A T,C T G T G A A G A T T T C G T T G G xxbos C T T T T T T T T T T T T C T G G T T A T T A T A T A A A T A
5,C T T T G C A C C T G G C A T T A T G G A A A T T C C T G G T A A G G G G A A C T G T A C T G A T G,T T T G C A C C T G G C A T T A T G G A A A T T C C T G G T A A G G G G A A C T G T A C T G A T G G
6,T T A A A T G T A C A A T T A A A T T A T T G T T G A C T A T A G T C A T C C T G T T G T G C T C T,T A A A T G T A C A A T T A A A T T A T T G T T G A C T A T A G T C A T C C T G T T G T G C T C T G
7,C C A T G G C T G G A A A A C T T G G G G G G A T G A A T A A C T C C T C C C T T C T C A G G C C G,C A T G G C T G G A A A A C T T G G G G G G A T G A A T A A C T C C T C C C T T C T C A G G C C G A
8,T T A C A T G G A A G A A A C T A C G A A G A A A A T G A T A G C T C A A T xxbos A T T G C C C T G G C,T A C A T G G A A G A A A C T A C G A A G A A A A T G A T A G C T C A A T xxbos A T T G C C C T G G C T


## Model and Learning

In [0]:
# AWD-LSTM language model without pretrained weights (pretrained=False),
# tracking accuracy and perplexity, switched to fp16 via to_fp16().
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    pretrained=False,
    drop_mult=0.5,
    metrics=[accuracy, Perplexity()],
).to_fp16()

In [0]:
# Create the export directory *before* training: each epoch takes about 18
# minutes, so a missing folder should fail (or be fixed) now rather than after
# the whole run has finished.
EXPORT_PATH = "/content/drive/My Drive/DNAModels/Intergenomic/AWD_LSTM_v2.pkl"
Path(EXPORT_PATH).parent.mkdir(parents=True, exist_ok=True)

# Train for 5 epochs with the one-cycle schedule (max learning rate 2e-2),
# then save the trained learner to Drive.
learn.fit_one_cycle(5, 2e-2)
learn.export(EXPORT_PATH)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,1.337824,1.331844,0.359472,3.788022,18:21
1,1.299125,1.302871,0.383243,3.679845,18:20
2,1.264779,1.277171,0.399055,3.586479,18:24
3,1.23891,1.261415,0.406707,3.530414,18:38
4,1.236638,1.248427,0.409745,3.484857,18:26
