<a href="https://colab.research.google.com/github/simecek/PseudoDNA_Generator/blob/master/colabs/experiments/Intergenomic_LSTM_AWD_Accuracy_Top1_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Exported learner that is loaded with load_learner() further down.
MODEL_PATH = "/content/drive/My Drive/DNAModels/Intergenomic/AWD_LSTM_v4.pkl"
# CSV of sequences that is read with pd.read_csv() and split into train/test by chromosome.
DATA_PATH = "/content/drive/My Drive/data/random/intergenomic_seqs_50k.csv"
# Labels that are echoed in the final comma-separated result line.
EXPERIMENT_TYPE = "Accuracy_of_top1_prediction"
EXPERIMENT_NAME = "Intergenomic_AWD_LSTM_accuracy_top1_v0"

## Setup

In [0]:
!pip install fastai2>=0.0.11 ipywidgets matplotlib nbdev>=0.2.12 pandas scikit_learn sentencepiece

In [3]:
from fastai2.text.all import *
import pandas as pd
import torch
from tqdm import tqdm, notebook
# Register Series.progress_apply / DataFrame.progress_apply using the notebook
# progress-bar class. pandas() is called on the class itself so that no throwaway
# (never-closed) progress bar is instantiated and rendered by this cell.
notebook.tqdm.pandas()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [4]:
# Runtime sanity check, displayed as (CUDA available?, number of CUDA devices, name of device 0).
# NOTE(review): get_device_name(0) presumably fails on a CPU-only runtime — confirm before running without a GPU.
torch.cuda.is_available(), torch.cuda.device_count(), torch.cuda.get_device_name(0)

(True, 1, 'Tesla P100-PCIE-16GB')

In [5]:
# Mount Google Drive at /content/drive so that the files referenced by
# MODEL_PATH and DATA_PATH (both under /content/drive) can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Read the sequence table and, just to be sure, drop every row whose sequence
# contains the character "N".
dt = pd.read_csv(DATA_PATH)
has_n = dt["seq"].str.contains("N")
dt = dt.loc[~has_n]

# Rows with chr == "1" form the held-out test set; all remaining rows are training data.
on_chr1 = dt["chr"] == "1"
train = dt.loc[~on_chr1]
test = dt.loc[on_chr1]

print(dt.shape, train.shape, test.shape)
dt.head()


(50000, 4) (46286, 4) (3714, 4)


Unnamed: 0,chr,start,end,seq
0,17,14239397,14239596,AACTGGGATTCACAGGAGCTTAATGGAGCACATGATGTTAAGTGAAGTGAGCCAGGCACAAAAAGACAACTACCACGTGATCTGACTTATGTGGAATGTAAAACAATTGAACTCATGGAAGCAGAGAGTAGAATGGAGGATACCAGGGGCTGGGAGGCAGGGGTTTGGGGAGACGGTGAAAGCGTTCTAAAGTGTAGTTA
1,X,90928374,90928573,GCTAGTTGTATGGTTAGCAGCAAGATATTTTTTCTCTCTGATCTTTAATTTTCATATTTAAATTTGGCTAAGAGTACTTGCCTCTTAAAACTGTGTTGCTGGTATTACCAGAGTGTGGTATAATTAAAATATATATTTGCTTTTTGTCACCAGTTTCTCACACAGTACATCAAAAGCCCTTGCAATTTTCTGAGTGATAA
2,3,104278717,104278916,GACTTTGTAGACTTGTGTGACCTGTGTGCCTCCCTCTCCCCCCAAAAAAAACAAAAAAAATAAAGGATCTTGGGAAAGACTATATAAAAGGCAAGACTCCTTTAATGGAGGGGATATGCTAGATTGCCTCCCATTATGGCCCATGCCAAAGTGTTTAAACTTAGAAAAATGGTTCCAGTTTACTTCTGGGCTTAAAAATC
3,4,187089054,187089253,ATGTTAACACCAAATCAGTCCATCCTAATTATCACTCAAAAATCAAACATTTTTTAGGGAGGCAAAAACTGTCATGAGAACTACAATTTGATTTGGAGACTATTTCACTTATACAGTTTCTTCACATGATGACCAGCCTTCTTTCTTTAGTAATGGTTATTACTATGGCCATTGCTGTTAATTCTGTGACTTATCACTTC
4,2,137742849,137743048,GCAGGAGCTCTATCTGTTTGGACTAGTTCAGCCCCATCTCTTTTGGGGTGACTCGGGTGATGCTAAGCTTCCCAGGGCCATTGTGTTCTGTCTTCTGCCTCTGACTTTTTCCCTGCTACCCACATGAGCTTCTGCTATGCTCTCTTCTTTCCTGTCCAGAAATCATGTAGTAAGATGCTTTTTGGCTGGAGACCCTGAAA


## Tokenizer

In [7]:
!wget https://raw.githubusercontent.com/simecek/PseudoDNA_Generator/master/colabs/models/genomic_tokenizer.py

--2020-06-12 13:16:40--  https://raw.githubusercontent.com/simecek/PseudoDNA_Generator/master/colabs/models/genomic_tokenizer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 586 [text/plain]
Saving to: ‘genomic_tokenizer.py.1’


2020-06-12 13:16:40 (38.9 MB/s) - ‘genomic_tokenizer.py.1’ saved [586/586]



In [0]:
from genomic_tokenizer import tkn2

## Load Model

In [0]:
# Restore the exported learner stored at MODEL_PATH.
# NOTE(review): the file is a .pkl, so it is presumably unpickled on load — only load files you trust.
# NOTE(review): the `from genomic_tokenizer import tkn2` above is presumably required for the
# stored tokenizer to be resolvable during loading — confirm.
learn = load_learner(MODEL_PATH)

In [10]:
# Smoke test: sample a few sequences from the loaded model, starting from an empty prompt.
TEXT, N_WORDS, N_SENTENCES = "", 40, 2

preds = []
for sentence_idx in range(N_SENTENCES):
    generated = learn.predict(TEXT, N_WORDS, temperature=0.75, no_bar=True)
    # Keep every second character only (presumably this drops the separators
    # placed between tokens — verify against the tokenizer).
    preds.append(generated[::2])
preds

['CCGCCAGGGCTGGAGTGCAGGAGAATTCACCTGAGGTCAG',
 'CTAGGAGGTTCAGGACTGAACACATCTTTTATACCAGCAA']

## Helper Functions

In [11]:
%%time
def predict10(learner):
    """Build a one-argument predictor around ``learner``.

    The returned callable takes a prompt string ``x``, asks
    ``learner.predict`` for 10 more tokens at temperature 0.1 (progress bar
    disabled), keeps every second character of the returned text and returns
    the last 10 characters of what remains.
    """
    def _predict_next(x):
        generated = learner.predict(x, 10, temperature=0.1, no_bar=True)
        # Every second character is kept (presumably dropping token separators);
        # the trailing 10 characters are then taken as the prediction.
        compact = generated[::2]
        return compact[-10:]

    return _predict_next

# Run the predictor three times on a fixed 20-character prompt so that %%time reports the cost.
for i in range(3): predict10(learn)('TATCCAGGGAGATGCTAAGG')

CPU times: user 2.3 s, sys: 3.92 ms, total: 2.3 s
Wall time: 2.31 s


## Comparison

In [12]:
def compare10(x, y):
    """Lazily compare the first 10 positions of ``x`` and ``y``.

    Returns a generator that yields, for each index 0..9, ``1`` when
    ``x[i] == y[i]`` and ``0`` otherwise. Both arguments must be indexable
    at positions 0..9; a shorter input raises ``IndexError`` while iterating.
    """
    for pos in range(10):
        yield int(x[pos] == y[pos])

list(compare10("ACTGCACTGC", "ACTGCGGGGG"))

[1, 1, 1, 1, 1, 0, 0, 0, 1, 0]

In [21]:
STEP = 10        # length of each 'next' window taken from the sequence, and the stride between windows
HISTORY = 20     # number of characters preceding a window that are passed to the model as prompt
STARTS_AT = 50   # sequence position at which the first window begins
NSTEPS = 2       # number of consecutive windows evaluated per sequence

def test_data(s):
    """Build the per-window evaluation table for one sequence ``s``.

    NSTEPS windows are taken, starting at STARTS_AT and advancing by STEP.
    For every window the table holds:
      * ``prev``     - the HISTORY characters immediately before the window,
      * ``next``     - the STEP characters of the window itself,
      * ``nextpred`` - the output of ``predict10(learn)`` applied to ``prev``,
      * ``c0``..``c9`` - the 1/0 flags produced by ``compare10(next, nextpred)``.

    Returns the resulting pandas DataFrame (one row per window).
    """
    window_starts = range(STARTS_AT, STARTS_AT + NSTEPS * STEP, STEP)
    prompts = [s[(pos - HISTORY):pos] for pos in window_starts]
    targets = [s[pos:pos + STEP] for pos in window_starts]
    sdf = pd.DataFrame({'prev': prompts, 'next': targets})

    predictor = predict10(learn)
    sdf['nextpred'] = sdf['prev'].apply(predictor)

    c10 = ['c' + str(k) for k in range(10)]
    # One lazy compare10 generator per row; from_records consumes them into c0..c9.
    flags = sdf.apply(lambda row: compare10(row['next'], row['nextpred']), axis=1)
    sdf[c10] = pd.DataFrame.from_records(flags, columns=c10)
    return sdf

# Demo: evaluation table for the first sequence of the test set.
test_data(test.seq.iloc[0])

Unnamed: 0,prev,next,nextpred,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,TATCCAGGGAGATGCTAAGG,TTGGGGACAG,TGAGAAAAAA,1,0,0,1,0,0,1,0,1,0
1,GATGCTAAGGTTGGGGACAG,AAGGTGTTTA,AGAAGAAGAA,1,0,0,0,0,0,0,0,0,1


In [22]:
# Sanity check on the first test sequence: characters 30..59 are the first window's
# 'prev' (starting at STARTS_AT - HISTORY = 30) followed by its 'next' (ending at STARTS_AT + STEP = 60).
test.seq.iloc[0][30:60]

'TATCCAGGGAGATGCTAAGGTTGGGGACAG'

In [23]:
%%time
def stat10(s):
    """Return the column-wise mean of the match flags ``c0``..``c9`` for sequence ``s``.

    The flags come from ``test_data(s)``; the result is a pandas Series with
    one mean value per flag column, averaged over the evaluated windows.
    """
    flag_columns = ['c' + str(k) for k in range(10)]
    windows = test_data(s)
    per_position = windows[flag_columns]
    return per_position.mean(axis=0)

# Demo (timed by %%time): mean match flags c0..c9 for the test sequence at position 3.
stat10(test.seq.iloc[3])

CPU times: user 1.6 s, sys: 3.89 ms, total: 1.61 s
Wall time: 1.62 s


In [24]:
# Evaluate every test sequence (with a progress bar); one row of c0..c9 means per sequence.
stats = test.seq.progress_apply(stat10)

# Emit a single comma-separated result line: experiment metadata, set sizes,
# then the mean of each c0..c9 column over all test sequences.
run_info = [EXPERIMENT_NAME, EXPERIMENT_TYPE, DATA_PATH, MODEL_PATH, train.shape[0], test.shape[0]]
position_means = list(stats.mean(axis=0).values)
print(*(run_info + position_means), sep=", ")

HBox(children=(FloatProgress(value=0.0, max=3714.0), HTML(value='')))


Intergenomic_AWD_LSTM_accuracy_top1_v0, Accuracy_of_top1_prediction, /content/drive/My Drive/data/random/intergenomic_seqs_50k.csv, /content/drive/My Drive/DNAModels/Intergenomic/AWD_LSTM_v4.pkl, 46286, 3714, 0.41343564889606893, 0.37143241787829834, 0.3739903069466882, 0.36106623586429726, 0.3621432417878298, 0.36631663974151857, 0.3535271943995692, 0.3607969843834141, 0.3555465805061928, 0.3509693053311793
