In [3]:
%load_ext autoreload
%autoreload 2

import sys, os

import gc
import pysam
import pandas as pd
import re
import torch

import numpy as np

#from markov_model import *
from markov_for_dinuc import *

import pickle

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# All 16 dinucleotides, grouped by their first base. The order of this list is
# also the order in which the counts are printed later on.
dinucl = [
    "AA", "AC", "AT", "AG",
    "CA", "CC", "CT", "CG",
    "TA", "TC", "TT", "TG",
    "GA", "GC", "GG", "GT",
]
# One zero-initialised counter per dinucleotide.
count_dinuc = dict.fromkeys(dinucl, 0)

In [6]:
# Count every (overlapping) dinucleotide over all 3' UTR records.
#
# The counters are reset first so that re-running this cell, or running it
# again after an interrupted attempt, never adds on top of stale partial counts.
for key in count_dinuc:
    count_dinuc[key] = 0

for record in SeqIO.parse('../../../test/Homo_sapiens_3prime_UTR.fa', 'fasta'):
    # Work on a plain str: one pass over the sequence with a dict lookup per
    # position, instead of 16 passes that each slice the Seq object.
    seq = str(record.seq)
    for i in range(len(seq) - 1):
        pair = seq[i:i + 2]
        # Pairs that are not one of the 16 tracked keys (e.g. containing N or
        # lower-case letters) are skipped, exactly as before.
        if pair in count_dinuc:
            count_dinuc[pair] += 1
print('\n'.join(['{}: {}'.format(i,count_dinuc[i]) for i in count_dinuc]))

KeyboardInterrupt: 

In [8]:
# Normalise the dinucleotide counts into relative frequencies.
# The denominator is the observed total rather than a hard-coded number, so the
# frequencies always sum to one for whatever counts are currently held.
s_di = sum(count_dinuc.values())
# Guard against an all-zero table (e.g. the counting cell has not been run yet).
a2 = {k: (v / s_di if s_di else 0.0) for k, v in count_dinuc.items()}

In [7]:
# Tally how often each of the four bases occurs across all 3' UTR records.
count_nucletides = dict.fromkeys("ACTG", 0)
for record in SeqIO.parse('../../../test/Homo_sapiens_3prime_UTR.fa', 'fasta'):
    for nucleotide in count_nucletides:
        occurrences = record.seq.count(nucleotide)
        count_nucletides[nucleotide] = count_nucletides[nucleotide] + occurrences
# One "<base>: <total>" line per base, in insertion order.
print('\n'.join('{}: {}'.format(base, total) for base, total in count_nucletides.items()))

A: 5919083
C: 4863209
T: 6414380
G: 4935864


In [9]:
# Convert the per-base totals into relative frequencies (each total divided by
# the grand total over all four bases).
s = sum(count_nucletides.values())
a = {base: total / s for base, total in count_nucletides.items()}

In [10]:
# Show the relative base frequencies computed from count_nucletides.
a

{'A': 0.26743808301046024,
 'C': 0.21973121381119634,
 'T': 0.2898167656883061,
 'G': 0.22301393749003728}

Example script usage (the command is shown, commented out, in the next cell):

In [None]:
#!cd ML4RG-2023-project && python main.py --test --fasta ../Homo_sapiens_3prime_UTR.fa --species_list 240_species.txt --output_dir ./test --model_weight ../MLM_mammals_species_aware_5000_weights

In [9]:
# Load the held-out test split, building and caching it on first use.
file_path = 'test_df.pickle'
if os.path.exists(file_path):
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    with open(file_path, 'rb') as f:
        # Bind the cached frame to test_df (not train_df), so the display below
        # and later cells see the test split on a fresh kernel.
        test_df = pickle.load(f)
else:
    # load the fasta file (sequences upper-cased) and select the test data
    fasta_file = "../../../test/Homo_sapiens_3prime_UTR.fa"
    # A comprehension keeps the loop variable local, so the module-level `s`
    # (total base count from an earlier cell) is not overwritten.
    sequences = [str(record.seq).upper() for record in SeqIO.parse(fasta_file, "fasta")]
    # the first (1 - val_fraction) of the sequences are the training part;
    # everything after that boundary is the test split
    val_fraction = 0.1
    N_train = int(len(sequences)*(1-val_fraction))
    test_data = sequences[N_train:]
    # store it as a dataframe and cache it for the next run
    test_df = pd.DataFrame({'3-UTR':test_data})
    with open(file_path, 'wb') as f:
        pickle.dump(test_df, f)
test_df

Unnamed: 0,3-UTR
0,CCCCCAGAACCAGTGGGACAAACTGCCTCCTGGAGGTTTTTAGAAA...
1,TATTGAGCCCTCAGAGAGTCCACAGTCCCTCCTCTCAGTTCAGTCT...
2,TATTCATTCCAACTGCTGCCCCTCTGTCTGCCTGGCTGAGATGCAT...
3,AACGGTGCGTTTGGCCAAAAAGAATCTGCATTTAGCACAAAAAAAA...
4,TAGTTTCTAACTGTCGGACCCGTCTGTAAACCAAGGACTATGAATA...
...,...
1809,AGCAAGCATTGAAAATAATAGTTATTGCATACCAATCCTTGTTTGC...
1810,AGCAAGCATTGAAAATAATAGTTATTGCATACCAATCCTTGTTTGC...
1811,GCCTACTTCATCTCAGGACCCGCCCAAGAGTGGCCGCGGCTTTGGG...
1812,TTGTCAGTCTGTCTGCTCAGGACACAAGAACTAAGGGGCAACAAAT...


In [10]:
# Training split: rebuilt from the FASTA file when no cache exists, otherwise
# read back from the pickle cache.
file_path = 'train_df.pickle'
if not os.path.exists(file_path):
    # No cache yet: collect every sequence from the FASTA file, upper-cased.
    fasta_file = "../../../test/Homo_sapiens_3prime_UTR.fa"
    sequences = []
    for s in SeqIO.parse(fasta_file, "fasta"):
        sequences.append(str(s.seq).upper())
    # Keep the leading (1 - val_fraction) share of the sequences for training.
    val_fraction = 0.1
    N_train = int(len(sequences)*(1-val_fraction))
    train_data = sequences[:N_train]
    # Wrap the sequences in a single-column dataframe and write the cache.
    train_df = pd.DataFrame({'3-UTR':train_data})
    with open(file_path, 'wb') as f:
        pickle.dump(train_df, f)
else:
    # Cache hit: reuse the previously stored dataframe.
    with open(file_path, 'rb') as f:
        train_df = pickle.load(f)
train_df

Unnamed: 0,3-UTR
0,ATCTTATATAACTGTGAGATTAATCTCAGATAATGACACAAAATAT...
1,GGTTGCCGGGGGTAGGGGTGGGGCCACACAAATCTCCAGGAGCCAC...
2,GGCAGCCCATCTGGGGGGCCTGTAGGGGCTGCCGGGCTGGTGGCCA...
3,CCCACCTACCACCAGAGGCCTGCAGCCTCCCACATGCCTTAAGGGG...
4,TGGCCGCGGTGAGGTGGGTTCTCAGGACCACCCTCGCCAAGCTCCA...
...,...
16315,CCGTATGAAGATGTCCTGTTAAATTTACAACACTAACGATGTAGAC...
16316,ACACACCCCCGAAAAACACAAGACCGACCCAAAATCTAGAGGAAAG...
16317,AGAAGCTAAAAGGAAAGAAAATAAATCTATCAAAATTACCCTAAAC...
16318,CTTCACTTTTGGGCTCAAGGACTGTGTGAACCAACAAGGGGCCAGT...


In [11]:
# k-mer counts on the training sequences, cached as a pickle because counting
# takes about a minute (see the progress bar output below).
file_path = 'kmer_train.pickle'
if os.path.exists(file_path):
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    with open(file_path, 'rb') as f:
        kmer_train = pickle.load(f)
else:
    # Count on the '3-UTR' column of the training frame. The first argument (2)
    # is presumably the k-mer length, i.e. dinucleotides — TODO confirm against
    # KmerCountNew in markov_for_dinuc.
    kmer_train = KmerCountNew(2,pseudocount=0.1)
    kmer_train.compute_counts(train_df['3-UTR'])

    # save the counter object to the same path that is checked above
    with open(file_path, 'wb') as f:
        pickle.dump(kmer_train, f)

  0%|          | 0/16320 [00:00<?, ?it/s]100%|██████████| 16320/16320 [01:05<00:00, 250.06it/s]


In [13]:
# Hand-entered probability table of shape (2, 4, 4).
#
# dinuc_dist[0][0] holds the single-base frequencies printed for `a` above, in
# the column order A, C, G, T; the other three rows of that plane are zero.
# dinuc_dist[1] presumably holds, per row, the distribution of the next base
# given the previous base (rows also in A, C, G, T order) — TODO confirm against
# MarkovChainNew.
#
# NOTE(review): two rows of dinuc_dist[1] do not sum to 1:
#   row 0 sums to ~1.0136 and repeats 0.26581813454136444 in its last two columns;
#   row 2 sums to ~0.9818.
# Rows 1 and 3 sum to 1.0. This looks like a transcription error in the
# hand-copied values; verify them against the counted dinucleotide frequencies.
dinuc_dist = np.array([[[0.26743808301046024,0.21973121381119634, 0.22301393749003728, 0.2898167656883061],
        [0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        ]],

       [[0.2968737832744875, 0.18507058520226632, 0.26581813454136444,0.26581813454136444],
        [0.32053421022722217, 0.2803858410399341, 0.0554500543745703, 0.34362989435827346],
        [0.26000757243673117, 0.2138329457921364, 0.26097487205728753, 0.24694501885793302],
        [0.20207800866762973, 0.20690293271757126, 0.27748861612369985, 0.3135304424910991]]])

In [14]:
# Build the chain object from the k-mer counter and the hand-entered table.
# How MarkovChainNew combines the two is defined in markov_for_dinuc (not shown here).
chain = MarkovChainNew(kmer_train, dinuc_dist)

In [15]:
# Inspect the matrix stored on the chain; the recorded output shows the same
# values as dinuc_dist, rounded for display.
chain.markov_matrix

array([[[0.26743808, 0.21973121, 0.22301394, 0.28981677],
        [0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        ]],

       [[0.29687378, 0.18507059, 0.26581813, 0.26581813],
        [0.32053421, 0.28038584, 0.05545005, 0.34362989],
        [0.26000757, 0.21383295, 0.26097487, 0.24694502],
        [0.20207801, 0.20690293, 0.27748862, 0.31353044]]])

In [16]:
# Sanity check on a short sequence: the recorded output has one row of four
# probabilities per input base. The second argument (1) is presumably the
# Markov order — TODO confirm against impute_for_seq.
chain.impute_for_seq("AAACT", 1)

array([[0.26743808, 0.21973121, 0.22301394, 0.28981677],
       [0.29687378, 0.18507059, 0.26581813, 0.26581813],
       [0.29687378, 0.18507059, 0.26581813, 0.26581813],
       [0.29687378, 0.18507059, 0.26581813, 0.26581813],
       [0.32053421, 0.28038584, 0.05545005, 0.34362989]])

In [17]:
# Assemble the order-1, non-bidirectional model from the k-mer counter, the
# hand-entered probability table and the cached test split.
markov_model = MarkovModelNew(
    kmer_train,
    order=1,
    bidirectional=False,
    dinuc_dist=dinuc_dist,
    markov_matrix_path="markov_model.npy",
    test_df_path='test_df.pickle',
)

In [18]:
# Run the model's test routine. Presumably this evaluates on the frame stored
# at test_df_path and writes its results to disk — TODO confirm in markov_for_dinuc.
markov_model.test()

In [27]:
# Copy prbs.pt to a mounted Google Drive folder. The absolute /content paths
# only exist on Colab, so this cell fails elsewhere.
# NOTE(review): prbs.pt is presumably written by markov_model.test() — confirm.
!cp -r "/content/prbs.pt" "/content/drive/MyDrive/MLRG2023"