#### Summary:
In this notebook we use the updated `HMMOptimalPathLAI_progress` class (imported from `LAI_hmm_script_log10_progress.py`), which displays progress bars, to compare our HMM's runtime on very small and very large datasets.

In [1]:
import random as rn
import numpy as np
import pandas as pd
import sys
import math

In [2]:
import datetime

In [3]:
from LAI_hmm_scriptFINAL import HMMOptimalPathLAI, HammingDist, makeSNPseq, standardizeIndices

In [5]:
from LAI_hmm_script_log10_progress import HMMOptimalPathLAI_progress

# HMM Running Function

In [6]:
def HMM_implementation_wrapper(popA, popB, genotype_fp, emission_df, recomb, output_fp, chrm):
    """Run the whole LAI HMM pipeline on one genotype file and save the inferred path.

    Stages, in order: read the genotype file, cut the emission and genotype
    tables down to shared positions, build the SNP sequence, build the HMM
    (transition matrix, optimal path, path reconstruction) and write the
    resulting path table as a TSV. A timestamped message is printed after
    each stage so per-stage runtimes can be read off the cell output.

    Parameters
    ----------
    popA, popB
        Population labels, passed unchanged to HMMOptimalPathLAI_progress.
    genotype_fp
        Path of a tab-separated genotype file with a header row. The table
        must contain 'A1' and 'A2' columns (a ValueError is raised otherwise).
    emission_df
        Emission table; handed to standardizeIndices together with the
        genotype table, using "POS" as the shared key.
    recomb
        Passed unchanged to HMMOptimalPathLAI_progress.
    output_fp
        Where the path table is written (tab-separated, with header, no
        index). If this is a filesystem path, its parent directory is created
        when missing.
    chrm
        String to use for the CHR column of the output.

    Returns
    -------
    None
    """
    import os  #local import so this cell does not depend on edits to the import cells

    def _log_step(message):
        #print the stage message followed by the current wall-clock time
        print(message, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    #the output is only written at the very end, after the (possibly hours-long)
    #optimal path stage, so make sure its directory exists before doing any work
    if isinstance(output_fp, (str, os.PathLike)):
        out_dir = os.path.dirname(output_fp)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    #read in sample genotype to test -- assumes current dir is the team3 dir
    genotype_df = pd.read_csv(genotype_fp, sep='\t', header=0)

    #cut down both dfs to only be positions shared in both
    fin_emission_df, fin_genotype_df = standardizeIndices(emission_df, genotype_df, "POS")
    _log_step("Files cut to share POS")

    #create SNPseq from the positions of the two allele columns
    genotype_columns = list(fin_genotype_df.columns)
    a1_idx = genotype_columns.index('A1')
    a2_idx = genotype_columns.index('A2')
    snps = makeSNPseq(fin_genotype_df, a1_idx, a2_idx)
    _log_step("SNP sequence created")

    #create the hmm object
    hmm = HMMOptimalPathLAI_progress(popA, popB, fin_emission_df, recomb, snps)
    hmm.get_transition_matrix()
    _log_step("Transition matrix made")

    #perform the Viterbi algorithm and reconstruct the most probable path through states
    hmm.get_optimal_path()
    _log_step("Optimal path algorithm complete")

    hmm.reconstruct_path()
    _log_step("Path reconstruction complete")

    #convert the path to the desired output format (pass the string to use for CHR col)
    path_df = hmm.output_path(chrm)
    path_df.to_csv(output_fp, sep='\t', header=True, index=False)
    _log_step("Path saved")

# Short Data

In [8]:
#labels of the two test populations, assigned together
test_PopA, test_PopB = "0", "1"

In [11]:
#read in emissions data (simulated df)
emission_fp1 = "Git/cse284_project/Data/simData_N2_P100_seed518.tsv" #relative path: assumes the current dir is the team3 dir
emission_df1 = pd.read_csv(emission_fp1, sep='\t',header=0)

#renaming the columns (only necessary for simulated data)
#layout is POS followed by one <population>_<base> column per base (A, C, G, T) for populations "0" and "1"
colnames = ["POS","0_A","0_C","0_G","0_T","1_A","1_C","1_G","1_T"]
emission_df1.columns = colnames
emission_df1.head()

Unnamed: 0,POS,0_A,0_C,0_G,0_T,1_A,1_C,1_G,1_T
0,0,0.0,0.3553,0.0,0.6447,0.0,0.5719,0.0,0.4281
1,1,0.0,0.758,0.0,0.242,0.0,0.4374,0.0,0.5626
2,2,0.1303,0.0,0.0,0.8697,0.4271,0.0,0.0,0.5729
3,3,0.2157,0.0,0.7843,0.0,0.3534,0.0,0.6466,0.0
4,4,0.2444,0.0,0.0,0.7556,0.3655,0.0,0.0,0.6345


In [12]:
#read in sample genotype to test
#assumes current dir is the team3 dir
#NOTE: in the cells shown here this frame is only inspected; HMM_implementation_wrapper
#takes the path (genotype_fp1) and reads the file itself
genotype_fp1 = "Git/cse284_project/Data/simGenome_100_0_0.tsv"
genotype_df1 = pd.read_csv(genotype_fp1, sep='\t',header = 0)
genotype_df1.head()

Unnamed: 0,POS,A1,A2,POP1,POP2
0,0,C,T,0,0
1,1,C,C,0,0
2,2,T,T,0,0
3,3,A,G,0,0
4,4,T,T,0,0


In [24]:
#short runtime test: run the full pipeline on the simulated genome
#arguments after the emission df are recomb (0.1), the output path and the string for the CHR column ("21")
output_fp1 = "HMM_Test_Outputs/210527_simGenome_100_0_0_short_runtime_test.tsv"
HMM_implementation_wrapper(test_PopA, test_PopB, genotype_fp1, emission_df1, 0.1, output_fp1, "21")

get_optimal_path: 19it [00:00, 169.89it/s]

Files cut to share POS 2021-05-27 13:28:27
SNP sequence created 2021-05-27 13:28:27
Transition matrix made 2021-05-27 13:28:27


get_optimal_path: 100it [00:00, 177.50it/s]
reconstruct_path: 100%|██████████| 100/100 [00:00<00:00, 373823.89it/s]
output_path: 100%|██████████| 100/100 [00:00<00:00, 668948.01it/s]


Optimal path algorithm complete 2021-05-27 13:28:28
Path reconstruction complete 2021-05-27 13:28:28
Path saved 2021-05-27 13:28:28


# Huge Data

In [6]:
#labels of the two populations used for the real-data runs
popA, popB = "AFR", "EUR"

Emissions files

In [7]:
#chr14
#emissions table for chr14; this file is comma-separated
#NOTE(review): absolute path on the original author's server -- adjust before re-running elsewhere
chr14_emissions_fp = "/home/hmummey/teams/CSE284_SP21_A00/team3/chromosome_14_files/chr14_genotypes_afr_eur_allelefreqs.bybp.csv"
emission_df2 = pd.read_csv(chr14_emissions_fp, sep=',',header=0)
emission_df2.head()

Unnamed: 0,CHR,POS,SNP,A2,A1,MAF_AFR,MAF_EUR,AFR_A,AFR_C,AFR_G,AFR_T,EUR_A,EUR_C,EUR_G,EUR_T
0,14,19000017,rs375700886,C,T,0.0,0.0,1e-06,1.0,1e-06,1e-06,1e-06,1.0,1e-06,1e-06
1,14,19000050,rs543746158,G,A,0.0,0.0,1e-06,1e-06,1.0,1e-06,1e-06,1e-06,1.0,1e-06
2,14,19000056,rs561973970,A,T,0.0,0.0,1.0,1e-06,1e-06,1e-06,1.0,1e-06,1e-06,1e-06
3,14,19000059,rs201622908,G,T,0.0,0.00498,1e-06,1e-06,1.0,1e-06,1e-06,1e-06,0.99502,0.00498
4,14,19000060,rs28973059,C,G,0.1188,0.3884,1e-06,0.8812,0.1188,1e-06,1e-06,0.6116,0.3884,1e-06


In [8]:
#go through manually (step by step) for the first file: /home/hmummey/teams/CSE284_SP21_A00/team3/simulated_files/admixEUR_AFR_chr14_Rx1_a.tsv
#the frame is loaded here to report its row count and preview it; the wrapper call is given the path, not this frame
genotype_fp2 = "/home/hmummey/teams/CSE284_SP21_A00/team3/simulated_files/admixEUR_AFR_chr14_Rx1_a.tsv"
genotype_df2 = pd.read_csv(genotype_fp2, sep='\t',header = 0)
print(len(genotype_df2))
genotype_df2.head()

2539145


Unnamed: 0,POS,A1,A2,POP1,POP2
0,19000017,T,T,1,0
1,19000050,A,A,1,0
2,19000056,T,T,1,0
3,19000059,T,T,1,0
4,19000060,G,G,1,0


In [9]:
#long runtime test: full pipeline on the chr14 Rx1_a genotype file
#recomb is 0.1 and the CHR column of the output is labelled "14"
output_fp2 = "HMM_Test_Outputs/210527_admixEUR_AFR_chr14_Rx1_a_long_runtime_test.tsv"
HMM_implementation_wrapper(popA, popB, genotype_fp2, emission_df2, 0.1, output_fp2, "14")

Files cut to share POS 2021-05-27 17:25:56


get_optimal_path: 18it [00:00, 173.93it/s]

SNP sequence created 2021-05-27 17:27:59
Transition matrix made 2021-05-27 17:27:59


get_optimal_path: 734107it [47:05, 282.94it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

get_optimal_path: 2500619it [2:36:44, 265.89it/s]
reconstruct_path:   4%|▍         | 111117/2500619 [00:00<00:02, 1111144.57it/s]

Optimal path algorithm complete 2021-05-27 20:04:44


reconstruct_path: 100%|██████████| 2500619/2500619 [00:02<00:00, 1192002.93it/s]
output_path:   9%|▉         | 221389/2500619 [00:00<00:01, 2213828.77it/s]

Path reconstruction complete 2021-05-27 20:04:46


output_path: 100%|██████████| 2500619/2500619 [00:00<00:00, 2523670.34it/s]


Path saved 2021-05-27 20:05:43


# Huge data again:
* chr14 Rx1
* chr21 Rx4

In [11]:
#labels of the two populations for this second round of runs
popA, popB = "AFR", "EUR"

Chr14

In [8]:
#chr14
#NOTE(review): this is the same file path that was loaded into emission_df2 earlier in the notebook,
#so this cell re-reads the identical chr14 emissions table under a new name
chr14_emissions_fp = "/home/hmummey/teams/CSE284_SP21_A00/team3/chromosome_14_files/chr14_genotypes_afr_eur_allelefreqs.bybp.csv"
emission_df14 = pd.read_csv(chr14_emissions_fp, sep=',',header=0)
emission_df14.head()

Unnamed: 0,CHR,POS,SNP,A2,A1,MAF_AFR,MAF_EUR,AFR_A,AFR_C,AFR_G,AFR_T,EUR_A,EUR_C,EUR_G,EUR_T
0,14,19000017,rs375700886,C,T,0.0,0.0,1e-06,1.0,1e-06,1e-06,1e-06,1.0,1e-06,1e-06
1,14,19000050,rs543746158,G,A,0.0,0.0,1e-06,1e-06,1.0,1e-06,1e-06,1e-06,1.0,1e-06
2,14,19000056,rs561973970,A,T,0.0,0.0,1.0,1e-06,1e-06,1e-06,1.0,1e-06,1e-06,1e-06
3,14,19000059,rs201622908,G,T,0.0,0.00498,1e-06,1e-06,1.0,1e-06,1e-06,1e-06,0.99502,0.00498
4,14,19000060,rs28973059,C,G,0.1188,0.3884,1e-06,0.8812,0.1188,1e-06,1e-06,0.6116,0.3884,1e-06


In [9]:
#load the second chr14 Rx1 file: /home/hmummey/teams/CSE284_SP21_A00/team3/simulated_files/admixEUR_AFR_chr14_Rx1_b.tsv
#(the earlier chr14 run used the _a file; this cell reads the _b file)
genotype_fp14 = "/home/hmummey/teams/CSE284_SP21_A00/team3/simulated_files/admixEUR_AFR_chr14_Rx1_b.tsv"
genotype_df14 = pd.read_csv(genotype_fp14, sep='\t',header = 0)
print(len(genotype_df14))
genotype_df14.head()

2539145


Unnamed: 0,POS,A1,A2,POP1,POP2
0,19000017,T,T,0,0
1,19000050,A,A,0,0
2,19000056,T,T,0,0
3,19000059,T,T,0,0
4,19000060,G,G,0,0


In [12]:
#chr14 Rx1_b run; output goes to the HMM_Log10_Test_Outputs dir and the file name records the recomb value
#recomb is 0.01 here, whereas the earlier runtime tests in this notebook used 0.1
output_fp14 = "HMM_Log10_Test_Outputs/admixEUR_AFR_chr14_Rx1_b_recomb0.01_HMMoutput.tsv" 
recomb14 = 0.01
HMM_implementation_wrapper(popA, popB, genotype_fp14, emission_df14, recomb14, output_fp14, "14")

Files cut to share POS 2021-06-01 11:31:34


get_optimal_path: 19it [00:00, 189.96it/s]

SNP sequence created 2021-06-01 11:34:07
Transition matrix made 2021-06-01 11:34:07


get_optimal_path: 2378699it [2:58:03, 230.35it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

get_optimal_path: 2500619it [3:07:34, 222.19it/s]
reconstruct_path:   3%|▎         | 87153/2500619 [00:00<00:02, 871501.74it/s]

Optimal path algorithm complete 2021-06-01 14:41:42


reconstruct_path: 100%|██████████| 2500619/2500619 [00:02<00:00, 879053.04it/s]
output_path:  17%|█▋        | 431422/2500619 [00:00<00:00, 2195899.34it/s]

Path reconstruction complete 2021-06-01 14:41:45


output_path: 100%|██████████| 2500619/2500619 [00:01<00:00, 2311189.58it/s]


Path saved 2021-06-01 14:41:51


Chr 21

In [13]:
#chr21
#emissions table for chr21; comma-separated, read the same way as the chr14 table
#NOTE(review): absolute path on the original author's server -- adjust before re-running elsewhere
chr21_emissions_fp = "/home/hmummey/teams/CSE284_SP21_A00/team3/chromosome_21_files/chr21_genotypes_afr_eur_allelefreqs.bybp.csv"
emission_df21 = pd.read_csv(chr21_emissions_fp, sep=',',header=0)
emission_df21.head()

Unnamed: 0,CHR,POS,SNP,A2,A1,MAF_AFR,MAF_EUR,AFR_A,AFR_C,AFR_G,AFR_T,EUR_A,EUR_C,EUR_G,EUR_T
0,21,9411239,rs559462325,G,A,0.0,0.0,1e-06,1e-06,1.0,1e-06,1e-06,1e-06,1.0,1e-06
1,21,9411245,rs181691356,C,A,0.000893,0.001992,0.000893,0.999107,1e-06,1e-06,0.001992,0.998008,1e-06,1e-06
2,21,9411264,rs548263598,A,C,0.000893,0.0,0.999107,0.000893,1e-06,1e-06,1.0,1e-06,1e-06,1e-06
3,21,9411267,rs561987868,G,T,0.0,0.0,1e-06,1e-06,1.0,1e-06,1e-06,1e-06,1.0,1e-06
4,21,9411302,rs531010746,G,T,0.01161,0.0,1e-06,1e-06,0.98839,0.01161,1e-06,1e-06,1.0,1e-06


In [14]:
#chr21 Rx4_a genotype file (tab-separated); loaded here to report its row count and preview it
#the wrapper call is given the path, not this frame
genotype_fp21 = "/home/hmummey/teams/CSE284_SP21_A00/team3/simulated_files/admixEUR_AFR_chr21_Rx4_a.tsv" 
genotype_df21 = pd.read_csv(genotype_fp21, sep='\t',header = 0)
print(len(genotype_df21))
genotype_df21.head()

1054445


Unnamed: 0,POS,A1,A2,POP1,POP2
0,9411239,A,A,0,1
1,9411245,A,A,0,1
2,9411264,C,C,0,1
3,9411267,T,T,0,1
4,9411302,T,T,0,1


In [16]:
#chr21 Rx4_a run with recomb 0.01 (recorded in the output file name); CHR column of the output is labelled "21"
output_fp21 = "HMM_Log10_Test_Outputs/admixEUR_AFR_chr21_Rx4_a_recomb0.01_HMMoutput.tsv"
recomb21 = 0.01
HMM_implementation_wrapper(popA, popB, genotype_fp21, emission_df21, recomb21, output_fp21, "21")

Files cut to share POS 2021-06-01 15:25:32


get_optimal_path: 23it [00:00, 224.01it/s]

SNP sequence created 2021-06-01 15:26:37
Transition matrix made 2021-06-01 15:26:37


get_optimal_path: 121080it [09:19, 232.41it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

get_optimal_path: 1037584it [1:18:31, 220.21it/s]
reconstruct_path:   8%|▊         | 86725/1037584 [00:00<00:01, 867223.95it/s]

Optimal path algorithm complete 2021-06-01 16:45:08


reconstruct_path: 100%|██████████| 1037584/1037584 [00:01<00:00, 941896.99it/s]
output_path:   0%|          | 0/1037584 [00:00<?, ?it/s]

Path reconstruction complete 2021-06-01 16:45:09


output_path: 100%|██████████| 1037584/1037584 [00:00<00:00, 2286587.31it/s]


Path saved 2021-06-01 16:45:12


In [17]:
#NOTE(review): leftover scratch cell that only prints a fixed string -- presumably a check that
#the kernel still responded after the long runs; consider removing it from the final notebook
print("test")

test
