In [10]:
import os
import numpy as np
import pandas as pd
import ast
import math
import re
import itertools
import random
import json
import matplotlib
from matplotlib.pyplot import plot as plt
from pylab import *
mpl.use("pgf")
# activate latex text rendering
rc('text', usetex=True)
rc('axes', linewidth=2)
rc('font', weight='bold')
mpl.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
    'text.latex.preamble':r'\usepackage{sfmath} \boldmath'
})
%matplotlib inline
import seaborn as sns
sns.set_theme(style="whitegrid")
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# sns.set_theme(style="ticks")


In [11]:
# load data
root_dir = './data/'
# load data

# load genotype
genotypes = pd.read_csv(root_dir+ 'genotype_full.txt',
                        sep='\t',
                        index_col=0)
genotypes[genotypes == -1] = 0
headers = genotypes.columns[:]
genotypes.head()


Unnamed: 0_level_0,33070_chrI_33070_A_T,33147_chrI_33147_G_T,33152_chrI_33152_T_C,33200_chrI_33200_C_T,33293_chrI_33293_A_T,33328_chrI_33328_C_A,33348_chrI_33348_G_C,33403_chrI_33403_C_T,33502_chrI_33502_A_G,33548_chrI_33548_A_C,...,12048853_chrXVI_925593_G_C,12049199_chrXVI_925939_T_C,12049441_chrXVI_926181_C_T,12050613_chrXVI_927353_T_G,12051167_chrXVI_927907_A_C,12051240_chrXVI_927980_A_G,12051367_chrXVI_928107_C_T,12052782_chrXVI_929522_C_T,12052988_chrXVI_929728_A_G,12053130_chrXVI_929870_C_T
SAMID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01_01,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
01_02,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
01_03,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
01_04,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
01_06,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
def iqs_score(data_imp, data_obs):
    #predict_onehot = SCDA.predict(test_X_missing[:, :, :])

    print('data_imp:', data_imp.shape)

    # data_imp: rows (snps); cols (samples)
    data_imp_012 = data_imp.transpose(1, 0)

    # data_imp_012 = data_imp - 1


    print('data_imp_012:', data_imp_012.shape)

    snp_cnt = data_imp_012.shape[0]
    sample_cnt = data_imp_012.shape[1]
    print('snp_cnt:', snp_cnt)
    print('sample_cnt:', sample_cnt)

    data_obs_012 = data_obs.transpose(1, 0)
    print('data_obs_012:', data_obs_012.shape)

    # data_obs_012 = data_obs - 1


    iqs = [0 for x in range(snp_cnt)]


    for i in tqdm(range(snp_cnt)):
        iqs_cnt = [[0 for x in range(3)] for y in range(3)]

        for j in range(sample_cnt):
            if(data_obs_012[i][j]==0) and (data_imp_012[i][j]==0):
                iqs_cnt[0][0] = iqs_cnt[0][0] + 1
            elif(data_obs_012[i][j]==1) and (data_imp_012[i][j]==0):
                iqs_cnt[0][1] = iqs_cnt[0][1] + 1
            elif(data_obs_012[i][j]==2) and (data_imp_012[i][j]==0):
                iqs_cnt[0][2] = iqs_cnt[0][2] + 1

            elif(data_obs_012[i][j]==0) and (data_imp_012[i][j]==1):
                iqs_cnt[1][0] = iqs_cnt[1][0] + 1
            elif(data_obs_012[i][j]==1) and (data_imp_012[i][j]==1):
                iqs_cnt[1][1] = iqs_cnt[1][1] + 1
            elif(data_obs_012[i][j]==2) and (data_imp_012[i][j]==1):
                iqs_cnt[1][2] = iqs_cnt[1][2] + 1

            elif(data_obs_012[i][j]==0) and (data_imp_012[i][j]==2):
                iqs_cnt[2][0] = iqs_cnt[2][0] + 1
            elif(data_obs_012[i][j]==1) and (data_imp_012[i][j]==2):
                iqs_cnt[2][1] = iqs_cnt[2][1] + 1
            elif(data_obs_012[i][j]==2) and (data_imp_012[i][j]==2):
                iqs_cnt[2][2] = iqs_cnt[2][2] + 1

        po = (iqs_cnt[0][0] + iqs_cnt[1][1]+ iqs_cnt[2][2])/sample_cnt

        pc_rows_0 = iqs_cnt[0][0] + iqs_cnt[0][1]+ iqs_cnt[0][2]
        pc_rows_1 = iqs_cnt[1][0] + iqs_cnt[1][1]+ iqs_cnt[1][2]
        pc_rows_2 = iqs_cnt[2][0] + iqs_cnt[2][1]+ iqs_cnt[2][2]

        pc_cols_0 = iqs_cnt[0][0] + iqs_cnt[1][0]+ iqs_cnt[2][0]
        pc_cols_1 = iqs_cnt[0][1] + iqs_cnt[1][1]+ iqs_cnt[2][1]
        pc_cols_2 = iqs_cnt[0][2] + iqs_cnt[1][2]+ iqs_cnt[2][2]

        pc = (pc_rows_0*pc_cols_0 + pc_rows_1*pc_cols_1 + pc_rows_2*pc_cols_2)/(sample_cnt**2)


        if pc == 1:
            iqs[i] = 1
        else:
            iqs[i] = (po - pc)/(1 - pc)


    print('iqs mean:', np.mean(iqs))
    return iqs, np.mean(iqs)

In [13]:
columns = ["Fold", "Missing Rate", "IQS", "Method"]
dfs = []
N_SPLITS=3

In [14]:
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = genotypes.to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.01,
                       0.05,
                       0.1,
                       0.2]:
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/yeast/HLA_DEEP/preds_mixed_mr_{missing_perc}_fold_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes[result_genotypes == -1] = 0
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    iqs_a, iqs_mean_a = iqs_score(yhat, y)

    # print('iqs_mean:', iqs_mean_a)
    # print('iqs:', iqs_a)

    dataframe = pd.DataFrame(np.array([fold, missing_perc, iqs_mean_a, "HLA*DEEP"]).reshape((1, -1)), columns=columns)
    dfs.append(dataframe)

Fold: 1
Missing rate: 0.01
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:52<00:00, 534.36it/s]


iqs mean: 0.9998297743737391
Missing rate: 0.05
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:50<00:00, 562.18it/s]


iqs mean: 0.9996911397501229
Missing rate: 0.1
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:50<00:00, 560.98it/s]


iqs mean: 0.9995218297263782
Missing rate: 0.2
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:50<00:00, 560.57it/s]


iqs mean: 0.9991786638425214
Fold: 2
Missing rate: 0.01
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:50<00:00, 555.15it/s]


iqs mean: 0.9998387238758168
Missing rate: 0.05
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:52<00:00, 542.61it/s]


iqs mean: 0.9996967991155525
Missing rate: 0.1
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:51<00:00, 547.41it/s]


iqs mean: 0.9995197886345294
Missing rate: 0.2
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:50<00:00, 558.73it/s]


iqs mean: 0.9991782264304861
Fold: 3
Missing rate: 0.01
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:49<00:00, 566.64it/s]


iqs mean: 0.9998652437151124
Missing rate: 0.05
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:49<00:00, 566.86it/s]


iqs mean: 0.9997211271665883
Missing rate: 0.1
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:50<00:00, 555.92it/s]


iqs mean: 0.9995465173966629
Missing rate: 0.2
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:49<00:00, 574.09it/s]

iqs mean: 0.9991951423896422





In [15]:
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = genotypes.to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.01,
                       0.05,
                       0.1,
                       0.2]:
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/yeast/SCDA/preds_mixed_mr_{missing_perc}_fold_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes[result_genotypes == -1] = 0
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    iqs_a, iqs_mean_a = iqs_score(yhat, y)

    # print('iqs_mean:', iqs_mean_a)
    # print('iqs:', iqs_a)

    dataframe = pd.DataFrame(np.array([fold, missing_perc, iqs_mean_a, "SCDA"]).reshape((1, -1)), columns=columns)
    dfs.append(dataframe)

Fold: 1
Missing rate: 0.01
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:49<00:00, 569.63it/s]


iqs mean: 0.9998946788676805
Missing rate: 0.05
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:49<00:00, 569.97it/s]


iqs mean: 0.9997453219937419
Missing rate: 0.1
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:49<00:00, 571.98it/s]


iqs mean: 0.9995612847448404
Missing rate: 0.2
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:50<00:00, 564.34it/s]


iqs mean: 0.999210159001086
Fold: 2
Missing rate: 0.01
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:49<00:00, 567.86it/s]


iqs mean: 0.9998414787372322
Missing rate: 0.05
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:49<00:00, 569.42it/s]


iqs mean: 0.9996991250538765
Missing rate: 0.1
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:48<00:00, 583.29it/s]


iqs mean: 0.9995172716934343
Missing rate: 0.2
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:49<00:00, 570.05it/s]


iqs mean: 0.9991710502793857
Fold: 3
Missing rate: 0.01
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:48<00:00, 582.39it/s]


iqs mean: 0.9998542693138176
Missing rate: 0.05
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:49<00:00, 569.28it/s]


iqs mean: 0.9997114069059717
Missing rate: 0.1
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:49<00:00, 575.16it/s]


iqs mean: 0.9995328630100891
Missing rate: 0.2
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:49<00:00, 567.17it/s]

iqs mean: 0.9991727434975075





In [16]:
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = genotypes.to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.01,
                       0.05,
                       0.1,
                       0.2]:
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/yeast/AE/preds_mixed_mr_{missing_perc}_fold_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes[result_genotypes == -1] = 0
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    iqs_a, iqs_mean_a = iqs_score(yhat, y)

    # print('iqs_mean:', iqs_mean_a)
    # print('iqs:', iqs_a)

    dataframe = pd.DataFrame(np.array([fold, missing_perc, iqs_mean_a, "AE"]).reshape((1, -1)), columns=columns)
    dfs.append(dataframe)

Fold: 1
Missing rate: 0.01
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:47<00:00, 588.69it/s]


iqs mean: 0.9999541978598373
Missing rate: 0.05
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:47<00:00, 591.14it/s]


iqs mean: 0.9998002911686824
Missing rate: 0.1
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:47<00:00, 592.38it/s]


iqs mean: 0.9996109904730566
Missing rate: 0.2
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:47<00:00, 591.52it/s]


iqs mean: 0.9992477428214512
Fold: 2
Missing rate: 0.01
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:48<00:00, 578.57it/s]


iqs mean: 0.9999277175551007
Missing rate: 0.05
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:48<00:00, 586.21it/s]


iqs mean: 0.9997824558797147
Missing rate: 0.1
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 596.84it/s]


iqs mean: 0.9995890676631985
Missing rate: 0.2
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 593.20it/s]


iqs mean: 0.9992169598984915
Fold: 3
Missing rate: 0.01
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 596.64it/s]


iqs mean: 0.9999378591912056
Missing rate: 0.05
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 590.50it/s]


iqs mean: 0.9997786186470409
Missing rate: 0.1
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:48<00:00, 577.40it/s]


iqs mean: 0.9996027001618768
Missing rate: 0.2
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:48<00:00, 587.65it/s]

iqs mean: 0.9992164750178015





In [17]:
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = genotypes.to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.01,
                       0.05,
                       0.1,
                       0.2]:
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/yeast/STI_wo_emb/preds_mixed_mr_{missing_perc}_fold_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes[result_genotypes == -1] = 0
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    iqs_a, iqs_mean_a = iqs_score(yhat, y)

    # print('iqs_mean:', iqs_mean_a)
    # print('iqs:', iqs_a)

    dataframe = pd.DataFrame(np.array([fold, missing_perc, iqs_mean_a, "STI*WE"]).reshape((1, -1)), columns=columns)
    dfs.append(dataframe)

Fold: 1
Missing rate: 0.01
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:47<00:00, 597.54it/s]


iqs mean: 0.9997248754000115
Missing rate: 0.05
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:47<00:00, 593.71it/s]


iqs mean: 0.999598453890404
Missing rate: 0.1
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:47<00:00, 596.57it/s]


iqs mean: 0.9994409913197551
Missing rate: 0.2
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:47<00:00, 596.75it/s]


iqs mean: 0.999132174418715
Fold: 2
Missing rate: 0.01
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 595.20it/s]


iqs mean: 0.9997600895297347
Missing rate: 0.05
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 591.93it/s]


iqs mean: 0.9996180219463825
Missing rate: 0.1
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 598.28it/s]


iqs mean: 0.999434252446694
Missing rate: 0.2
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 590.42it/s]


iqs mean: 0.9990854037098065
Fold: 3
Missing rate: 0.01
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 597.27it/s]


iqs mean: 0.9998095466846395
Missing rate: 0.05
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 599.83it/s]


iqs mean: 0.9996632523502983
Missing rate: 0.1
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:46<00:00, 600.58it/s]


iqs mean: 0.9994866572863538
Missing rate: 0.2
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 598.52it/s]

iqs mean: 0.9991349475114798





In [18]:
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = genotypes.to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.01,
                       0.05,
                       0.1,
                       0.2]:
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/yeast/STI/preds_mixed_mr_{missing_perc}_fold_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes[result_genotypes == -1] = 0
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    iqs_a, iqs_mean_a = iqs_score(yhat, y)

    # print('iqs_mean:', iqs_mean_a)
    # print('iqs:', iqs_a)

    dataframe = pd.DataFrame(np.array([fold, missing_perc, iqs_mean_a, "STI"]).reshape((1, -1)), columns=columns)
    dfs.append(dataframe)

df = pd.DataFrame(columns = columns)
print("Combining the results...")
df = pd.concat(dfs, ignore_index=True)
del dfs

df

Fold: 1
Missing rate: 0.01
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:47<00:00, 597.44it/s]


iqs mean: 0.9998838309481506
Missing rate: 0.05
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:47<00:00, 598.39it/s]


iqs mean: 0.9997828835311254
Missing rate: 0.1
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:47<00:00, 599.32it/s]


iqs mean: 0.999667690382948
Missing rate: 0.2
data_imp: (1464, 28220)
data_imp_012: (28220, 1464)
snp_cnt: 28220
sample_cnt: 1464
data_obs_012: (28220, 1464)


100%|██████████| 28220/28220 [00:47<00:00, 599.25it/s]


iqs mean: 0.9994294458563554
Fold: 2
Missing rate: 0.01
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 598.13it/s]


iqs mean: 0.9998147284228881
Missing rate: 0.05
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 598.42it/s]


iqs mean: 0.999722828163956
Missing rate: 0.1
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 597.35it/s]


iqs mean: 0.9996091728808921
Missing rate: 0.2
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 598.62it/s]


iqs mean: 0.9993712455087284
Fold: 3
Missing rate: 0.01
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 599.30it/s]


iqs mean: 0.9998432339330708
Missing rate: 0.05
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 599.35it/s]


iqs mean: 0.9997506587450659
Missing rate: 0.1
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 599.84it/s]


iqs mean: 0.9996270083595543
Missing rate: 0.2
data_imp: (1463, 28220)
data_imp_012: (28220, 1463)
snp_cnt: 28220
sample_cnt: 1463
data_obs_012: (28220, 1463)


100%|██████████| 28220/28220 [00:47<00:00, 599.88it/s]


iqs mean: 0.9993814046896261
Combining the results...


Unnamed: 0,Fold,MissingRate,IQS,Method
0,1,0.01,0.9998297743737392,HLA*DEEP
1,1,0.05,0.9996911397501228,HLA*DEEP
2,1,0.1,0.9995218297263782,HLA*DEEP
3,1,0.2,0.9991786638425214,HLA*DEEP
4,2,0.01,0.9998387238758168,HLA*DEEP
5,2,0.05,0.9996967991155524,HLA*DEEP
6,2,0.1,0.9995197886345294,HLA*DEEP
7,2,0.2,0.999178226430486,HLA*DEEP
8,3,0.01,0.9998652437151124,HLA*DEEP
9,3,0.05,0.9997211271665885,HLA*DEEP


In [19]:
df["IQS"] = df["IQS"].astype(float)
df = df.round({'IQS': 5})
df

Unnamed: 0,Fold,MissingRate,IQS,Method
0,1,0.01,0.99983,HLA*DEEP
1,1,0.05,0.99969,HLA*DEEP
2,1,0.1,0.99952,HLA*DEEP
3,1,0.2,0.99918,HLA*DEEP
4,2,0.01,0.99984,HLA*DEEP
5,2,0.05,0.9997,HLA*DEEP
6,2,0.1,0.99952,HLA*DEEP
7,2,0.2,0.99918,HLA*DEEP
8,3,0.01,0.99987,HLA*DEEP
9,3,0.05,0.99972,HLA*DEEP


In [20]:
df.to_csv("results_in_table/yeast/IQS_all.csv")


In [21]:
df1 = df.groupby(["Method", "Fold", "Missing Rate"]).agg( {"IQS":"mean"})
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,IQS
Method,Fold,MissingRate,Unnamed: 3_level_1
AE,1,0.01,0.99995
AE,1,0.05,0.9998
AE,1,0.1,0.99961
AE,1,0.2,0.99925
AE,2,0.01,0.99993
AE,2,0.05,0.99978
AE,2,0.1,0.99959
AE,2,0.2,0.99922
AE,3,0.01,0.99994
AE,3,0.05,0.99978


In [22]:
df1 = df.groupby(["Method", "Fold", "Missing Rate"]).agg( {"IQS":"mean"}).reset_index()# df1.columns = df1.columns.droplevel(0)
df1 = df1.groupby(["Method", "Missing Rate"]).agg( {"IQS":["mean", "std"]})#.reset_index()
df1


Unnamed: 0_level_0,Unnamed: 1_level_0,IQS,IQS
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
Method,MissingRate,Unnamed: 2_level_2,Unnamed: 3_level_2
AE,0.01,0.99994,1e-05
AE,0.05,0.999787,1.2e-05
AE,0.1,0.9996,1e-05
AE,0.2,0.99923,1.7e-05
HLA*DEEP,0.01,0.999847,2.1e-05
HLA*DEEP,0.05,0.999703,1.5e-05
HLA*DEEP,0.1,0.99953,1.7e-05
HLA*DEEP,0.2,0.999187,1.2e-05
SCDA,0.01,0.99986,2.6e-05
SCDA,0.05,0.99972,2.6e-05
