In [13]:
import os
import numpy as np
import pandas as pd
import ast
import math
import re
import itertools
import random
import json
import matplotlib
from matplotlib.pyplot import plot as plt
from pylab import *
mpl.use("pgf")
# activate latex text rendering
rc('text', usetex=True)
rc('axes', linewidth=2)
rc('font', weight='bold')
mpl.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
    'text.latex.preamble':r'\usepackage{sfmath} \boldmath'
})
%matplotlib inline
import seaborn as sns
sns.set_theme(style="whitegrid")
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# sns.set_theme(style="ticks")


In [14]:
# load data
root_dir = './data/'
new_data_header = ""
# get header
with open(root_dir + "DELL.chr22.genotypes.full.vcf", 'r') as f_in:
    # skip info
    for line_num in range(70):
        f_in.readline()

    new_data_header = f_in.readline()
# load data

# load genotype
genotypes = pd.read_csv(root_dir + "DELL.chr22.genotypes.full.vcf",
                        comment='#', sep='\t',
                        names=new_data_header.strip().split('\t'),
                        header=None).iloc[:, 9:].T

headers = genotypes.columns[:]
genotypes


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,563,564,565,566,567,568,569,570,571,572
HG00096,0|0,0|0,0|0,0|0,0|0,0|0,1|1,0|0,0|0,0|0,...,0|0,0|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
HG00097,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0,...,0|0,0|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
HG00099,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
HG00100,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
HG00101,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NA21137,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
NA21141,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0,...,0|0,1|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
NA21142,0|0,0|0,0|0,0|0,0|0,0|0,0|1,0|0,0|0,0|0,...,0|0,1|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
NA21143,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0,...,0|0,0|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


In [15]:
ped_file = 'integrated_call_samples.20130502.ALL.ped'
pedigree = pd.read_csv(root_dir+ped_file, sep='\t', index_col='Individual ID')
pedigree.head()

Unnamed: 0_level_0,Family ID,Paternal ID,Maternal ID,Gender,Phenotype,Population,Relationship,Siblings,Second Order,Third Order,Children,Other Comments
Individual ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
HG00096,HG00096,0,0,1,0,GBR,unrel,0,0,0,0,0
HG00097,HG00097,0,0,2,0,GBR,unrel,0,0,0,0,0
HG00098,HG00098,0,0,1,0,GBR,unrel,0,0,0,0,0
HG00099,HG00099,0,0,2,0,GBR,unrel,0,0,0,0,0
HG00100,HG00100,0,0,2,0,GBR,unrel,0,0,0,0,0


In [16]:
Y_train = pedigree.loc[genotypes.index]['Population']
X = genotypes[genotypes.index.isin(Y_train.index)]
X = X.replace({
    '0|0': 0,
    '0|1': 1,
    '1|0': 1,
    '1|1': 2
})
X.shape

(2504, 573)

In [17]:
def iqs_score(data_imp, data_obs):
    #predict_onehot = SCDA.predict(test_X_missing[:, :, :])

    print('data_imp:', data_imp.shape)

    # data_imp: rows (snps); cols (samples)
    data_imp_012 = data_imp.transpose(1, 0)

    # data_imp_012 = data_imp - 1


    print('data_imp_012:', data_imp_012.shape)

    snp_cnt = data_imp_012.shape[0]
    sample_cnt = data_imp_012.shape[1]
    print('snp_cnt:', snp_cnt)
    print('sample_cnt:', sample_cnt)

    data_obs_012 = data_obs.transpose(1, 0)
    print('data_obs_012:', data_obs_012.shape)

    # data_obs_012 = data_obs - 1


    iqs = [0 for x in range(snp_cnt)]


    for i in tqdm(range(snp_cnt)):
        iqs_cnt = [[0 for x in range(3)] for y in range(3)]

        for j in range(sample_cnt):
            if(data_obs_012[i][j]==0) and (data_imp_012[i][j]==0):
                iqs_cnt[0][0] += 1
            elif(data_obs_012[i][j]==1) and (data_imp_012[i][j]==0):
                iqs_cnt[0][1] += 1
            elif(data_obs_012[i][j]==2) and (data_imp_012[i][j]==0):
                iqs_cnt[0][2] += 1

            elif(data_obs_012[i][j]==0) and (data_imp_012[i][j]==1):
                iqs_cnt[1][0] += 1
            elif(data_obs_012[i][j]==1) and (data_imp_012[i][j]==1):
                iqs_cnt[1][1] += 1
            elif(data_obs_012[i][j]==2) and (data_imp_012[i][j]==1):
                iqs_cnt[1][2] += 1

            elif(data_obs_012[i][j]==0) and (data_imp_012[i][j]==2):
                iqs_cnt[2][0] += 1
            elif(data_obs_012[i][j]==1) and (data_imp_012[i][j]==2):
                iqs_cnt[2][1] += 1
            elif(data_obs_012[i][j]==2) and (data_imp_012[i][j]==2):
                iqs_cnt[2][2] += 1

        po = (iqs_cnt[0][0] + iqs_cnt[1][1] + iqs_cnt[2][2])/sample_cnt

        pc_rows_0 = iqs_cnt[0][0] + iqs_cnt[0][1] + iqs_cnt[0][2]
        pc_rows_1 = iqs_cnt[1][0] + iqs_cnt[1][1] + iqs_cnt[1][2]
        pc_rows_2 = iqs_cnt[2][0] + iqs_cnt[2][1] + iqs_cnt[2][2]

        pc_cols_0 = iqs_cnt[0][0] + iqs_cnt[1][0] + iqs_cnt[2][0]
        pc_cols_1 = iqs_cnt[0][1] + iqs_cnt[1][1] + iqs_cnt[2][1]
        pc_cols_2 = iqs_cnt[0][2] + iqs_cnt[1][2] + iqs_cnt[2][2]

        pc = (pc_rows_0*pc_cols_0 + pc_rows_1*pc_cols_1 + pc_rows_2*pc_cols_2)/(sample_cnt**2)


        if pc == 1:
            iqs[i] = 1
        else:
            iqs[i] = (po - pc)/(1 - pc)


    print('iqs mean:', np.mean(iqs))
    return iqs, np.mean(iqs)

In [18]:
columns = ["Fold", "Missing Rate", "IQS", "Method"]
dfs = []
N_SPLITS=3

In [19]:
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = X[X.index.isin(Y_train.index)].to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.05,
                       0.1,
                       0.2]:
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/Chr22/DEL/minimac4/preds_mixed_mr_{missing_perc}_rs_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes = result_genotypes.replace({
        '0|0': 0,
        '0|1': 1,
        '1|0': 1,
        '1|1': 2
    })
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    iqs_a, iqs_mean_a = iqs_score(yhat, y)

    # print('iqs_mean:', iqs_mean_a)
    # print('iqs:', iqs_a)

    dataframe = pd.DataFrame(np.array([fold, missing_perc, iqs_mean_a, "Minimac4"]).reshape((1, -1)), columns=columns)
    dfs.append(dataframe)

Fold: 1
Missing rate: 0.05
data_imp: (835, 573)
data_imp_012: (573, 835)
snp_cnt: 573
sample_cnt: 835
data_obs_012: (573, 835)


100%|██████████| 573/573 [00:00<00:00, 917.71it/s]


iqs mean: 0.7810578595904459
Missing rate: 0.1
data_imp: (835, 573)
data_imp_012: (573, 835)
snp_cnt: 573
sample_cnt: 835
data_obs_012: (573, 835)


100%|██████████| 573/573 [00:00<00:00, 964.63it/s]


iqs mean: 0.7606228848142589
Missing rate: 0.2
data_imp: (835, 573)
data_imp_012: (573, 835)
snp_cnt: 573
sample_cnt: 835
data_obs_012: (573, 835)


100%|██████████| 573/573 [00:00<00:00, 1039.91it/s]


iqs mean: 0.7253093500793352
Fold: 2
Missing rate: 0.05
data_imp: (835, 573)
data_imp_012: (573, 835)
snp_cnt: 573
sample_cnt: 835
data_obs_012: (573, 835)


100%|██████████| 573/573 [00:00<00:00, 1034.27it/s]


iqs mean: 0.8093062444669576
Missing rate: 0.1
data_imp: (835, 573)
data_imp_012: (573, 835)
snp_cnt: 573
sample_cnt: 835
data_obs_012: (573, 835)


100%|██████████| 573/573 [00:00<00:00, 1021.39it/s]


iqs mean: 0.7886545024430117
Missing rate: 0.2
data_imp: (835, 573)
data_imp_012: (573, 835)
snp_cnt: 573
sample_cnt: 835
data_obs_012: (573, 835)


100%|██████████| 573/573 [00:00<00:00, 1005.27it/s]


iqs mean: 0.7650665157960085
Fold: 3
Missing rate: 0.05
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 1000.00it/s]


iqs mean: 0.7917040502006942
Missing rate: 0.1
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 1025.05it/s]


iqs mean: 0.7773012894646297
Missing rate: 0.2
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 1019.58it/s]

iqs mean: 0.7272036898358366





In [20]:
# load data
root_dir = './data/'
new_data_header = ""
# get header
with open(root_dir + "DELL.chr22.genotypes.for.modeling.vcf", 'r') as f_in:
    # skip info
    for line_num in range(70):
        f_in.readline()

    new_data_header = f_in.readline()
# load data

# load genotype
genotypes = pd.read_csv(root_dir + "DELL.chr22.genotypes.for.modeling.vcf",
                        comment='#', sep='\t',
                        names=new_data_header.strip().split('\t'),
                        header=1,
                        index_col='Sample_id', dtype={'Sample_id':str})

headers = genotypes.columns[:]
ped_file = 'integrated_call_samples.20130502.ALL.ped'
pedigree = pd.read_csv(root_dir+ped_file, sep='\t', index_col='Individual ID')
Y_train = pedigree.loc[genotypes.index]['Population']
X = genotypes[genotypes.index.isin(Y_train.index)]
X = X.replace({
    '0|0': 0,
    '0|1': 1,
    '1|0': 1,
    '1|1': 2
})

In [21]:
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = X[X.index.isin(Y_train.index)].to_numpy()[:, :-1]
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.05,
                       0.1,
                       0.2]:
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/Chr22/DEL/HLA_DEEP/preds_mixed_mr_{missing_perc}_rs_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes = result_genotypes.replace({
        '0|0': 0,
        '0|1': 1,
        '1|0': 1,
        '1|1': 2
    })
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    iqs_a, iqs_mean_a = iqs_score(yhat, y)

    # print('iqs_mean:', iqs_mean_a)
    # print('iqs:', iqs_a)

    dataframe = pd.DataFrame(np.array([fold, missing_perc, iqs_mean_a, "HLA*DEEP"]).reshape((1, -1)), columns=columns)
    dfs.append(dataframe)

Fold: 1
Missing rate: 0.05
data_imp: (835, 572)
data_imp_012: (572, 835)
snp_cnt: 572
sample_cnt: 835
data_obs_012: (572, 835)


100%|██████████| 572/572 [00:00<00:00, 979.46it/s]


iqs mean: 0.8964028078953203
Missing rate: 0.1
data_imp: (835, 572)
data_imp_012: (572, 835)
snp_cnt: 572
sample_cnt: 835
data_obs_012: (572, 835)


100%|██████████| 572/572 [00:00<00:00, 1017.80it/s]


iqs mean: 0.8768880289722408
Missing rate: 0.2
data_imp: (835, 572)
data_imp_012: (572, 835)
snp_cnt: 572
sample_cnt: 835
data_obs_012: (572, 835)


100%|██████████| 572/572 [00:00<00:00, 967.88it/s]


iqs mean: 0.8385497465651487
Fold: 2
Missing rate: 0.05
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1014.15it/s]


iqs mean: 0.876545957967092
Missing rate: 0.1
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1030.61it/s]


iqs mean: 0.8621072606174227
Missing rate: 0.2
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1040.01it/s]


iqs mean: 0.8426990265490621
Fold: 3
Missing rate: 0.05
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 977.81it/s] 


iqs mean: 0.8782428858082865
Missing rate: 0.1
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 933.14it/s] 


iqs mean: 0.8624152972610523
Missing rate: 0.2
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1025.09it/s]

iqs mean: 0.8352408330806385





In [22]:
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = X[X.index.isin(Y_train.index)].to_numpy()[:, :-1]
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.05,
                       0.1,
                       0.2]:
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/Chr22/DEL/SCDA/preds_mixed_mr_{missing_perc}_rs_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes = result_genotypes.replace({
        '0|0': 0,
        '0|1': 1,
        '1|0': 1,
        '1|1': 2
    })
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    iqs_a, iqs_mean_a = iqs_score(yhat, y)

    # print('iqs_mean:', iqs_mean_a)
    # print('iqs:', iqs_a)

    dataframe = pd.DataFrame(np.array([fold, missing_perc, iqs_mean_a, "SCDA+"]).reshape((1, -1)), columns=columns)
    dfs.append(dataframe)

Fold: 1
Missing rate: 0.05
data_imp: (835, 572)
data_imp_012: (572, 835)
snp_cnt: 572
sample_cnt: 835
data_obs_012: (572, 835)


100%|██████████| 572/572 [00:00<00:00, 1026.98it/s]


iqs mean: 0.890564282123477
Missing rate: 0.1
data_imp: (835, 572)
data_imp_012: (572, 835)
snp_cnt: 572
sample_cnt: 835
data_obs_012: (572, 835)


100%|██████████| 572/572 [00:00<00:00, 903.62it/s] 


iqs mean: 0.8692987403066246
Missing rate: 0.2
data_imp: (835, 572)
data_imp_012: (572, 835)
snp_cnt: 572
sample_cnt: 835
data_obs_012: (572, 835)


100%|██████████| 572/572 [00:00<00:00, 1034.36it/s]


iqs mean: 0.8200527116201248
Fold: 2
Missing rate: 0.05
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1026.96it/s]


iqs mean: 0.8865959485144825
Missing rate: 0.1
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1005.25it/s]


iqs mean: 0.8620584525452124
Missing rate: 0.2
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1037.99it/s]


iqs mean: 0.8164034005950548
Fold: 3
Missing rate: 0.05
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1017.80it/s]


iqs mean: 0.8715485577225671
Missing rate: 0.1
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1032.51it/s]


iqs mean: 0.8492924191985115
Missing rate: 0.2
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 927.07it/s]

iqs mean: 0.8207741148714447





In [23]:
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = X[X.index.isin(Y_train.index)].to_numpy()[:, :-1]
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.05,
                       0.1,
                       0.2]:
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/Chr22/DEL/AE/preds_mixed_mr_{missing_perc}_fold_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes = result_genotypes.replace({
        0 : 0,
        1 : 1,
        2 : 1,
        3 : 2
    })
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    iqs_a, iqs_mean_a = iqs_score(yhat, y)

    # print('iqs_mean:', iqs_mean_a)
    # print('iqs:', iqs_a)

    dataframe = pd.DataFrame(np.array([fold, missing_perc, iqs_mean_a, "AE"]).reshape((1, -1)), columns=columns)
    dfs.append(dataframe)

Fold: 1
Missing rate: 0.05
data_imp: (835, 572)
data_imp_012: (572, 835)
snp_cnt: 572
sample_cnt: 835
data_obs_012: (572, 835)


100%|██████████| 572/572 [00:00<00:00, 1030.62it/s]


iqs mean: 0.7306372096062076
Missing rate: 0.1
data_imp: (835, 572)
data_imp_012: (572, 835)
snp_cnt: 572
sample_cnt: 835
data_obs_012: (572, 835)


100%|██████████| 572/572 [00:00<00:00, 1026.95it/s]


iqs mean: 0.7453162231824787
Missing rate: 0.2
data_imp: (835, 572)
data_imp_012: (572, 835)
snp_cnt: 572
sample_cnt: 835
data_obs_012: (572, 835)


100%|██████████| 572/572 [00:00<00:00, 1017.82it/s]


iqs mean: 0.724838350395821
Fold: 2
Missing rate: 0.05
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 982.85it/s] 


iqs mean: 0.7789628136378918
Missing rate: 0.1
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1005.24it/s]


iqs mean: 0.8406451721493636
Missing rate: 0.2
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1106.35it/s]


iqs mean: 0.3688811188811189
Fold: 3
Missing rate: 0.05
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 971.14it/s] 


iqs mean: 0.7645384623626381
Missing rate: 0.1
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1028.24it/s]


iqs mean: 0.7977114318166706
Missing rate: 0.2
data_imp: (834, 572)
data_imp_012: (572, 834)
snp_cnt: 572
sample_cnt: 834
data_obs_012: (572, 834)


100%|██████████| 572/572 [00:00<00:00, 1019.58it/s]

iqs mean: 0.651826938307196





In [24]:
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = X[X.index.isin(Y_train.index)].to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.05,
                       0.1,
                       0.2]:
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/Chr22/DEL/STI_WE/preds_mixed_mr_{missing_perc}_rs_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes = result_genotypes.replace({
        '0|0': 0,
        '0|1': 1,
        '1|0': 1,
        '1|1': 2
    })
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    iqs_a, iqs_mean_a = iqs_score(yhat, y)

    # print('iqs_mean:', iqs_mean_a)
    # print('iqs:', iqs_a)

    dataframe = pd.DataFrame(np.array([fold, missing_perc, iqs_mean_a, "STI*WE"]).reshape((1, -1)), columns=columns)
    dfs.append(dataframe)


Fold: 1
Missing rate: 0.05
data_imp: (835, 573)
data_imp_012: (573, 835)
snp_cnt: 573
sample_cnt: 835
data_obs_012: (573, 835)


100%|██████████| 573/573 [00:00<00:00, 994.80it/s] 


iqs mean: 0.9572486015916011
Missing rate: 0.1
data_imp: (835, 573)
data_imp_012: (573, 835)
snp_cnt: 573
sample_cnt: 835
data_obs_012: (573, 835)


100%|██████████| 573/573 [00:00<00:00, 969.54it/s] 


iqs mean: 0.9331872308771746
Missing rate: 0.2
data_imp: (835, 573)
data_imp_012: (573, 835)
snp_cnt: 573
sample_cnt: 835
data_obs_012: (573, 835)


100%|██████████| 573/573 [00:00<00:00, 969.52it/s]


iqs mean: 0.8801961146942185
Fold: 2
Missing rate: 0.05
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 991.35it/s] 


iqs mean: 0.9566823500643878
Missing rate: 0.1
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 994.77it/s] 


iqs mean: 0.9274252900730815
Missing rate: 0.2
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 1032.44it/s]


iqs mean: 0.881315491776947
Fold: 3
Missing rate: 0.05
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 976.15it/s] 


iqs mean: 0.9459849922332983
Missing rate: 0.1
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 968.90it/s] 


iqs mean: 0.912673203328194
Missing rate: 0.2
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 915.34it/s]

iqs mean: 0.8539854827113504





In [25]:
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = X[X.index.isin(Y_train.index)].to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.05,
                       0.1,
                       0.2]:
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/Chr22/DEL/STI/preds_mixed_mr_{missing_perc}_rs_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes = result_genotypes.replace({
        '0|0': 0,
        '0|1': 1,
        '1|0': 1,
        '1|1': 2
    })
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    iqs_a, iqs_mean_a = iqs_score(yhat, y)

    # print('iqs_mean:', iqs_mean_a)
    # print('iqs:', iqs_a)

    dataframe = pd.DataFrame(np.array([fold, missing_perc, iqs_mean_a, "STI"]).reshape((1, -1)), columns=columns)
    dfs.append(dataframe)

df = pd.DataFrame(columns = columns)
print("Combining the results...")
df = pd.concat(dfs, ignore_index=True)
del dfs

df

Fold: 1
Missing rate: 0.05
data_imp: (835, 573)
data_imp_012: (573, 835)
snp_cnt: 573
sample_cnt: 835
data_obs_012: (573, 835)


100%|██████████| 573/573 [00:00<00:00, 1028.13it/s]


iqs mean: 0.9681944310614421
Missing rate: 0.1
data_imp: (835, 573)
data_imp_012: (573, 835)
snp_cnt: 573
sample_cnt: 835
data_obs_012: (573, 835)


100%|██████████| 573/573 [00:00<00:00, 1026.92it/s]


iqs mean: 0.9237875857774382
Missing rate: 0.2
data_imp: (835, 573)
data_imp_012: (573, 835)
snp_cnt: 573
sample_cnt: 835
data_obs_012: (573, 835)


100%|██████████| 573/573 [00:00<00:00, 999.79it/s] 


iqs mean: 0.8630704066941796
Fold: 2
Missing rate: 0.05
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 1032.44it/s]


iqs mean: 0.9715622754531428
Missing rate: 0.1
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 1015.96it/s]


iqs mean: 0.952017344129189
Missing rate: 0.2
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 1005.32it/s]


iqs mean: 0.8982521711450453
Fold: 3
Missing rate: 0.05
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 981.18it/s] 


iqs mean: 0.9526346454373085
Missing rate: 0.1
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 1032.39it/s]


iqs mean: 0.9273173110620081
Missing rate: 0.2
data_imp: (834, 573)
data_imp_012: (573, 834)
snp_cnt: 573
sample_cnt: 834
data_obs_012: (573, 834)


100%|██████████| 573/573 [00:00<00:00, 1045.64it/s]

iqs mean: 0.8727860573256593
Combining the results...





Unnamed: 0,Fold,Missing Rate,IQS,Method
0,1,0.05,0.7810578595904459,Minimac4
1,1,0.1,0.7606228848142589,Minimac4
2,1,0.2,0.7253093500793352,Minimac4
3,2,0.05,0.8093062444669576,Minimac4
4,2,0.1,0.7886545024430117,Minimac4
5,2,0.2,0.7650665157960085,Minimac4
6,3,0.05,0.7917040502006942,Minimac4
7,3,0.1,0.7773012894646297,Minimac4
8,3,0.2,0.7272036898358366,Minimac4
9,1,0.05,0.8964028078953203,HLA*DEEP


In [26]:
df["IQS"] = df["IQS"].astype(float)
df = df.round({'IQS': 5})
df

Unnamed: 0,Fold,Missing Rate,IQS,Method
0,1,0.05,0.78106,Minimac4
1,1,0.1,0.76062,Minimac4
2,1,0.2,0.72531,Minimac4
3,2,0.05,0.80931,Minimac4
4,2,0.1,0.78865,Minimac4
5,2,0.2,0.76507,Minimac4
6,3,0.05,0.7917,Minimac4
7,3,0.1,0.7773,Minimac4
8,3,0.2,0.7272,Minimac4
9,1,0.05,0.8964,HLA*DEEP


In [27]:
df.to_csv("results_in_table/Chr22/DEL/IQS_all.csv")


In [29]:
df1 = df.groupby(["Method", "Fold", "Missing Rate"]).agg( {"IQS":"mean"})
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,IQS
Method,Fold,Missing Rate,Unnamed: 3_level_1
AE,1,0.05,0.73064
AE,1,0.1,0.74532
AE,1,0.2,0.72484
AE,2,0.05,0.77896
AE,2,0.1,0.84065
AE,2,0.2,0.36888
AE,3,0.05,0.76454
AE,3,0.1,0.79771
AE,3,0.2,0.65183
HLA*DEEP,1,0.05,0.8964


In [30]:
df1 = df.groupby(["Method", "Fold", "Missing Rate"]).agg( {"IQS":"mean"}).reset_index()# df1.columns = df1.columns.droplevel(0)
df1 = df1.groupby(["Method", "Missing Rate"]).agg( {"IQS":["mean", "std"]})#.reset_index()
df1


Unnamed: 0_level_0,Unnamed: 1_level_0,IQS,IQS
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
Method,Missing Rate,Unnamed: 2_level_2,Unnamed: 3_level_2
AE,0.05,0.758047,0.024806
AE,0.1,0.79456,0.047743
AE,0.2,0.58185,0.188015
HLA*DEEP,0.05,0.88373,0.011005
HLA*DEEP,0.1,0.86714,0.008445
HLA*DEEP,0.2,0.83883,0.003738
Minimac4,0.05,0.794023,0.014268
Minimac4,0.1,0.775523,0.014099
Minimac4,0.2,0.739193,0.02243
SCDA+,0.05,0.882903,0.01003
