In [1]:
!pip install toolz scikit-allel

import os
import numpy as np
import pandas as pd
import ast
import math
import re
import itertools
import random
import json
import matplotlib
from matplotlib.pyplot import plot as plt
from scipy.spatial.distance import squareform
from pylab import *
mpl.use("pgf")
# activate latex text rendering
rc('text', usetex=True)
rc('axes', linewidth=2)
rc('font', weight='bold')
mpl.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
    'text.latex.preamble':r'\usepackage{sfmath} \boldmath'
})
%matplotlib inline
import seaborn as sns
sns.set_theme(style="whitegrid")
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import allel
# sns.set_theme(style="ticks")


You should consider upgrading via the 'c:\users\lab\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.




In [2]:
# load data
root_dir = './data/'
new_data_header = ""
# get header
with open(root_dir + "DELL.chr22.genotypes.for.modeling.vcf", 'r') as f_in:
    # skip info
    for line_num in range(70):
        f_in.readline()

    new_data_header = f_in.readline()
# load data

# load genotype
genotypes = pd.read_csv(root_dir + "DELL.chr22.genotypes.for.modeling.vcf",
                        comment='#', sep='\t',
                        names=new_data_header.strip().split('\t'),
                        header=1,
                        index_col='Sample_id', dtype={'Sample_id':str})#.iloc[:, :-1]

headers = genotypes.columns[:]
genotypes.head()

Unnamed: 0_level_0,1,2,3,4,7,8,10,13,15,16,...,835,836,837,838,839,840,842,843,846,847
Sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HG00097,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0,...,0|0,0|1,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
HG00099,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
HG00100,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
HG00101,0|0,0|0,0|0,0|0,0|0,0|0,1|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
HG00102,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


In [3]:
ped_file = 'integrated_call_samples.20130502.ALL.ped'
pedigree = pd.read_csv(root_dir+ped_file, sep='\t', index_col='Individual ID')
pedigree.head()

Unnamed: 0_level_0,Family ID,Paternal ID,Maternal ID,Gender,Phenotype,Population,Relationship,Siblings,Second Order,Third Order,Children,Other Comments
Individual ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
HG00096,HG00096,0,0,1,0,GBR,unrel,0,0,0,0,0
HG00097,HG00097,0,0,2,0,GBR,unrel,0,0,0,0,0
HG00098,HG00098,0,0,1,0,GBR,unrel,0,0,0,0,0
HG00099,HG00099,0,0,2,0,GBR,unrel,0,0,0,0,0
HG00100,HG00100,0,0,2,0,GBR,unrel,0,0,0,0,0


In [4]:
Y_train = pedigree.loc[genotypes.index]['Population']
X = genotypes[genotypes.index.isin(Y_train.index)]
X = X.replace({
    '0|0': 0,
    '0|1': 1,
    '1|0': 2,
    '1|1': 3
})
X.shape

(2503, 573)

In [5]:
r = allel.rogers_huff_r(X.T)
LD = squareform(r ** 2)
LD.shape

(573, 573)

In [6]:
plt.figure(figsize=(8,8))
plt.imshow(LD)

<matplotlib.image.AxesImage at 0x20910012f70>

In [7]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
plt.hist(np.max(LD, axis=1), bins=bins)

(array([420.,  46.,  29.,  31.,  47.]),
 array([0. , 0.2, 0.4, 0.6, 0.8, 1. ]),
 <BarContainer object of 5 artists>)

In [8]:
LD_max_freqs = np.amax(LD, axis=1)
# mean_LDs = np.mean(LD, axis=1)
# max_mean_LD = np.max(mean_LDs)
# min_mean_LD = np.min(mean_LDs)
# mean_LDs_min_max_scaled = (mean_LDs - min_mean_LD)/(max_mean_LD - min_mean_LD)
bin_labels = np.digitize(LD_max_freqs, bins=bins, right=True)
# bin_labels = np.digitize(mean_LDs_min_max_scaled, bins=bins[1:], right=True)
bin_general_labels, bin_counts = np.unique(bin_labels, return_counts=True)
bin_general_labels, bin_counts

(array([1, 2, 3, 4, 5], dtype=int64),
 array([420,  46,  29,  31,  47], dtype=int64))

In [9]:
columns = ["Fold", "MissingRate", "LD", "Accuracy", "Sample ID"]
dfs = []

N_SPLITS=3
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = X[X.index.isin(Y_train.index)].to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.05,
                       0.1,
                       0.2]:
    dataframe = pd.DataFrame(columns = columns)
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/Chr22/DEL/SCDA/preds_mixed_mr_{missing_perc}_rs_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes = result_genotypes.replace({
        '0|0': 0,
        '0|1': 1,
        '1|0': 2,
        '1|1': 3
    })
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    for i in tqdm(list(range(y.shape[0]))):
        missing_index, _ = train_test_split(np.arange(y.shape[1]), train_size=missing_perc,
                                            random_state=i + fold,
                                            shuffle=True,
                                            stratify=bin_labels
                                            )
        # for mi in missing_index:
        #     maf_bin_label = maf_bin_labels[mi]
        #     df.loc[-1] = [fold, missing_perc, maf_bin_label, int(yhat[i, mi]==y[i, mi])]
        #     df.index += 1
        a = np.reshape(bin_labels[missing_index], (-1, 1))
        b = np.array([fold for _ in missing_index]).reshape((-1, 1))
        c = np.array([missing_perc for _ in missing_index]).reshape((-1, 1))
        d = (yhat[i, missing_index]==y[i, missing_index]).astype(int).reshape((-1, 1))
        e = np.array([X.index[test_index][i] for _ in missing_index]).astype(object).reshape((-1, 1))
        dataframe = pd.concat([pd.DataFrame(np.concatenate([b,c,a,d,e], axis=1), columns=dataframe.columns), dataframe], ignore_index=True)
    dfs.append(dataframe)

df = pd.DataFrame(columns = columns)
print("Combining the results...")
for i, dff in tqdm(enumerate(dfs)):
    df = pd.concat([df, dff], ignore_index=True)
del dfs

df["LD"] = df["LD"].replace({
    1.0: '0 <= LD < 0.2',
    2.0: '0.2 <= LD < 0.4',
    3.0: '0.4 <= LD < 0.6',
    4.0: '0.6 <= LD < 0.8',
    5.0: '0.8 <= LD <= 1',
})
df

Fold: 1
Missing rate: 0.05


AssertionError: 

In [None]:
df.to_csv("results_in_table/Chr22/DEL/LD_SCDA.csv")

In [None]:
df.groupby(["MissingRate", "LD"]).agg( {"Accuracy":["mean", "std"]})

In [None]:
df1 = df.groupby(["Fold", "MissingRate", "Sample ID"]).agg( {"Accuracy":"mean"}).reset_index()
df1 = df1.groupby(["Fold", "MissingRate"]).agg( {"Accuracy":"mean"})
df1

In [None]:
df1 = df.groupby(["Fold", "MissingRate", "Sample ID"]).agg( {"Accuracy":"mean"}).reset_index()
df1 = df1.groupby(["Fold", "MissingRate"]).agg( {"Accuracy":"mean"}).reset_index()# df1.columns = df1.columns.droplevel(0)
df1 = df1.groupby(["MissingRate"]).agg( {"Accuracy":["mean", "std"]})#.reset_index()
df1

In [27]:
# print(df.groupby(["MissingRate", "LD"]).agg( {"Accuracy":["mean", "std"]}).round({'mean': 4})#.rename(columns={"Accuracy": "Max Accuracy"})
#             .to_latex(multirow=True))

In [10]:
columns = ["Fold", "MissingRate", "LD", "Accuracy", "Sample ID"]
dfs = []

N_SPLITS=3
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = X[X.index.isin(Y_train.index)].to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.05,
                       0.1,
                       0.2]:
    dataframe = pd.DataFrame(columns = columns)
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/Chr22/DEL/AE/preds_mixed_mr_{missing_perc}_fold_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    # result_genotypes = result_genotypes.replace({
    #     '0|0': 0,
    #     '0|1': 1,
    #     '1|0': 2,
    #     '1|1': 3
    # })
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    for i in tqdm(list(range(y.shape[0]))):
        missing_index, _ = train_test_split(np.arange(y.shape[1]), train_size=missing_perc,
                                            random_state=i + fold,
                                            shuffle=True,
                                            stratify=bin_labels
                                            )
        # for mi in missing_index:
        #     maf_bin_label = maf_bin_labels[mi]
        #     df.loc[-1] = [fold, missing_perc, maf_bin_label, int(yhat[i, mi]==y[i, mi])]
        #     df.index += 1
        a = np.reshape(bin_labels[missing_index], (-1, 1))
        b = np.array([fold for _ in missing_index]).reshape((-1, 1))
        c = np.array([missing_perc for _ in missing_index]).reshape((-1, 1))
        d = (yhat[i, missing_index]==y[i, missing_index]).astype(int).reshape((-1, 1))
        e = np.array([X.index[test_index][i] for _ in missing_index]).astype(object).reshape((-1, 1))
        dataframe = pd.concat([pd.DataFrame(np.concatenate([b,c,a,d,e], axis=1), columns=dataframe.columns), dataframe], ignore_index=True)
    dfs.append(dataframe)

df = pd.DataFrame(columns = columns)
print("Combining the results...")
for i, dff in tqdm(enumerate(dfs)):
    df = pd.concat([df, dff], ignore_index=True)
del dfs

df["LD"] = df["LD"].replace({
    1.0: '0 <= LD < 0.2',
    2.0: '0.2 <= LD < 0.4',
    3.0: '0.4 <= LD < 0.6',
    4.0: '0.6 <= LD < 0.8',
    5.0: '0.8 <= LD <= 1',
})
df

Fold: 1
Missing rate: 0.05


AssertionError: 

In [11]:
df.to_csv("results_in_table/Chr22/DEL/LD_AE.csv")

In [11]:
df.groupby(["MissingRate", "LD"]).agg( {"Accuracy":["mean", "std"]})

NameError: name 'df' is not defined

In [12]:
df1 = df.groupby(["Fold", "MissingRate", "Sample ID"]).agg( {"Accuracy":"mean"}).reset_index()
df1 = df1.groupby(["Fold", "MissingRate"]).agg( {"Accuracy":"mean"})
df1

NameError: name 'df' is not defined

In [13]:
df1 = df.groupby(["Fold", "MissingRate", "Sample ID"]).agg( {"Accuracy":"mean"}).reset_index()
df1 = df1.groupby(["Fold", "MissingRate"]).agg( {"Accuracy":"mean"}).reset_index()# df1.columns = df1.columns.droplevel(0)
df1 = df1.groupby(["MissingRate"]).agg( {"Accuracy":["mean", "std"]})#.reset_index()
df1

NameError: name 'df' is not defined

In [15]:
# print(df.groupby(["MissingRate", "LD"]).agg( {"Accuracy":["mean", "std"]}).round({'mean': 4})#.rename(columns={"Accuracy": "Max Accuracy"})
#             .to_latex(multirow=True))

In [14]:
columns = ["Fold", "MissingRate", "LD", "Accuracy", "Sample ID"]
dfs = []

N_SPLITS=3
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = X[X.index.isin(Y_train.index)].to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.05,
                       0.1,
                       0.2]:
    dataframe = pd.DataFrame(columns = columns)
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/Chr22/DEL/HLA_DEEP/preds_mixed_mr_{missing_perc}_rs_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes = result_genotypes.replace({
        '0|0': 0,
        '0|1': 1,
        '1|0': 2,
        '1|1': 3
    })
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    for i in tqdm(list(range(y.shape[0]))):
        missing_index, _ = train_test_split(np.arange(y.shape[1]), train_size=missing_perc,
                                            random_state=i + fold,
                                            shuffle=True,
                                            stratify=bin_labels
                                            )
        # for mi in missing_index:
        #     maf_bin_label = maf_bin_labels[mi]
        #     df.loc[-1] = [fold, missing_perc, maf_bin_label, int(yhat[i, mi]==y[i, mi])]
        #     df.index += 1
        a = np.reshape(bin_labels[missing_index], (-1, 1))
        b = np.array([fold for _ in missing_index]).reshape((-1, 1))
        c = np.array([missing_perc for _ in missing_index]).reshape((-1, 1))
        d = (yhat[i, missing_index]==y[i, missing_index]).astype(int).reshape((-1, 1))
        e = np.array([X.index[test_index][i] for _ in missing_index]).astype(object).reshape((-1, 1))
        dataframe = pd.concat([pd.DataFrame(np.concatenate([b,c,a,d,e], axis=1), columns=dataframe.columns), dataframe], ignore_index=True)
    dfs.append(dataframe)

df = pd.DataFrame(columns = columns)
print("Combining the results...")
for i, dff in tqdm(enumerate(dfs)):
    df = pd.concat([df, dff], ignore_index=True)
del dfs

df["LD"] = df["LD"].replace({
    1.0: '0 <= LD < 0.2',
    2.0: '0.2 <= LD < 0.4',
    3.0: '0.4 <= LD < 0.6',
    4.0: '0.6 <= LD < 0.8',
    5.0: '0.8 <= LD <= 1',
})
df

Fold: 1
Missing rate: 0.05


AssertionError: 

In [15]:
df.to_csv("results_in_table/Chr22/DEL/LD_HLA_DEEP.csv")

NameError: name 'df' is not defined

In [18]:
df.groupby(["MissingRate", "LD"]).agg( {"Accuracy":["mean", "std"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Accuracy
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
MissingRate,LD,Unnamed: 2_level_2,Unnamed: 3_level_2
0.05,0 <= LD < 0.2,0.953294,0.21101
0.05,0.2 <= LD < 0.4,0.985417,0.119886
0.05,0.4 <= LD < 0.6,0.9996,0.019988
0.05,0.6 <= LD < 0.8,0.996604,0.058181
0.05,0.8 <= LD <= 1,0.985417,0.119886
0.1,0 <= LD < 0.2,0.952971,0.211703
0.1,0.2 <= LD < 0.4,0.986017,0.117427
0.1,0.4 <= LD < 0.6,0.999334,0.025798
0.1,0.6 <= LD < 0.8,0.996671,0.057608
0.1,0.8 <= LD <= 1,0.985537,0.119393


In [19]:
df1 = df.groupby(["Fold", "MissingRate", "Sample ID"]).agg( {"Accuracy":"mean"}).reset_index()
df1 = df1.groupby(["Fold", "MissingRate"]).agg( {"Accuracy":"mean"})
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy
Fold,MissingRate,Unnamed: 2_level_1
1,0.05,0.961762
1,0.1,0.961782
1,0.2,0.961603
2,0.05,0.962273
2,0.1,0.96304
2,0.2,0.962083
3,0.05,0.963857
3,0.1,0.963839
3,0.2,0.963314


In [20]:
df1 = df.groupby(["Fold", "MissingRate", "Sample ID"]).agg( {"Accuracy":"mean"}).reset_index()
df1 = df1.groupby(["Fold", "MissingRate"]).agg( {"Accuracy":"mean"}).reset_index()# df1.columns = df1.columns.droplevel(0)
df1 = df1.groupby(["MissingRate"]).agg( {"Accuracy":["mean", "std"]})#.reset_index()
df1

Unnamed: 0_level_0,Accuracy,Accuracy
Unnamed: 0_level_1,mean,std
MissingRate,Unnamed: 1_level_2,Unnamed: 2_level_2
0.05,0.962631,0.001093
0.1,0.962887,0.001037
0.2,0.962333,0.000882


In [21]:
# print(df.groupby(["MissingRate", "LD"]).agg( {"Accuracy":["mean", "std"]}).round({'mean': 4})#.rename(columns={"Accuracy": "Max Accuracy"})
#             .to_latex(multirow=True))

In [10]:

columns = ["Fold", "MissingRate", "LD", "Accuracy", "Sample ID"]
dfs = []

N_SPLITS=3
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = X[X.index.isin(Y_train.index)].to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.05,
                       0.1,
                       0.2]:
    dataframe = pd.DataFrame(columns = columns)
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/Chr22/DEL/STI/preds_mixed_mr_{missing_perc}_rs_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes = result_genotypes.replace({
        '0|0': 0,
        '0|1': 1,
        '1|0': 2,
        '1|1': 3
    })
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    for i in tqdm(list(range(y.shape[0]))):
        missing_index, _ = train_test_split(np.arange(y.shape[1]), train_size=missing_perc,
                                            random_state=i + fold,
                                            shuffle=True,
                                            stratify=bin_labels
                                            )
        # for mi in missing_index:
        #     maf_bin_label = maf_bin_labels[mi]
        #     df.loc[-1] = [fold, missing_perc, maf_bin_label, int(yhat[i, mi]==y[i, mi])]
        #     df.index += 1
        a = np.reshape(bin_labels[missing_index], (-1, 1))
        b = np.array([fold for _ in missing_index]).reshape((-1, 1))
        c = np.array([missing_perc for _ in missing_index]).reshape((-1, 1))
        d = (yhat[i, missing_index]==y[i, missing_index]).astype(int).reshape((-1, 1))
        e = np.array([X.index[test_index][i] for _ in missing_index]).astype(object).reshape((-1, 1))
        dataframe = pd.concat([pd.DataFrame(np.concatenate([b,c,a,d,e], axis=1), columns=dataframe.columns), dataframe], ignore_index=True)
    dfs.append(dataframe)

df = pd.DataFrame(columns = columns)
print("Combining the results...")
for i, dff in tqdm(enumerate(dfs)):
    df = pd.concat([df, dff], ignore_index=True)
del dfs

df["LD"] = df["LD"].replace({
    1.0: '0 <= LD < 0.2',
    2.0: '0.2 <= LD < 0.4',
    3.0: '0.4 <= LD < 0.6',
    4.0: '0.6 <= LD < 0.8',
    5.0: '0.8 <= LD <= 1',
})
df

Fold: 1
Missing rate: 0.05


100%|██████████| 835/835 [00:01<00:00, 715.43it/s]


Missing rate: 0.1


100%|██████████| 835/835 [00:01<00:00, 444.71it/s]


Missing rate: 0.2


100%|██████████| 835/835 [00:03<00:00, 262.61it/s]


Fold: 2
Missing rate: 0.05


100%|██████████| 834/834 [00:01<00:00, 712.99it/s]


Missing rate: 0.1


100%|██████████| 834/834 [00:01<00:00, 457.47it/s]


Missing rate: 0.2


100%|██████████| 834/834 [00:03<00:00, 265.90it/s]


Fold: 3
Missing rate: 0.05


100%|██████████| 834/834 [00:01<00:00, 740.90it/s]


Missing rate: 0.1


100%|██████████| 834/834 [00:01<00:00, 461.24it/s]


Missing rate: 0.2


100%|██████████| 834/834 [00:03<00:00, 266.77it/s]


Combining the results...


9it [00:00, 104.31it/s]


Unnamed: 0,Fold,MissingRate,LD,Accuracy,Sample ID
0,1,0.05,0 <= LD < 0.2,1,NA21144
1,1,0.05,0.2 <= LD < 0.4,1,NA21144
2,1,0.05,0 <= LD < 0.2,1,NA21144
3,1,0.05,0 <= LD < 0.2,1,NA21144
4,1,0.05,0 <= LD < 0.2,1,NA21144
...,...,...,...,...,...
498092,3,0.2,0 <= LD < 0.2,1,HG00099
498093,3,0.2,0.8 <= LD <= 1,1,HG00099
498094,3,0.2,0 <= LD < 0.2,1,HG00099
498095,3,0.2,0.4 <= LD < 0.6,1,HG00099


In [11]:
df.to_csv("results_in_table/Chr22/DEL/LD_STI.csv")

In [12]:
df.groupby(["MissingRate", "LD"]).agg( {"Accuracy":["mean", "std"]})


Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Accuracy
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
MissingRate,LD,Unnamed: 2_level_2,Unnamed: 3_level_2
0.05,0 <= LD < 0.2,0.954455,0.208499
0.05,0.2 <= LD < 0.4,0.987215,0.112355
0.05,0.4 <= LD < 0.6,0.999201,0.028262
0.05,0.6 <= LD < 0.8,0.997004,0.054663
0.05,0.8 <= LD <= 1,0.994007,0.077189
0.1,0 <= LD < 0.2,0.954046,0.209387
0.1,0.2 <= LD < 0.4,0.987115,0.112782
0.1,0.4 <= LD < 0.6,0.999068,0.03052
0.1,0.6 <= LD < 0.8,0.99707,0.054052
0.1,0.8 <= LD <= 1,0.993208,0.082136


In [13]:
df1 = df.groupby(["Fold", "MissingRate", "Sample ID"]).agg( {"Accuracy":"mean"}).reset_index()
df1 = df1.groupby(["Fold", "MissingRate"]).agg( {"Accuracy":"mean"})
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy
Fold,MissingRate,Unnamed: 2_level_1
1,0.05,0.964842
1,0.1,0.964009
1,0.2,0.96341
2,0.05,0.964029
2,0.1,0.965396
2,0.2,0.964166
3,0.05,0.9639
3,0.1,0.963903
3,0.2,0.963976


In [14]:
df1 = df.groupby(["Fold", "MissingRate", "Sample ID"]).agg( {"Accuracy":"mean"}).reset_index()
df1 = df1.groupby(["Fold", "MissingRate"]).agg( {"Accuracy":"mean"}).reset_index()# df1.columns = df1.columns.droplevel(0)
df1 = df1.groupby(["MissingRate"]).agg( {"Accuracy":["mean", "std"]})#.reset_index()
df1

Unnamed: 0_level_0,Accuracy,Accuracy
Unnamed: 0_level_1,mean,std
MissingRate,Unnamed: 1_level_2,Unnamed: 2_level_2
0.05,0.964257,0.000511
0.1,0.964436,0.000833
0.2,0.963851,0.000393


In [20]:

columns = ["Fold", "MissingRate", "LD", "Accuracy", "Sample ID"]
dfs = []

N_SPLITS=3
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=2022)
fold = 0
_x = X[X.index.isin(Y_train.index)].to_numpy()
for train_index, test_index in kf.split(_x):
  fold += 1
  print(f"Fold: {fold}")
  y = _x[test_index]
  for missing_perc in [0.05,
                       0.1,
                       0.2]:
    dataframe = pd.DataFrame(columns = columns)
    print(f"Missing rate: {missing_perc}")
    file_name = f"./experimental_results/Chr22/DEL/STI_WE/preds_mixed_mr_{missing_perc}_rs_{fold}_.csv"
    result_genotypes = pd.read_csv(file_name, index_col=0)
    result_genotypes = result_genotypes.replace({
        '0|0': 0,
        '0|1': 1,
        '1|0': 2,
        '1|1': 3
    })
    yhat = result_genotypes.to_numpy()
    assert y.shape == yhat.shape
    for i in tqdm(list(range(y.shape[0]))):
        missing_index, _ = train_test_split(np.arange(y.shape[1]), train_size=missing_perc,
                                            random_state=i + fold,
                                            shuffle=True,
                                            stratify=bin_labels
                                            )
        # for mi in missing_index:
        #     maf_bin_label = maf_bin_labels[mi]
        #     df.loc[-1] = [fold, missing_perc, maf_bin_label, int(yhat[i, mi]==y[i, mi])]
        #     df.index += 1
        a = np.reshape(bin_labels[missing_index], (-1, 1))
        b = np.array([fold for _ in missing_index]).reshape((-1, 1))
        c = np.array([missing_perc for _ in missing_index]).reshape((-1, 1))
        d = (yhat[i, missing_index]==y[i, missing_index]).astype(int).reshape((-1, 1))
        e = np.array([X.index[test_index][i] for _ in missing_index]).astype(object).reshape((-1, 1))
        dataframe = pd.concat([pd.DataFrame(np.concatenate([b,c,a,d,e], axis=1), columns=dataframe.columns), dataframe], ignore_index=True)
    dfs.append(dataframe)

df = pd.DataFrame(columns = columns)
print("Combining the results...")
for i, dff in tqdm(enumerate(dfs)):
    df = pd.concat([df, dff], ignore_index=True)
del dfs

df["LD"] = df["LD"].replace({
    1.0: '0 <= LD < 0.2',
    2.0: '0.2 <= LD < 0.4',
    3.0: '0.4 <= LD < 0.6',
    4.0: '0.6 <= LD < 0.8',
    5.0: '0.8 <= LD <= 1',
})
df

Fold: 1
Missing rate: 0.05


100%|██████████| 835/835 [00:01<00:00, 463.02it/s]


Missing rate: 0.1


100%|██████████| 835/835 [00:02<00:00, 292.01it/s]


Missing rate: 0.2


100%|██████████| 835/835 [00:04<00:00, 179.97it/s]


Fold: 2
Missing rate: 0.05


100%|██████████| 834/834 [00:01<00:00, 470.77it/s]


Missing rate: 0.1


100%|██████████| 834/834 [00:02<00:00, 312.21it/s]


Missing rate: 0.2


100%|██████████| 834/834 [00:04<00:00, 186.29it/s]


Fold: 3
Missing rate: 0.05


100%|██████████| 834/834 [00:01<00:00, 485.69it/s]


Missing rate: 0.1


100%|██████████| 834/834 [00:02<00:00, 307.98it/s]


Missing rate: 0.2


100%|██████████| 834/834 [00:04<00:00, 189.79it/s]


Combining the results...


9it [00:00, 89.11it/s]


Unnamed: 0,Fold,MissingRate,LD,Accuracy,Sample ID
0,1,0.05,0 <= LD < 0.2,1,NA21144
1,1,0.05,0.2 <= LD < 0.4,1,NA21144
2,1,0.05,0 <= LD < 0.2,1,NA21144
3,1,0.05,0 <= LD < 0.2,1,NA21144
4,1,0.05,0 <= LD < 0.2,1,NA21144
...,...,...,...,...,...
498092,3,0.2,0 <= LD < 0.2,1,HG00099
498093,3,0.2,0.8 <= LD <= 1,1,HG00099
498094,3,0.2,0 <= LD < 0.2,1,HG00099
498095,3,0.2,0.4 <= LD < 0.6,1,HG00099


In [21]:
df.groupby(["MissingRate", "LD"]).agg( {"Accuracy":["mean", "std"]})


Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Accuracy
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
MissingRate,LD,Unnamed: 2_level_2,Unnamed: 3_level_2
0.05,0 <= LD < 0.2,0.954036,0.209409
0.05,0.2 <= LD < 0.4,0.986416,0.115766
0.05,0.4 <= LD < 0.6,0.999201,0.028262
0.05,0.6 <= LD < 0.8,0.996604,0.058181
0.05,0.8 <= LD <= 1,0.984019,0.125414
0.1,0 <= LD < 0.2,0.953618,0.210313
0.1,0.2 <= LD < 0.4,0.985617,0.119068
0.1,0.4 <= LD < 0.6,0.998801,0.034602
0.1,0.6 <= LD < 0.8,0.996404,0.05986
0.1,0.8 <= LD <= 1,0.98314,0.128751


In [22]:
df1 = df.groupby(["Fold", "MissingRate", "Sample ID"]).agg( {"Accuracy":"mean"}).reset_index()
df1 = df1.groupby(["Fold", "MissingRate"]).agg( {"Accuracy":"mean"})
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy
Fold,MissingRate,Unnamed: 2_level_1
1,0.05,0.964328
1,0.1,0.963273
1,0.2,0.96258
2,0.05,0.961888
2,0.1,0.963208
2,0.2,0.962935
3,0.05,0.963215
3,0.1,0.962767
3,0.2,0.963019


In [23]:
df1 = df.groupby(["Fold", "MissingRate", "Sample ID"]).agg( {"Accuracy":"mean"}).reset_index()
df1 = df1.groupby(["Fold", "MissingRate"]).agg( {"Accuracy":"mean"}).reset_index()# df1.columns = df1.columns.droplevel(0)
df1 = df1.groupby(["MissingRate"]).agg( {"Accuracy":["mean", "std"]})#.reset_index()