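This workflow creates the data files used for GSMR: it writes the unrelated-sample ID list with a header, converts the regenie summary statistics for the JAZF1 region into GSMR sumdata format, and checks which SNPs overlap with the bfile and the bgen file.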

In [1]:
import pandas as pd

## Region Extraction

### Create unrelated samples with header

In [2]:
# Headerless, tab-separated list of unrelated-sample IDs; the single column is labelled "IID" on load.
unrelated_ids_path = "/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/pleiotropy_geneticfiles/unrelated_n307259/UKB_genotypedatadownloaded0830_unrelatedCAUConly_307259indiv_529024snps_120319_withdrawnsubjects021820.fam_IDonly"
unrelated = pd.read_csv(
    unrelated_ids_path,
    sep="\t",
    header=None,
    names=["IID"],
)

In [3]:
# Preview the unrelated-sample table (single "IID" column) to check that the file parsed as expected.
unrelated

Unnamed: 0,IID
0,1000019
1,1000022
2,1000035
3,1000046
4,1000054
...,...
307250,6025346
307251,6025354
307252,6025363
307253,6025409


In [4]:
# Write the ID list to the working directory under the same file name as the source file.
# to_csv emits the column name ("IID") as a header line by default; index=False omits the row index.
unrelated.to_csv("UKB_genotypedatadownloaded0830_unrelatedCAUConly_307259indiv_529024snps_120319_withdrawnsubjects021820.fam_IDonly", sep="\t", index=False)

## Create GWAS sumdata format for GSMR

In [5]:
# Load the regenie results that were computed on the imputed data.
# The original regenie files are located on the Yale cluster:
#   ASTHMA: /home/yl2385/work/21fall/220110_regenie_asthma_rerun/asthma_PC10_step2_imp.regenie_ASTHMA.regenie
#   T2D:    /home/yl2385/work/21fall/220110_regenie_T2D_rerun/T2D_PC10_step2_imp.regenie_T2D.regenie
#   WC:     /home/yl2385/work/21fall/220110_regenie_WC_rerun/WC_PC10_step2_imp.regenie_WAISTcirc_invranknorm.regenie
#
# Every table is first subset to the jazf1 region (7 27868573 28273990) and
# reduced to the columns that are needed for the analysis.

_REGION_COLUMNS = ["CHROM", "GENPOS", "ID", "ALLELE0", "ALLELE1", "A1FREQ", "N", "BETA", "SE", "LOG10P"]


def _read_jazf1_region(regenie_path):
    """Read a space-separated regenie file and return only its jazf1-region rows.

    A row is kept when CHROM equals 7 and GENPOS lies within
    [27868573, 28273990] (both bounds inclusive). Only the columns listed in
    ``_REGION_COLUMNS`` are returned, in that order.
    """
    genome_wide = pd.read_csv(regenie_path, sep=" ")
    on_chr7 = genome_wide["CHROM"] == 7
    # Series.between is inclusive on both ends by default, i.e. >= start and <= end.
    in_window = genome_wide["GENPOS"].between(27868573, 28273990)
    return genome_wide[on_chr7 & in_window][_REGION_COLUMNS]


# asthma data
asthma_regenie = _read_jazf1_region("/mnt/mfs/hgrcgrid/homes/tf2478/regenie_rerun_yining_030222/asthma_PC10_step2_imp.regenie_ASTHMA.regenie")

# t2d data
t2d_regenie = _read_jazf1_region("/mnt/mfs/hgrcgrid/homes/tf2478/regenie_rerun_yining_030222/T2D_PC10_step2_imp.regenie_T2D.regenie")

# waist circumference data
waist_regenie = _read_jazf1_region("/mnt/mfs/hgrcgrid/homes/tf2478/regenie_rerun_yining_030222/WC_PC10_step2_imp.regenie_WAISTcirc_invranknorm.regenie")

In [6]:
# Preview the asthma results after restricting to the region and the selected columns.
asthma_regenie

Unnamed: 0,CHROM,GENPOS,ID,ALLELE0,ALLELE1,A1FREQ,N,BETA,SE,LOG10P
6069486,7,27869098,rs545409685,C,T,0.997340,339345,0.116773,0.071129,0.997184
6069487,7,27869261,7:27869261_CAGTA_C,C,CAGTA,0.998498,339345,0.036830,0.098431,0.149797
6069488,7,27869377,rs73075348,G,A,0.943372,339345,-0.000204,0.015226,0.004665
6069489,7,27869782,rs6948467,A,G,0.607234,339345,-0.002206,0.007223,0.119179
6069490,7,27869794,rs73075354,G,C,0.883803,339345,0.012311,0.011001,0.579898
...,...,...,...,...,...,...,...,...,...,...
6071548,7,28273623,7:28273623_TTTCCTTCCTTCC_T,T,TTTCCTTCCTTCC,0.842749,339345,0.000601,0.009892,0.021555
6071549,7,28273697,rs188426589,A,T,0.987408,339345,-0.004086,0.031982,0.046563
6071550,7,28273719,rs6944995,G,T,0.146693,339345,0.006399,0.009963,0.283409
6071551,7,28273829,rs192297723,C,A,0.988365,339345,0.013026,0.035082,0.148496


In [7]:
# Preview the T2D results after restricting to the region and the selected columns.
t2d_regenie

Unnamed: 0,CHROM,GENPOS,ID,ALLELE0,ALLELE1,A1FREQ,N,BETA,SE,LOG10P
6070158,7,27869098,rs545409685,C,T,0.997337,336074,0.038626,0.100036,0.155268
6070159,7,27869261,7:27869261_CAGTA_C,C,CAGTA,0.998492,336074,0.108386,0.140165,0.357178
6070160,7,27869377,rs73075348,G,A,0.943307,336074,0.016225,0.021522,0.345893
6070161,7,27869782,rs6948467,A,G,0.607261,336074,-0.020544,0.010241,1.348310
6070162,7,27869794,rs73075354,G,C,0.883898,336074,0.038589,0.015628,1.868420
...,...,...,...,...,...,...,...,...,...,...
6072220,7,28273623,7:28273623_TTTCCTTCCTTCC_T,T,TTTCCTTCCTTCC,0.842757,336074,-0.007129,0.014019,0.213893
6072221,7,28273697,rs188426589,A,T,0.987404,336074,-0.013464,0.045338,0.115494
6072222,7,28273719,rs6944995,G,T,0.146537,336074,0.013798,0.014138,0.482690
6072223,7,28273829,rs192297723,C,A,0.988329,336074,-0.051344,0.049307,0.526178


In [8]:
# Preview the waist-circumference results after restricting to the region and the selected columns.
waist_regenie

Unnamed: 0,CHROM,GENPOS,ID,ALLELE0,ALLELE1,A1FREQ,N,BETA,SE,LOG10P
6069968,7,27869098,rs545409685,C,T,0.997340,365499,0.000830,0.009980,0.029799
6069969,7,27869261,7:27869261_CAGTA_C,C,CAGTA,0.998506,365499,-0.018772,0.013917,0.751086
6069970,7,27869377,rs73075348,G,A,0.943390,365499,-0.002467,0.002139,0.604290
6069971,7,27869782,rs6948467,A,G,0.607228,365499,-0.002577,0.001015,1.953540
6069972,7,27869794,rs73075354,G,C,0.883827,365499,0.006933,0.001547,5.128300
...,...,...,...,...,...,...,...,...,...,...
6072031,7,28273623,7:28273623_TTTCCTTCCTTCC_T,T,TTTCCTTCCTTCC,0.842736,365499,-0.001481,0.001391,0.541901
6072032,7,28273697,rs188426589,A,T,0.987452,365499,-0.008732,0.004520,1.272780
6072033,7,28273719,rs6944995,G,T,0.146811,365499,0.003185,0.001400,1.640000
6072034,7,28273829,rs192297723,C,A,0.988319,365499,-0.002544,0.004923,0.218021


In [9]:
# gsmr sumdata uses SNP, a1, a2, a1_freq, bzx, bzx_se, bzx_pval, bzx_n, bzy, bzy_se, bzy_pval, bzy_n as columns
#
# Two of the required quantities are not present in the regenie tables and are
# derived here: A0FREQ and PVAL.
#
# NOTE(review): ALLELE0 is mapped to a1 (with 1 - A1FREQ as its frequency) while
# BETA is carried over unchanged. regenie's documentation appears to report BETA
# and A1FREQ with respect to ALLELE1, and GSMR expects effects for a1 -- confirm
# that the sign convention of bzx/bzy is the intended one.

# A0FREQ = 1 - A1FREQ. The accessor only indexes by column name, so it accepts a
# whole DataFrame (vectorised) as well as a single row.
get_a0freq = lambda row: 1 - row["A1FREQ"]

# PVAL = 10 ** (-LOG10P). Same remark: works on a row or on a whole frame.
get_pval = lambda row: 10 ** (-row["LOG10P"])


def _to_gsmr_trait_table(regenie_df, trait):
    """Return ``regenie_df`` reshaped into trait-prefixed columns for later merging.

    Parameters
    ----------
    regenie_df : pandas.DataFrame
        Table holding at least the columns ID, ALLELE0, ALLELE1, A1FREQ, N,
        BETA, SE and LOG10P.
    trait : str
        Prefix used for the trait-specific columns (e.g. "asthma").

    Returns
    -------
    pandas.DataFrame
        New frame with the columns SNP, a1, a2, <trait>_a1_freq, <trait>_beta,
        <trait>_se, <trait>_pval, <trait>_n. The input frame is not modified.
    """
    # .assign builds a new frame, so nothing is written into a filtered slice
    # (no SettingWithCopyWarning), and the column arithmetic is vectorised
    # instead of a Python-level call per row. It also behaves sensibly on an
    # empty region, where a row-wise apply would not return a Series.
    with_derived = regenie_df.assign(
        A0FREQ=get_a0freq(regenie_df),
        PVAL=get_pval(regenie_df),
    )
    # renaming all the columns for merging later on
    renamed = with_derived.rename(columns={
        "ID": "SNP",
        "ALLELE0": "a1",
        "ALLELE1": "a2",
        "A0FREQ": f"{trait}_a1_freq",
        "N": f"{trait}_n",
        "BETA": f"{trait}_beta",
        "SE": f"{trait}_se",
        "PVAL": f"{trait}_pval",
    })
    # keeping only relevant columns
    return renamed[[
        "SNP", "a1", "a2",
        f"{trait}_a1_freq", f"{trait}_beta", f"{trait}_se", f"{trait}_pval", f"{trait}_n",
    ]]


asthma_regenie = _to_gsmr_trait_table(asthma_regenie, "asthma")
t2d_regenie = _to_gsmr_trait_table(t2d_regenie, "t2d")
waist_regenie = _to_gsmr_trait_table(waist_regenie, "waist")

In [10]:
# Preview the asthma table after the derived columns were added and the columns renamed/selected.
asthma_regenie

Unnamed: 0,SNP,a1,a2,asthma_a1_freq,asthma_beta,asthma_se,asthma_pval,asthma_n
6069486,rs545409685,C,T,0.002660,0.116773,0.071129,0.100651,339345
6069487,7:27869261_CAGTA_C,C,CAGTA,0.001502,0.036830,0.098431,0.708277,339345
6069488,rs73075348,G,A,0.056628,-0.000204,0.015226,0.989316,339345
6069489,rs6948467,A,G,0.392766,-0.002206,0.007223,0.760013,339345
6069490,rs73075354,G,C,0.116197,0.012311,0.011001,0.263089,339345
...,...,...,...,...,...,...,...,...
6071548,7:28273623_TTTCCTTCCTTCC_T,T,TTTCCTTCCTTCC,0.157251,0.000601,0.009892,0.951580,339345
6071549,rs188426589,A,T,0.012592,-0.004086,0.031982,0.898333,339345
6071550,rs6944995,G,T,0.853307,0.006399,0.009963,0.520704,339345
6071551,rs192297723,C,A,0.011635,0.013026,0.035082,0.710402,339345


In [11]:
# Preview the T2D table after the derived columns were added and the columns renamed/selected.
t2d_regenie

Unnamed: 0,SNP,a1,a2,t2d_a1_freq,t2d_beta,t2d_se,t2d_pval,t2d_n
6070158,rs545409685,C,T,0.002663,0.038626,0.100036,0.699410,336074
6070159,7:27869261_CAGTA_C,C,CAGTA,0.001508,0.108386,0.140165,0.439362,336074
6070160,rs73075348,G,A,0.056693,0.016225,0.021522,0.450928,336074
6070161,rs6948467,A,G,0.392739,-0.020544,0.010241,0.044843,336074
6070162,rs73075354,G,C,0.116102,0.038589,0.015628,0.013539,336074
...,...,...,...,...,...,...,...,...
6072220,7:28273623_TTTCCTTCCTTCC_T,T,TTTCCTTCCTTCC,0.157243,-0.007129,0.014019,0.611093,336074
6072221,rs188426589,A,T,0.012596,-0.013464,0.045338,0.766489,336074
6072222,rs6944995,G,T,0.853463,0.013798,0.014138,0.329086,336074
6072223,rs192297723,C,A,0.011671,-0.051344,0.049307,0.297730,336074


In [12]:
# Preview the waist-circumference table after the derived columns were added and the columns renamed/selected.
waist_regenie

Unnamed: 0,SNP,a1,a2,waist_a1_freq,waist_beta,waist_se,waist_pval,waist_n
6069968,rs545409685,C,T,0.002660,0.000830,0.009980,0.933687,365499
6069969,7:27869261_CAGTA_C,C,CAGTA,0.001494,-0.018772,0.013917,0.177384,365499
6069970,rs73075348,G,A,0.056610,-0.002467,0.002139,0.248720,365499
6069971,rs6948467,A,G,0.392772,-0.002577,0.001015,0.011129,365499
6069972,rs73075354,G,C,0.116173,0.006933,0.001547,0.000007,365499
...,...,...,...,...,...,...,...,...
6072031,7:28273623_TTTCCTTCCTTCC_T,T,TTTCCTTCCTTCC,0.157264,-0.001481,0.001391,0.287144,365499
6072032,rs188426589,A,T,0.012548,-0.008732,0.004520,0.053361,365499
6072033,rs6944995,G,T,0.853189,0.003185,0.001400,0.022909,365499
6072034,rs192297723,C,A,0.011681,-0.002544,0.004923,0.605312,365499


### Test asthma_t2d sumdata for GSMR

In [13]:
# Pair asthma (exposure, bzx*) with T2D (outcome, bzy*) on variant ID and both alleles.
# Only the asthma-side allele frequency is kept as a1_freq; the T2D copy is dropped.
merge_keys = ["SNP", "a1", "a2"]
gsmr_column_names = {
    "asthma_a1_freq": "a1_freq",
    "asthma_beta": "bzx",
    "asthma_se": "bzx_se",
    "asthma_pval": "bzx_pval",
    "asthma_n": "bzx_n",
    "t2d_beta": "bzy",
    "t2d_se": "bzy_se",
    "t2d_pval": "bzy_pval",
    "t2d_n": "bzy_n",
}
asthma_v_t2d = (
    asthma_regenie
    .merge(t2d_regenie, how="inner", on=merge_keys)
    .drop(columns=["t2d_a1_freq"])
    .rename(columns=gsmr_column_names)
)
asthma_v_t2d

Unnamed: 0,SNP,a1,a2,a1_freq,bzx,bzx_se,bzx_pval,bzx_n,bzy,bzy_se,bzy_pval,bzy_n
0,rs545409685,C,T,0.002660,0.116773,0.071129,0.100651,339345,0.038626,0.100036,0.699410,336074
1,7:27869261_CAGTA_C,C,CAGTA,0.001502,0.036830,0.098431,0.708277,339345,0.108386,0.140165,0.439362,336074
2,rs73075348,G,A,0.056628,-0.000204,0.015226,0.989316,339345,0.016225,0.021522,0.450928,336074
3,rs6948467,A,G,0.392766,-0.002206,0.007223,0.760013,339345,-0.020544,0.010241,0.044843,336074
4,rs73075354,G,C,0.116197,0.012311,0.011001,0.263089,339345,0.038589,0.015628,0.013539,336074
...,...,...,...,...,...,...,...,...,...,...,...,...
2060,7:28273623_TTTCCTTCCTTCC_T,T,TTTCCTTCCTTCC,0.157251,0.000601,0.009892,0.951580,339345,-0.007129,0.014019,0.611093,336074
2061,rs188426589,A,T,0.012592,-0.004086,0.031982,0.898333,339345,-0.013464,0.045338,0.766489,336074
2062,rs6944995,G,T,0.853307,0.006399,0.009963,0.520704,339345,0.013798,0.014138,0.329086,336074
2063,rs192297723,C,A,0.011635,0.013026,0.035082,0.710402,339345,-0.051344,0.049307,0.297730,336074


In [14]:
# Write the merged table as a space-separated file with a header line and no row index.
asthma_v_t2d.to_csv("asthma_v_t2d_gsmr_data", sep=" ", index=False)

In [15]:
# Re-display the merged asthma-vs-T2D table (the frame written to "asthma_v_t2d_gsmr_data").
asthma_v_t2d

Unnamed: 0,SNP,a1,a2,a1_freq,bzx,bzx_se,bzx_pval,bzx_n,bzy,bzy_se,bzy_pval,bzy_n
0,rs545409685,C,T,0.002660,0.116773,0.071129,0.100651,339345,0.038626,0.100036,0.699410,336074
1,7:27869261_CAGTA_C,C,CAGTA,0.001502,0.036830,0.098431,0.708277,339345,0.108386,0.140165,0.439362,336074
2,rs73075348,G,A,0.056628,-0.000204,0.015226,0.989316,339345,0.016225,0.021522,0.450928,336074
3,rs6948467,A,G,0.392766,-0.002206,0.007223,0.760013,339345,-0.020544,0.010241,0.044843,336074
4,rs73075354,G,C,0.116197,0.012311,0.011001,0.263089,339345,0.038589,0.015628,0.013539,336074
...,...,...,...,...,...,...,...,...,...,...,...,...
2060,7:28273623_TTTCCTTCCTTCC_T,T,TTTCCTTCCTTCC,0.157251,0.000601,0.009892,0.951580,339345,-0.007129,0.014019,0.611093,336074
2061,rs188426589,A,T,0.012592,-0.004086,0.031982,0.898333,339345,-0.013464,0.045338,0.766489,336074
2062,rs6944995,G,T,0.853307,0.006399,0.009963,0.520704,339345,0.013798,0.014138,0.329086,336074
2063,rs192297723,C,A,0.011635,0.013026,0.035082,0.710402,339345,-0.051344,0.049307,0.297730,336074


In [16]:
# Write a two-column, space-separated list (SNP ID, a1 allele) without header or row index.
asthma_v_t2d[["SNP", "a1"]].to_csv("asthma_v_t2d_snps.allele", sep=" ", header=False, index=False)

### Test asthma regenie file against bfile and bgenfile to determine intersection of snps

In [5]:
import pandas as pd

In [6]:
# Reload the asthma regenie results and restrict them to the jazf1 window on chromosome 7,
# this time keeping every regenie column.
asthma_sumstats_path = "/mnt/mfs/hgrcgrid/homes/tf2478/regenie_rerun_yining_030222/asthma_PC10_step2_imp.regenie_ASTHMA.regenie"
asthma_regenie = pd.read_csv(asthma_sumstats_path, sep=" ")
on_chr7 = asthma_regenie["CHROM"] == 7
# between() is inclusive on both ends by default.
in_window = asthma_regenie["GENPOS"].between(27868573, 28273990)
asthma_regenie = asthma_regenie[on_chr7 & in_window]

In [7]:
# Preview the region subset with all regenie columns retained.
asthma_regenie

Unnamed: 0,CHROM,GENPOS,ID,ALLELE0,ALLELE1,A1FREQ,INFO,N,TEST,BETA,SE,CHISQ,LOG10P,EXTRA
6069486,7,27869098,rs545409685,C,T,0.997340,0.921984,339345,ADD,0.116773,0.071129,2.695200,0.997184,
6069487,7,27869261,7:27869261_CAGTA_C,C,CAGTA,0.998498,0.847488,339345,ADD,0.036830,0.098431,0.140004,0.149797,
6069488,7,27869377,rs73075348,G,A,0.943372,1.000000,339345,ADD,-0.000204,0.015226,0.000179,0.004665,
6069489,7,27869782,rs6948467,A,G,0.607234,0.993104,339345,ADD,-0.002206,0.007223,0.093309,0.119179,
6069490,7,27869794,rs73075354,G,C,0.883803,0.994174,339345,ADD,0.012311,0.011001,1.252430,0.579898,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6071548,7,28273623,7:28273623_TTTCCTTCCTTCC_T,T,TTTCCTTCCTTCC,0.842749,0.950661,339345,ADD,0.000601,0.009892,0.003687,0.021555,
6071549,7,28273697,rs188426589,A,T,0.987408,0.964993,339345,ADD,-0.004086,0.031982,0.016324,0.046563,
6071550,7,28273719,rs6944995,G,T,0.146693,0.995769,339345,ADD,0.006399,0.009963,0.412498,0.283409,
6071551,7,28273829,rs192297723,C,A,0.988365,0.874006,339345,ADD,0.013026,0.035082,0.137876,0.148496,


In [8]:
# Column 0 of the headerless allele file holds the SNP IDs; collapse them into a set
# and report how many distinct IDs there are.
allele_table = pd.read_csv("asthma_v_t2d_snps.allele", sep=" ", header=None)
snps = set(allele_table[0])
len(snps)

2063

In [9]:
# Keep only the regenie rows whose ID appears in the SNP set.
id_is_listed = asthma_regenie["ID"].isin(snps)
rel_asthma_regenie = asthma_regenie[id_is_listed]
rel_asthma_regenie

Unnamed: 0,CHROM,GENPOS,ID,ALLELE0,ALLELE1,A1FREQ,INFO,N,TEST,BETA,SE,CHISQ,LOG10P,EXTRA
6069486,7,27869098,rs545409685,C,T,0.997340,0.921984,339345,ADD,0.116773,0.071129,2.695200,0.997184,
6069487,7,27869261,7:27869261_CAGTA_C,C,CAGTA,0.998498,0.847488,339345,ADD,0.036830,0.098431,0.140004,0.149797,
6069488,7,27869377,rs73075348,G,A,0.943372,1.000000,339345,ADD,-0.000204,0.015226,0.000179,0.004665,
6069489,7,27869782,rs6948467,A,G,0.607234,0.993104,339345,ADD,-0.002206,0.007223,0.093309,0.119179,
6069490,7,27869794,rs73075354,G,C,0.883803,0.994174,339345,ADD,0.012311,0.011001,1.252430,0.579898,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6071548,7,28273623,7:28273623_TTTCCTTCCTTCC_T,T,TTTCCTTCCTTCC,0.842749,0.950661,339345,ADD,0.000601,0.009892,0.003687,0.021555,
6071549,7,28273697,rs188426589,A,T,0.987408,0.964993,339345,ADD,-0.004086,0.031982,0.016324,0.046563,
6071550,7,28273719,rs6944995,G,T,0.146693,0.995769,339345,ADD,0.006399,0.009963,0.412498,0.283409,
6071551,7,28273829,rs192297723,C,A,0.988365,0.874006,339345,ADD,0.013026,0.035082,0.137876,0.148496,


In [10]:
def chromsnp(row):
    """Build a ``CHROM:GENPOS_ALLELE0_ALLELE1`` identifier from one regenie row.

    ``row`` may be any mapping-like object indexable by the keys
    'CHROM', 'GENPOS', 'ALLELE0' and 'ALLELE1'.
    """
    chrom, pos = row['CHROM'], row['GENPOS']
    first_allele, second_allele = row['ALLELE0'], row['ALLELE1']
    return "{}:{}_{}_{}".format(chrom, pos, first_allele, second_allele)
# Build one chr:pos_ALLELE0_ALLELE1 identifier per retained regenie row (axis=1 calls chromsnp row by row).
asthma_regenie_chromsnp = rel_asthma_regenie.apply(chromsnp, axis=1)

In [11]:
# Collapse the identifiers into a set for the intersections below.
# set(...) accepts both the Series produced by the apply step and an existing set,
# so re-running this cell no longer fails with "'set' object has no attribute 'to_list'".
asthma_regenie_chromsnp = set(asthma_regenie_chromsnp)
asthma_regenie_chromsnp

{'7:28014039_G_T',
 '7:27962159_T_C',
 '7:27944415_AT_A',
 '7:28083813_T_C',
 '7:28176612_T_C',
 '7:27902404_T_C',
 '7:28148747_T_G',
 '7:28075172_C_T',
 '7:27935862_A_G',
 '7:28182177_A_G',
 '7:28068337_T_A',
 '7:27980766_C_T',
 '7:28202079_C_T',
 '7:28223842_C_T',
 '7:28272541_G_GAGAGAGAA',
 '7:28273986_C_T',
 '7:28248966_T_G',
 '7:27999518_C_CAAA',
 '7:27965391_A_G',
 '7:28241450_C_T',
 '7:27963437_C_T',
 '7:27989148_T_C',
 '7:28004716_A_T',
 '7:27937355_G_A',
 '7:27884860_T_G',
 '7:27995830_T_C',
 '7:28038375_A_G',
 '7:28187134_A_G',
 '7:27902124_T_TCA',
 '7:28038886_G_A',
 '7:27945339_T_C',
 '7:27950384_A_G',
 '7:28132104_G_A',
 '7:28152845_T_C',
 '7:28200097_T_C',
 '7:28051402_T_C',
 '7:28215169_C_T',
 '7:28113623_A_G',
 '7:28109834_C_G',
 '7:28012328_C_T',
 '7:28022296_C_T',
 '7:28055031_T_C',
 '7:28059800_T_C',
 '7:28026570_C_T',
 '7:27950265_C_T',
 '7:28225899_A_C',
 '7:28256240_A_G',
 '7:28046376_T_C',
 '7:28070377_TTAAA_T',
 '7:28266932_C_T',
 '7:28230502_A_G',
 '7:28156506_

In [12]:
# Load the genotype .bim file (tab-separated, no header, so columns are labelled 0..5).
# Columns as used in the cells below: 0 = chromosome, 1 = variant ID, 3 = position, 4 and 5 = alleles.
bfile = pd.read_csv("/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated082020removedwithdrawnindiv.bim", sep="\t", header=None)

In [13]:
# Show the bfile variants that fall inside the same chromosome 7 window used for the regenie data.
bfile_on_chr7 = bfile[0] == 7
# between() is inclusive on both ends by default.
bfile_in_window = bfile[3].between(27868573, 28273990)
bfile[bfile_on_chr7 & bfile_in_window]

Unnamed: 0,0,1,2,3,4,5
229225,7,rs73075348,0,27869377,G,A
229226,7,rs112759592,0,27875642,C,G
229227,7,rs10281008,0,27877537,G,A
229228,7,rs117933100,0,27879552,C,T
229229,7,rs58651394,0,27881058,T,C
...,...,...,...,...,...,...
229316,7,rs56142710,0,28260867,C,A
229317,7,rs73076597,0,28260970,T,C
229318,7,rs478266,0,28261344,T,C
229319,7,rs475615,0,28263825,T,C


The bfile cannot contain all 2065 SNPs that are in the regenie file: far fewer bfile variants fall within the region (see the row indices above).

In [14]:
def bchromsnp(row):
    """Build a ``chrom:pos_allele_allele`` identifier from one .bim row.

    ``row`` is indexed by position label: 0 (chromosome), 3 (position),
    4 and 5 (the two allele columns).
    """
    chrom, pos = row[0], row[3]
    first_allele, second_allele = row[4], row[5]
    return "{}:{}_{}_{}".format(chrom, pos, first_allele, second_allele)
# Build one identifier per .bim row; this runs over the whole bfile, not only the region subset.
b_chromsnp = bfile.apply(bchromsnp, axis=1)

In [15]:
# Collapse the bfile identifiers into a set for the intersection below.
# set(...) accepts both the Series produced by the apply step and an existing set,
# so re-running this cell no longer fails with "'set' object has no attribute 'to_list'".
b_chromsnp = set(b_chromsnp)
b_chromsnp

{'15:93581663_T_C',
 '14:21357547_A_G',
 '2:80211850_C_A',
 '6:4375064_C_T',
 '10:80873647_T_C',
 '6:76999932_C_T',
 '2:85119053_G_C',
 '3:190314468_T_C',
 '10:91651203_G_A',
 '1:247491277_G_A',
 '3:31328238_C_A',
 '6:26577401_C_T',
 '22:21263353_A_G',
 '3:48600145_T_C',
 '11:101658406_T_C',
 '2:8771521_A_C',
 '12:95941907_A_C',
 '12:22853733_G_A',
 '12:88393916_C_T',
 '21:28191916_G_A',
 '8:71346171_T_C',
 '3:12592368_C_T',
 '4:151450555_G_A',
 '6:70253835_A_G',
 '6:106537750_A_G',
 '2:9014947_C_T',
 '3:25592883_C_T',
 '8:138093633_T_C',
 '2:96737083_G_A',
 '4:162812065_T_C',
 '16:82731804_T_C',
 '2:241512581_A_G',
 '13:40679099_G_A',
 '15:79994582_T_C',
 '11:88349663_A_G',
 '17:78026382_A_G',
 '1:241907817_T_C',
 '4:20749360_T_C',
 '4:159001538_C_T',
 '20:10879336_C_T',
 '21:43787562_G_A',
 '17:5684985_G_A',
 '1:110461748_T_C',
 '5:10631431_C_T',
 '4:124538500_C_T',
 '8:98103766_C_G',
 '6:95188593_G_C',
 '11:117574774_T_C',
 '1:9710821_T_C',
 '3:109785766_T_C',
 '3:86456731_T_G',
 '1

In [16]:
# Number of chr:pos_allele_allele identifiers shared by the regenie subset and the bfile.
len(asthma_regenie_chromsnp & b_chromsnp)

74

In `chr:pos_a1_a2` format there are only 74 intersecting SNPs between the regenie file and the bfile.

In [17]:
# Distinct variant IDs (column 1) present in the bfile.
bfile_snps = set(bfile[1])

In [18]:
# Number of variant IDs shared by the regenie SNP list and the bfile.
len(snps & bfile_snps)

96

In `rsid` format there are 96 intersecting SNPs between the regenie file and the bfile.

In [2]:
# Load the chromosome 7 variant listing for the imputed (bgen) data: tab-separated, no header,
# so the columns are labelled 0..7. Column 0 is the identifier column used for the comparison below.
bgenfile = pd.read_csv("/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_mfi_chr7_v3.txt_infoabove0.8_maf0.01", sep="\t", header=None)

In [3]:
# Preview the bgen variant table (columns are unlabelled because it was read with header=None).
bgenfile

Unnamed: 0,0,1,2,3,4,5,6,7
0,7:27916_T_C,rs577290214,27916,T,C,0.052138,C,0.896544
1,7:30580_C_T,rs183052695,30580,C,T,0.039627,T,0.856357
2,7:30581_A_T,rs559579537,30581,A,T,0.039627,T,0.856357
3,7:30626_C_T,rs569327782,30626,C,T,0.010377,T,0.806582
4,7:31273_G_C,rs68115332,31273,G,C,0.151767,C,0.971182
...,...,...,...,...,...,...,...,...
565183,7:159128289_C_T,rs577986429,159128289,C,T,0.011603,T,0.837460
565184,7:159128337_A_T,rs117038851,159128337,A,T,0.027920,T,0.989628
565185,7:159128530_G_A,rs80176631,159128530,G,A,0.027696,A,0.926026
565186,7:159128533_C_T,rs75085106,159128533,C,T,0.027199,T,0.936634


In [20]:
# Distinct identifiers from column 0 of the bgen variant table.
# The displayed set contains rs-style IDs alongside chr:pos_allele_allele IDs.
bgen_chromsnp = set(bgenfile[0])
bgen_chromsnp

{'rs80306000',
 '7:19973987_G_T',
 '7:39656685_T_A',
 '7:41018719_C_T',
 '7:108529504_A_G',
 '7:62322883_T_C',
 '7:30136945_C_T',
 '7:22205696_G_A',
 'rs73217334',
 '7:91001617_G_T',
 '7:49069438_T_C',
 '7:8875235_C_T',
 '7:109468952_A_G',
 '7:154675463_A_G',
 '7:119147391_A_G',
 '7:150436391_CATTTCCTGATGCTAGCTCCATT_C',
 '7:5240140_C_G',
 '7:131463005_G_A',
 '7:152286148_A_C',
 '7:88325416_T_G',
 '7:111290144_T_A',
 '7:10622076_A_C',
 '7:68393539_T_C',
 '7:31611054_C_T',
 '7:62544557_C_G',
 '7:67746612_CAAAA_C',
 '7:16763322_G_A',
 '7:51905254_C_T',
 'rs218116',
 '7:77620790_C_T',
 '7:137445794_G_A',
 '7:103358920_C_A',
 '7:139640382_C_T',
 '7:93854234_T_C',
 '7:111717018_TG_T',
 '7:90311819_T_C',
 '7:88260924_A_C',
 '7:38762492_C_T',
 '7:14616881_A_C',
 '7:90453952_T_C',
 '7:82796800_A_G',
 '7:85060952_A_G',
 '7:3229720_C_T',
 '7:135767650_T_C',
 '7:5489303_T_C',
 '7:48601730_G_A',
 '7:55553385_C_T',
 '7:121410123_T_C',
 '7:19585813_T_C',
 '7:149897162_A_G',
 '7:23843315_A_C',
 '7:346

In [22]:
# Re-display the regenie-derived identifier set for visual comparison with the bgen identifiers.
asthma_regenie_chromsnp

{'7:28014039_G_T',
 '7:27962159_T_C',
 '7:27944415_AT_A',
 '7:28083813_T_C',
 '7:28176612_T_C',
 '7:27902404_T_C',
 '7:28148747_T_G',
 '7:28075172_C_T',
 '7:27935862_A_G',
 '7:28182177_A_G',
 '7:28068337_T_A',
 '7:27980766_C_T',
 '7:28202079_C_T',
 '7:28223842_C_T',
 '7:28272541_G_GAGAGAGAA',
 '7:28273986_C_T',
 '7:28248966_T_G',
 '7:27999518_C_CAAA',
 '7:27965391_A_G',
 '7:28241450_C_T',
 '7:27963437_C_T',
 '7:27989148_T_C',
 '7:28004716_A_T',
 '7:27937355_G_A',
 '7:27884860_T_G',
 '7:27995830_T_C',
 '7:28038375_A_G',
 '7:28187134_A_G',
 '7:27902124_T_TCA',
 '7:28038886_G_A',
 '7:27945339_T_C',
 '7:27950384_A_G',
 '7:28132104_G_A',
 '7:28152845_T_C',
 '7:28200097_T_C',
 '7:28051402_T_C',
 '7:28215169_C_T',
 '7:28113623_A_G',
 '7:28109834_C_G',
 '7:28012328_C_T',
 '7:28022296_C_T',
 '7:28055031_T_C',
 '7:28059800_T_C',
 '7:28026570_C_T',
 '7:27950265_C_T',
 '7:28225899_A_C',
 '7:28256240_A_G',
 '7:28046376_T_C',
 '7:28070377_TTAAA_T',
 '7:28266932_C_T',
 '7:28230502_A_G',
 '7:28156506_

In [21]:
# Number of identifiers shared by the regenie subset and the bgen variant table.
len(asthma_regenie_chromsnp & bgen_chromsnp)

0

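The regenie file is in HG38 and the bgen file is in HG19, so the two need to be brought onto the same genome build (i.e. lift the regenie file over to HG19) before their positions can be compared.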