# Region Extraction

## Aim

To extract the summary statistics and genotypes for specific genomic regions and to calculate the LD matrix of each region.

## Input and Output

Region extract input:

- list of regions, each given as [chromosome, start, end]; e.g. for rs849335 (Chr 7 at bp 28223990) a ±500 kb window is [7, 27723990, 28723990]
- all genotype
- all summary stats (beta and se(beta))
- list of unrelated individuals

output:

- regional genotype
- regional LD accounting for related individuals (computed on the unrelated individuals only)
- regional summary stats

- Genotype files:
/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv
- Bgen files:
/SAY/dbgapstg/scratch/UKBiobank/genotype_files/ukb39554_imputeddataset/
- Summary stats for imputed variants:
/SAY/dbgapstg/scratch/UKBiobank/results/BOLTLMM_results/results_imputed_data
- Phenotype files:
/SAY/dbgapstg/scratch/UKBiobank/phenotype_files/pleiotropy_R01/phenotypesforanalysis
- relationship file:
/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620

sos run Region_Extraction.ipynb -j 1

## Main function

In [None]:
[global]
# Regions to extract.
# NOTE(review): the extraction functions in this notebook index each region as
# [chromosome, start, end]; the default [7] does not have that shape — confirm
# the intended format before relying on the default.
parameter: regions = [7]
# Genotype file in plink binary format
parameter: ukb_geno_path = path('/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv')
# Path to summary stats file
parameter: sumstat_path= path('/SAY/dbgapstg/scratch/UKBiobank/results/BOLTLMM_results/results_imputed_data/INT-BMI/ukb_imp_v3.UKB_caucasians_BMIwaisthip_AsthmaAndT2D_INT-BMI_withagesex_041720.BoltLMM.stats.all_chr.gz')
# Path to samples of unrelated individuals
parameter: unrelated_samples = path('/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620')


## All in One

In [104]:
#[global]
# Summary statistics for INT-BMI (gzip-compressed, all chromosomes in one file).
sumstat_path = '/SAY/dbgapstg/scratch/UKBiobank/results/BOLTLMM_results/results_imputed_data/INT-BMI/ukb_imp_v3.UKB_caucasians_BMIwaisthip_AsthmaAndT2D_INT-BMI_withagesex_041720.BoltLMM.stats.all_chr.gz'
# Alternative input: the asthma fastGWA summary statistics; uncomment to use it instead.
#sumstat_path = '/SAY/dbgapstg/scratch/UKBiobank/results/BOLTLMM_results/results_imputed_data/asthma/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.stats.all_chr'
# Common prefix of the PLINK .bed/.bim/.fam genotype files.
ukb_geno_path='/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv'

# File describing the unrelated individuals; extract_regions reads its FID column.
unrelated_samples='/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620'


In [97]:
#[global]
def extract_regions(regions,sumstat_path,geno_path,unr_path):
    """Load all inputs from disk and extract BETA, SE and LD for every region.

    Note: a second function with the same name but a different signature
    (taking already-loaded data) is defined later in this notebook; whichever
    cell ran last is the one in effect.

    Parameters
    ----------
    regions : list
        Regions to extract, each indexed as [chromosome, start, end].
    sumstat_path : str or path
        Tab-separated summary statistics. Read as gzip first; if the file is
        not gzip-compressed it is re-read as plain text.
    geno_path : str or path
        Prefix of the PLINK genotype files, passed to ``read_plink``.
    unr_path : str or path
        Space-separated file with an ``FID`` column listing the unrelated
        individuals.

    Returns
    -------
    list
        One ``extract_region`` result per region, in the order given.

    Raises
    ------
    FileNotFoundError
        If ``sumstat_path`` does not exist.
    """
    try:
        gwas = pd.read_csv(sumstat_path, compression='gzip', header=0, sep='\t', quotechar='"')
    except FileNotFoundError:
        # A missing file is not a "wrong format" problem; report it as-is.
        raise
    except OSError:
        # gzip signals "not a gzip file" with an OSError subclass. Only that
        # case falls back to plain text; other errors (KeyboardInterrupt,
        # MemoryError, parser errors, ...) are no longer swallowed.
        print("It is not BOLT result")
        gwas = pd.read_csv(sumstat_path, header=0, sep='\t', quotechar='"')
    geno = read_plink(geno_path)
    unr = pd.read_csv(unr_path, header=0, sep=' ', quotechar='"')

    # Row labels of the genotyped samples whose family ID is in the unrelated list.
    fid = geno[1].fid.astype(int)
    fid_bool = fid.isin(unr.FID)
    fid_ind = fid_bool[fid_bool].index

    return [extract_region(region, gwas, geno, fid_ind) for region in regions]

In [105]:
# Each region is [chromosome, start_bp, end_bp]; the second one spans
# 500 kb on either side of chr7:28223990.
rg_infos= extract_regions([[16,53800754,53845487],[7,28223990-500000,28223990+500000]],
                            sumstat_path,ukb_geno_path,unrelated_samples)

Mapping files: 100%|##########| 3/3 [00:42<00:00, 14.28s/it]


## Import and Path

In [78]:
#[global]
from pathlib import Path
import numpy as np
import pandas as pd
from pandas_plink import read_plink

# Root of the UK Biobank scratch area; every other location is derived from it.
ukb_path = Path('/SAY/dbgapstg/scratch/UKBiobank/')
# NOTE: this rebinds ukb_geno_path (set to the PLINK file prefix in the
# "All in One" cell) to the directory that contains the genotype files.
ukb_geno_path = ukb_path.joinpath('genotype_files/pleiotropy_geneticfiles')
ukb_phen_path = ukb_path.joinpath('phenotype_files/pleiotropy_R01/phenotypesforanalysis')
ukb_gwas_path = ukb_path.joinpath('results/BOLTLMM_results/results_imputed_data')
print(ukb_gwas_path)

/SAY/dbgapstg/scratch/UKBiobank/results/BOLTLMM_results/results_imputed_data


## Summary Stat

### Read GWAS stat

In [21]:
# Summary statistics for INT-BMI: one gzip-compressed, tab-separated file.
bmi_bolt_path = ukb_gwas_path.joinpath(
    'INT-BMI/ukb_imp_v3.UKB_caucasians_BMIwaisthip_AsthmaAndT2D_INT-BMI_withagesex_041720.BoltLMM.stats.all_chr.gz'
)
bmi_bolt = pd.read_csv(
    bmi_bolt_path,
    sep='\t',
    header=0,
    quotechar='"',
    compression='gzip',
)

In [37]:
# Show rows 3 to 6 (positional) of the BMI summary statistics.
bmi_bolt.iloc[3:7,]

Unnamed: 0,SNP,CHR,BP,GENPOS,ALLELE1,ALLELE0,A1FREQ,F_MISS,CHISQ_LINREG,P_LINREG,BETA,SE,CHISQ_BOLT_LMM_INF,P_BOLT_LMM_INF,CHISQ_BOLT_LMM,P_BOLT_LMM
3,rs79373928,1,801536,0.005872,G,T,0.014871,0.000596,0.009184,0.92,-0.005915,0.008938,0.43801,0.51,0.358727,0.55
4,rs11240779,1,808631,0.006208,G,A,0.224724,0.006986,1.30599,0.25,0.003471,0.002604,1.77719,0.18,1.59823,0.21
5,rs57181708,1,809876,0.006208,G,A,0.100445,0.000874,6.64697,0.0099,0.008757,0.003594,5.93807,0.015,5.17968,0.023
6,rs4970383,1,838555,0.006208,A,C,0.245613,0.001787,0.43358,0.51,0.000175,0.002512,0.00487,0.94,0.003883,0.95


In [22]:
# Display the BMI summary-statistics table (head and tail as rendered by pandas).
bmi_bolt

Unnamed: 0,SNP,CHR,BP,GENPOS,ALLELE1,ALLELE0,A1FREQ,F_MISS,CHISQ_LINREG,P_LINREG,BETA,SE,CHISQ_BOLT_LMM_INF,P_BOLT_LMM_INF,CHISQ_BOLT_LMM,P_BOLT_LMM
0,rs3131962,1,756604,0.004907,A,G,0.129619,0.001543,0.143797,0.700,-0.000596,0.003224,0.034136,0.850,0.008625,0.930
1,rs12562034,1,768448,0.004957,A,G,0.105098,0.001601,3.206650,0.073,0.006521,0.003525,3.423000,0.064,2.742840,0.098
2,rs4040617,1,779322,0.005007,G,A,0.127412,0.002416,0.681178,0.410,-0.001951,0.003249,0.360661,0.550,0.294739,0.590
3,rs79373928,1,801536,0.005872,G,T,0.014871,0.000596,0.009184,0.920,-0.005915,0.008938,0.438010,0.510,0.358727,0.550
4,rs11240779,1,808631,0.006208,G,A,0.224724,0.006986,1.305990,0.250,0.003471,0.002604,1.777190,0.180,1.598230,0.210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529019,rs4824147,22,51173542,0.750810,T,C,0.061359,0.000558,0.011819,0.910,-0.001627,0.004553,0.127749,0.720,0.211562,0.650
529020,rs73174435,22,51174939,0.750825,T,C,0.053825,0.001157,3.009070,0.083,0.009323,0.004843,3.705620,0.054,3.406890,0.065
529021,rs3810648,22,51175626,0.750832,G,A,0.061006,0.002949,0.137834,0.710,-0.000627,0.004571,0.018814,0.890,0.000217,0.990
529022,rs3865764,22,51185848,0.750911,G,A,0.050675,0.001809,1.385920,0.240,0.011019,0.004979,4.896610,0.027,4.455700,0.035


In [6]:
# Asthma summary statistics; this file carries no .gz suffix and is read as plain text below.
asthma_fastgwa_path = ukb_gwas_path.joinpath(
    'asthma/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.stats.all_chr'
)

In [7]:
# Tab-separated text with a header row; no decompression requested.
asthma_fastgwa = pd.read_csv(
    asthma_fastgwa_path,
    sep='\t',
    header=0,
    quotechar='"',
)

In [8]:
# Display the asthma table; its position column is named POS rather than BP.
asthma_fastgwa

Unnamed: 0,CHR,SNP,POS,A1,A2,N,AF1,BETA,SE,P
0,1,rs3131962,756604,A,G,338837,0.129612,-0.001497,0.001272,0.239246
1,1,rs12562034,768448,A,G,338815,0.105013,-0.000502,0.001391,0.718023
2,1,rs4040617,779322,G,A,338535,0.127406,-0.001131,0.001281,0.377623
3,1,rs79373928,801536,G,T,339152,0.014847,-0.002883,0.003529,0.413952
4,1,rs11240779,808631,G,A,336983,0.224707,-0.001378,0.001027,0.179785
...,...,...,...,...,...,...,...,...,...,...
529019,22,rs4824147,51173542,T,C,339168,0.061450,-0.000804,0.001775,0.650797
529020,22,rs73174435,51174939,T,C,338963,0.053767,-0.000395,0.001891,0.834663
529021,22,rs3810648,51175626,G,A,338362,0.060905,0.000996,0.001785,0.576889
529022,22,rs3865764,51185848,G,A,338737,0.050654,-0.001565,0.001944,0.420748


### Read genotype data

In [81]:
#[global]
# Common prefix of the PLINK .bed/.bim/.fam genotype files.
ukb_geno_path1='/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv'
# The result is indexed below as ukb_geno[1] for the sample table (it has a `fid` column).
ukb_geno = read_plink(str(ukb_geno_path1))
# Space-separated table of the unrelated individuals; only its FID column is used here.
ukb_unrelatedcauc_path='/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620'
ukb_unrelatedcauc = pd.read_csv(ukb_unrelatedcauc_path, header=0, sep=' ', quotechar='"')

# Row labels of the genotyped samples that belong to the unrelated set:
# cast family IDs to int, flag those present in the unrelated table, keep the flagged labels.
# NOTE(review): these labels are later used as positions into the genotype
# matrix — this assumes the sample table has a default 0..n-1 index; confirm.
fid = ukb_geno[1].fid.astype(int)
fid_bool = fid.isin(ukb_unrelatedcauc.FID)
fid_ind = fid_bool[fid_bool].index

Mapping files: 100%|##########| 3/3 [00:43<00:00, 14.66s/it]


## Extract regions

Function that returns the index of the summary-statistics rows that fall inside a region

In [82]:
#[global]
def region_index(ss,chrom,start,end):
    """Return the row labels of ``ss`` lying on ``chrom`` strictly between ``start`` and ``end``.

    Parameters
    ----------
    ss : pandas.DataFrame
        Summary statistics with a ``CHR`` column and a position column named
        ``BP`` or, failing that, ``POS``.
    chrom : value compared for equality against ``ss.CHR``.
    start, end : exclusive lower and upper position bounds.

    Returns
    -------
    pandas.Index
        Labels of the matching rows (empty when nothing matches).

    Raises
    ------
    AttributeError
        If ``ss`` has neither a ``BP`` nor a ``POS`` column.
    """
    on_chrom = ss.CHR == chrom
    chr_ind = on_chrom[on_chrom].index
    try:
        positions = ss.BP
    except AttributeError:
        # Only a missing BP column triggers the fallback. Errors raised by the
        # comparison itself (e.g. a non-numeric bound) now propagate instead of
        # being reported as a missing column.
        print("There is no BP column,trying POS")
        positions = ss.POS
    chr_pos = positions[chr_ind]
    reg_bool = (chr_pos > start) & (chr_pos < end)
    # Mask with a plain boolean array so the selection is purely positional.
    return chr_ind[reg_bool.to_numpy()]

Function that extracts the regional summary stats

In [83]:
#[global]
def regional_stat(ss,ind):
    """Return the rows of ``ss`` at the positions given by ``ind``, with all columns.

    Selection is positional (``iloc``).
    NOTE(review): callers pass row labels from ``region_index``; labels and
    positions coincide only for a default 0..n-1 index — confirm for other inputs.
    """
    selected_rows = ss.iloc[ind, :]
    return selected_rows

Function that extracts the regional genotype
   - p: tuple of (bim, fam, bed)
   - pb: row positions to keep in bim (variants)
   - pf: row positions to keep in fam (samples)

In [84]:
#[global]
def plink_slice(p,pb=None,pf=None):
    """Subset a (bim, fam, bed) triple by bim rows and/or fam rows.

    Parameters
    ----------
    p : sequence of three items unpacked as (bim, fam, bed).
    pb : positions to keep in ``bim`` and along the first axis of ``bed``;
        ``None`` keeps everything.
    pf : positions to keep in ``fam`` and along the second axis of ``bed``;
        ``None`` keeps everything.

    Returns
    -------
    tuple
        The (bim, fam, bed) triple after slicing.
    """
    bim, fam, bed = p
    keep_bim_rows = pb is not None
    keep_fam_rows = pf is not None
    if keep_bim_rows:
        bim, bed = bim.iloc[pb], bed[pb, :]
    if keep_fam_rows:
        fam, bed = fam.iloc[pf], bed[:, pf]
    return bim, fam, bed

Regional LD computed on the unrelated individuals

In [86]:
#[global]
def LD_matrix(region,gwas,geno,iid):
    """Extract one region and compute the correlation (LD) matrix of its variants.

    Parameters
    ----------
    region : sequence indexed as [chromosome, start, end].
    gwas : pandas.DataFrame of summary statistics (see ``region_index``).
    geno : (bim, fam, bed) triple; ``bed`` must provide ``.compute()``.
    iid : positions of the samples to keep (columns of ``bed``).

    Returns
    -------
    tuple
        ``(rg_ind, rg_stat, bim, fam, rg_bed, rg_ld)`` where ``rg_bed`` holds
        only samples with no missing genotype in the region, ``fam`` is
        restricted to those same samples, and ``rg_ld`` is
        ``np.corrcoef(rg_bed)`` (one row/column per variant).

    NOTE(review): ``rg_ind`` (row labels of ``gwas``) is reused as positions
    into the variant axis of ``geno``; this assumes the summary-statistics rows
    are ordered exactly like the genotype variants — confirm.
    """
    rg_ind = region_index(gwas, region[0], region[1], region[2])
    rg_stat = regional_stat(gwas, rg_ind)
    rg_bim, rg_fam, rg_bed_lazy = plink_slice(geno, pb=list(rg_ind), pf=list(iid))
    rg_bed = rg_bed_lazy.compute()
    # Drop every sample (column) that has a missing genotype anywhere in the region.
    complete = ~np.any(np.isnan(rg_bed), axis=0)
    rg_bed = rg_bed[:, complete]
    # Apply the same mask to the sample table so its rows still match the
    # columns of rg_bed (previously the unfiltered table was returned).
    rg_fam = rg_fam.iloc[complete]
    rg_ld = np.corrcoef(rg_bed)
    return (rg_ind, rg_stat, rg_bim, rg_fam, rg_bed, rg_ld)

In [89]:
#[global]
def extract_region(region,gwas,geno,iid):
    """Return (BETA, SE, LD matrix) for one region.

    ``region`` is indexed as [chromosome, start, end]; ``gwas`` is the
    summary-statistics table; ``geno`` is a (bim, fam, bed) triple whose
    ``bed`` provides ``.compute()``; ``iid`` gives the sample positions to keep.
    Samples with any missing genotype in the region are removed before the
    correlation matrix is computed.
    """
    chrom, start, end = region[0], region[1], region[2]
    variant_idx = region_index(gwas, chrom, start, end)
    stats = regional_stat(gwas, variant_idx)
    sliced = plink_slice(geno, pb=list(variant_idx), pf=list(iid))
    genotypes = sliced[2].compute()
    # Columns are samples; discard those containing at least one NaN.
    has_missing = np.isnan(genotypes).any(axis=0)
    genotypes = genotypes[:, ~has_missing]
    ld = np.corrcoef(genotypes)
    return (stats.BETA, stats.SE, ld)

In [20]:
# LD_matrix is defined as LD_matrix(region, gwas, geno, iid): the region goes first.
tmp1 = LD_matrix([7,28223990-500000,28223990+500000],asthma_fastgwa,ukb_geno,fid_ind)

There is no BP column,trying POS


Functions for multiple phenotypes or regions

In [22]:
def extract_stats(gwass,geno,region,iid):
    """Extract one region for several summary-statistics tables at once.

    The region's variants are located with the first table in ``gwass``; the
    resulting row index is then applied to every table.

    Parameters
    ----------
    gwass : sequence of summary-statistics DataFrames.
    geno : (bim, fam, bed) triple.
    region : sequence indexed as [chromosome, start, end].
    iid : positions of the samples to keep.

    Returns
    -------
    tuple
        ``(ld, rg_stats)``: the ``LD_matrix`` result for the first table and
        the list of regional statistics, one per table in ``gwass``.

    NOTE(review): reusing one row index for all tables assumes they list the
    same variants in the same order — confirm.
    """
    # LD_matrix expects (region, gwas, geno, iid); the region must come first.
    ld = LD_matrix(region, gwass[0], geno, iid)
    rg_stats = [regional_stat(gwas, ld[0]) for gwas in gwass]
    return (ld, rg_stats)
  
def extract_regions(gwas,geno,regions,iid):
    """Run ``LD_matrix`` for each region on already-loaded data.

    Note: this redefines the path-based ``extract_regions`` declared earlier in
    the notebook, with a different signature.

    Parameters
    ----------
    gwas : summary-statistics DataFrame.
    geno : (bim, fam, bed) triple.
    regions : list of regions, each indexed as [chromosome, start, end].
    iid : positions of the samples to keep.

    Returns
    -------
    list
        One ``LD_matrix`` result per region, in the order given.
    """
    # LD_matrix expects (region, gwas, geno, iid); the region must come first.
    return [LD_matrix(region, gwas, geno, iid) for region in regions]

In [23]:
# Uses the (gwas, geno, regions, iid) form of extract_regions defined in the cell above.
tmp2 = extract_regions(bmi_bolt,ukb_geno,[[16,53800754,53845487],[7,28223990-500000,28223990+500000]],fid_ind)

## Other

In [2]:
# The "snp_stats" companion of the INT-BMI results file (gzip-compressed).
bmi_sr_path = "/SAY/dbgapstg/scratch/UKBiobank/results/BOLTLMM_results/results_imputed_data/INT-BMI/ukb_imp_v3.UKB_caucasians_BMIwaisthip_AsthmaAndT2D_INT-BMI_withagesex_041720.BoltLMM.snp_stats.all_chr.gz"

In [3]:
# Gzip-compressed, tab-separated table with a header row.
df = pd.read_csv(
    bmi_sr_path,
    sep='\t',
    header=0,
    quotechar='"',
    compression='gzip',
)

In [4]:
# Preview the first rows of the snp_stats table.
df.head()

Unnamed: 0,SNP,CHR,BP,GENPOS,ALLELE1,ALLELE0,A1FREQ,INFO,CHISQ_LINREG,P_LINREG,BETA,SE,CHISQ_BOLT_LMM_INF,P_BOLT_LMM_INF,CHISQ_BOLT_LMM,P_BOLT_LMM
0,1:692794_CA_C,1,692794,0.003338,CA,C,0.88936,0.824483,1.86946,0.17,0.003181,0.003795,0.702462,0.4,0.525906,0.47
1,rs12238997,1,693731,0.003363,A,G,0.884251,0.875969,0.314486,0.57,0.001202,0.003587,0.112243,0.74,0.07003,0.79
2,rs371890604,1,707522,0.003733,G,C,0.902904,0.803693,0.067607,0.79,-0.00016,0.004037,0.00158,0.97,0.018918,0.89
3,rs149887893,1,714596,0.003923,T,C,0.968095,0.844433,0.050703,0.82,-0.002333,0.006792,0.117972,0.73,0.145692,0.7
4,rs12184267,1,715265,0.003941,C,T,0.964573,0.926915,0.019258,0.89,-0.001602,0.006167,0.067493,0.8,0.105143,0.75


## Check files

#### GWAS results

In [1]:
ls /SAY/dbgapstg/scratch/UKBiobank/results/BOLTLMM_results/results_imputed_data

[0m[01;34masthma[0m/  [01;34mINT-BMI[0m/  [01;34mINT-WAIST[0m/  [01;34mINT-WHR[0m/


In [180]:
ls /SAY/dbgapstg/scratch/UKBiobank/results/BOLTLMM_results/results_imputed_data/asthma/

UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.stats.all_chr
UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.stats.all_chr.numgenosnps.txt
ukb_imp_v3.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.log.all_chr
ukb_imp_v3.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.snp_stats.all_chr.gz
ukb_imp_v3.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.snp_stats.all_chr.numimpsnps.txt


#### Genotype data and imputed data

In [11]:
ls /SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.*

/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.bed
/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.bim
/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.fam
/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.log


In [129]:
ls /SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620

/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620


In [3]:
ls /SAY/dbgapstg/scratch/UKBiobank/genotype_files/ukb39554_imputeddataset/

[0m[01;34mdownloadscripts[0m/                            [01;32mukb_imp_chr18_v3.bgen[0m*
[01;32mencoding.ukb[0m*                               [01;32mukb_imp_chr18_v3.bgen.bgi[0m*
[01;32mfields.ukb[0m*                                 [01;32mukb_imp_chr19_v3.bgen[0m*
README                                      [01;32mukb_imp_chr19_v3.bgen.bgi[0m*
samples.txt                                 [01;32mukb_imp_chr1_v3.bgen[0m*
subset_sample_variant_ukb_imp_chr1_v3.bgen  [01;32mukb_imp_chr1_v3.bgen.bgi[0m*
subset_ukb_imp_chr1_v3.bgen                 [01;32mukb_imp_chr20_v3.bgen[0m*
[01;32mukb1_1581369705_4814.tmp_bulk[0m*              [01;32mukb_imp_chr20_v3.bgen.bgi[0m*
[01;32mukb1_1581570817_24750.tmp_bulk[0m*             [01;32mukb_imp_chr21_v3.bgen[0m*
[01;32mukb1_1581572048_25854.tmp_bulk[0m*             [01;32mukb_imp_chr21_v3.bgen.bgi[0m*
[01;32mukb32285_imp_chr22_v3_s487297.sample[0m*       [01;32mukb_imp_chr22_v3.bgen[0m*
[01;32mukb322

#### Phenotype data

In [5]:
ls /SAY/dbgapstg/scratch/UKBiobank/phenotype_files/pleiotropy_R01/phenotypesforanalysis

Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_forbolt030720
Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols_forbolt030720
diabetes_casesbyICD10andselfreport_controls_030620
diabetes_casesbyICD10andselfreport_controlswithoutautoiummune_030720
UKB_caucasians_BMI_nopreg_adjagesex_residuals_andstandardized_022720
UKB_caucasians_BMIwaisthip_AsthmaAndT2D_withagesex_033120
UKB_caucasians_waistcircumference_adjbmiagesex_nopreg_residuals_022720
UKB_caucasians_waisthipratio_adjbmiagesex_nopreg_residuals_022720
UKBrelated_AsthmaT2D_casesbyfieldoricd10_noautoincontrols_dataforbothphenotypes_040820


## Install pandas_plink

pandas-plink is a Python package for reading the PLINK binary file format and PLINK and GCTA realized relationship matrices. Files are read lazily, which saves memory because only the genotypes actually accessed by the user are loaded. (https://pypi.org/project/pandas-plink/)

In [None]:
pip install pandas-plink