# Extracting data for genomic regions of interest

## Aim

To extract the summary statistics and genotypes for specific genomic regions and to calculate the LD matrix of each region.

In [4]:
%cd ~/tmp/08-Jun-2020/MWE

/home/gw/tmp/08-Jun-2020/MWE

## Input and Output

Region extract input:

- list of regions, e.g. SNP rs849335 is at chr 7:28223990, and we extract a 1 Mb window (±500 kb) around it.
    - The locus will be represented by one line in the region list with 3 columns `7 27723990 28723990`
- all genotypes, in `bgen` format and in `plink format`.
- all summary statistics ($\hat{\beta}$, $SE(\hat{\beta})$ and p-values)
    - These summary statistics should contain at least these columns: **FIXME please read https://github.com/dianacornejo/pleiotropy_UKB/issues/1#issuecomment-640680333  and explain it here**

- list of unrelated individuals (from which LD matrices will be computed)

**FIXME: explain the format of the summary stats file and the format of the unrelated individual list**

output:

- regional genotype
- regional LD computed from unrelated individuals
- regional summary stats

## Workflow usage

If genotypes are in PLINK format,

```
sos run Region_Extraction.ipynb plink
```

If genotypes are in bgen format,

```
sos run Region_Extraction.ipynb bgen
```

## Workflow codes

In [2]:
[global]
# Work directory where output will be saved to
parameter: cwd = path
# Region specifications: a text file with one region per line, whitespace-separated (chr start end)
parameter: region_file = path
# Genotype file in plink binary format or bgen format
parameter: geno_path = path
# Phenotype path
parameter: pheno_path = path
# Path to summary stats file
parameter: sumstat_path= path
# Path to samples of unrelated individuals
parameter: unrelated_samples = path
# Column ID for summary stats, 1-based index
# NOTE(review): these five column parameters are required but are not referenced by any
# step visible in this notebook -- confirm whether they are still needed.
parameter: chrom_col = int
parameter: pos_col = int
parameter: bhat_col = int
parameter: sbhat_col = int
parameter: p_col = int
# Stop early with a clear message when the region list is missing
fail_if(not region_file.is_file(), msg = 'Cannot find regions to extract. Please specify them using ``--region-file`` option.')
# Load all regions of interest. Each item in the list will be a region: [chr, start, end].
# Blank lines are skipped. The fields are kept as the strings read from the file;
# they are not converted to integers here.
regions = [x.strip().split() for x in open(region_file).readlines() if x.strip()]

## Some utility functions

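- `plink_slice`: extracts a subset of variants and/or samples from PLINK genotype data
   - p: tuple of (bim, fam, bed)
   - pb: positional indices of the variants (rows of bim) to keep
   - pf: positional indices of the samples (rows of fam) to keep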

In [3]:
[plink_0, bgen_0]
# Write the shared helper functions to utils.py in the work directory.
output: f'{cwd:a}/utils.py'
# `report` prints to standard output unless an `output` is given, so it is pointed
# explicitly at the file declared above; otherwise utils.py would never be created.
report: expand = '${ }', output = f'{_output}'
    import numpy as np
    import pandas as pd
    def read_sumstat(file):
        """Read a tab-separated summary-statistics file into a common 8-column layout.

        Two input layouts are tried, in order:

        1. gzip-compressed file with columns
           CHR, BP, SNP, ALLELE1, ALLELE0, BETA, SE, P_BOLT_LMM
        2. file with columns CHR, POS, SNP, A1, A2, BETA, SE, P
           (compression inferred by pandas from the file name)

        Parameters
        ----------
        file : str or path-like
            Path to the summary-statistics file.

        Returns
        -------
        pandas.DataFrame
            Columns renamed, by position, to CHR, BP, SNP, ALT, REF, BETA, SE, P.

        Raises
        ------
        Whatever the second read attempt raises (e.g. KeyError when neither
        column layout is present).
        """
        try:
            sumstats = pd.read_csv(file, compression='gzip', header=0, sep='\t', quotechar='"')
            sumstats = sumstats.loc[:,['CHR','BP','SNP','ALLELE1','ALLELE0','BETA','SE','P_BOLT_LMM']]
        except Exception:
            # Only ordinary errors (not gzip, or first-layout columns missing) trigger
            # the fallback. KeyboardInterrupt / SystemExit are no longer swallowed, so
            # interrupting a long read does not silently start a second one.
            sumstats = pd.read_csv(file, header=0, sep='\t', quotechar='"')
            sumstats = sumstats.loc[:,['CHR','POS','SNP','A1','A2','BETA','SE','P']]
        sumstats.columns = ['CHR','BP','SNP','ALT','REF','BETA','SE','P']
        return sumstats


    def region_index(bim,chrom,start,end):
        """Return the row labels of the variants in `bim` that fall inside a region.

        Parameters
        ----------
        bim : pandas.DataFrame
            Variant table with a `chrom` column (castable to int) and a `pos` column.
        chrom, start, end : int or str
            Chromosome and region bounds. Values read from a text file arrive as
            strings, so all three are coerced to int before comparing.

        Returns
        -------
        pandas.Index
            Labels of the rows of `bim` on `chrom` with start < pos < end
            (both bounds exclusive).

        Raises
        ------
        ValueError
            If `chrom`, `start` or `end` cannot be converted to int, or if
            `bim.chrom` holds non-numeric chromosome codes.
        """
        # Without this coercion a string chromosome never equals the integer-cast
        # bim column, and string bounds cannot be compared with integer positions.
        chrom, start, end = int(chrom), int(start), int(end)
        chr_bool = bim.chrom.astype(int) == chrom
        chr_ind = chr_bool[chr_bool].index
        chr_pos = bim.pos[chr_ind]
        reg_bool = (chr_pos > start) & (chr_pos < end)
        # Use a plain boolean array as the mask for the Index
        return chr_ind[reg_bool.to_numpy()]

    def p2z(pval,beta):
        """Convert two-sided p-values to signed z-scores.

        The magnitude is the standard-normal quantile with upper-tail area p/2
        (`norm.isf(p / 2)`); the sign follows the effect estimate. The previous
        implementation used `norm.pdf`, which is the density, not a quantile,
        and therefore did not return z-scores.

        Parameters
        ----------
        pval : iterable of float
            Two-sided p-values in [0, 1].
        beta : iterable of float
            Effect estimates, paired element-wise with `pval`.

        Returns
        -------
        list of float
            One z-score per (p, beta) pair: positive when beta > 0, negative otherwise.
        """
        z = []
        for p, b in zip(pval, beta):
            magnitude = norm.isf(p / 2)
            z.append(magnitude if b > 0 else -magnitude)
        return z

    from scipy.stats import norm
    def regional_stat(ss,ind):
        """Select the summary statistics of one region and add a z-score column.

        Parameters
        ----------
        ss : pandas.DataFrame
            Summary statistics with at least `P` and `BETA` columns.
        ind : sequence of int
            Positional row indices (used with `.iloc`) of the rows to keep.
            NOTE(review): `extract_region` passes indices derived from the genotype
            bim table, which assumes `ss` rows are in the same order as the bim
            rows -- confirm.

        Returns
        -------
        pandas.DataFrame
            The selected rows with an additional `z` column computed by `p2z`.
            `ss` itself is not modified.
        """
        # Work on an explicit copy: assigning into a slice is unreliable, and the
        # previous code returned a fresh slice, so the `z` column was lost.
        rg_ss = ss.iloc[ind,:].copy()
        rg_ss['z'] = p2z(rg_ss.P, rg_ss.BETA)
        return rg_ss

    def plink_slice(p,pb=None,pf=None):
        """Subset PLINK data by variant and/or sample position.

        Parameters
        ----------
        p : tuple
            (bim, fam, bed): variant table, sample table and genotype matrix.
            `bed` is indexed as [variant, sample].
        pb : sequence of int, optional
            Positional indices of the variants to keep; None keeps all.
        pf : sequence of int, optional
            Positional indices of the samples to keep; None keeps all.

        Returns
        -------
        tuple
            (bim, fam, bed) restricted to the requested variants and samples.
        """
        variants, samples, genotypes = p
        if pb is not None:
            # variant subset: rows of bim and rows of bed
            variants, genotypes = variants.iloc[pb], genotypes[pb, :]
        if pf is not None:
            # sample subset: rows of fam and columns of bed
            samples, genotypes = samples.iloc[pf], genotypes[:, pf]
        return (variants, samples, genotypes)

    def LD_matrix(bed):
        """Compute the variant-by-variant correlation matrix of a genotype matrix.

        Parameters
        ----------
        bed : array-like, indexed as [variant, sample]
            Genotype matrix.

        Returns
        -------
        pandas.DataFrame
            Square matrix of pairwise correlations between variants, as computed
            by `DataFrame.corr()` with its default settings.
        """
        # One column per variant, so that corr() correlates variants with each other
        per_variant = pd.DataFrame(bed.T)
        return per_variant.corr()

    def extract_region(region,gwas,geno,iid):
        """Collect summary statistics, genotypes and LD for one genomic region.

        Parameters
        ----------
        region : sequence
            (chrom, start, end); the first three items are passed to `region_index`.
        gwas : pandas.DataFrame
            Summary statistics, passed to `regional_stat`.
        geno : tuple
            (bim, fam, bed) genotype data; `bed` must provide `.compute()`.
        iid : sequence of int
            Column (sample) indices of the genotype matrix used for the LD matrix.

        Returns
        -------
        dict
            `stats`: regional summary statistics;
            `geno`: computed genotype matrix of the region, all samples;
            `ld`: LD matrix computed from the samples in `iid` only.
        """
        chrom, start, end = region[0], region[1], region[2]
        variant_idx = region_index(geno[0], chrom, start, end)
        region_stats = regional_stat(gwas, variant_idx)
        # Only the genotype matrix of the sliced data is needed from here on
        region_bed = plink_slice(geno, pb=list(variant_idx))[2]
        genotypes = region_bed.compute()
        region_ld = LD_matrix(genotypes[:, iid])
        return {'stats': region_stats, 'geno': genotypes, 'ld': region_ld}

## Extract from PLINK format

In [97]:
[plink_1 (extract genotypes)]
# For every region: write regional summary statistics, regional genotypes and the
# regional LD matrix (LD is computed on the unrelated samples only).
depends: Py_Module('pandas_plink')
input: output_from('plink_0'), geno_path, pheno_path, sumstat_path, unrelated_samples, for_each = 'regions'
output: sumstats = f'{cwd:a}/{sumstat_path:bn}_{_regions[0]}_{_regions[1]}_{_regions[2]}.sumstats.gz',
        genotype = f'{cwd:a}/{geno_path:bn}_{pheno_path:bn}_{_regions[0]}_{_regions[1]}_{_regions[2]}.genotype.gz',
        ld = f'{cwd:a}/{geno_path:bn}_{pheno_path:bn}_{_regions[0]}_{_regions[1]}_{_regions[2]}.ld.gz'
python: expand = '${ }'
    # _input order: [0] utils.py, [1] genotype, [2] phenotype,
    #               [3] summary statistics, [4] unrelated samples
    import sys
    # The helper module is generated into the work directory; make it importable.
    sys.path.insert(0, ${_input[0]:adr})
    import numpy as np
    import pandas as pd
    from pandas_plink import read_plink
    from utils import read_sumstat, extract_region
    geno = read_plink(${_input[1]:nr})
    sumstats = read_sumstat(${_input[3]:r})
    unr = pd.read_csv(${_input[4]:r}, header=0, sep=' ', quotechar='"')
    fid = geno[1].fid.astype(int)
    fid_bool = fid.isin(unr.FID)
    fid_ind = fid_bool[fid_bool].index
    # FIXME: fid should be the ones used for a specific phenotype, not that of all the genotype samples
    # you need to load phenotype files too, matching samples in phenotypes
    # then get genotypes only for these phenotypes
    # and LD matrix based on unrelated individuals in the given phenotypes
    # Region fields are text in the region file; pass them on as integers.
    rg_info = extract_region((int('${_regions[0]}'), int('${_regions[1]}'), int('${_regions[2]}')), sumstats, geno, fid_ind)
    rg_info['stats'].to_csv(${_output[0]:r}, sep = "\t", header = True, index = False)
    np.savetxt(${_output[1]:r}, rg_info['geno'])
    np.savetxt(${_output[2]:r}, rg_info['ld'])

## Extract from bgen format

**FIXME: to be completed**

In [None]:
[bgen_1 (extract genotypes)]
# FIXME: extraction from bgen input is not implemented yet; this step only declares
# its inputs and outputs. The summary-statistics parameter is `sumstat_path`, as
# declared in [global].
input: output_from('bgen_0'), geno_path, pheno_path, sumstat_path, unrelated_samples, for_each = 'regions'
output: sumstats = f'{cwd:a}/{sumstat_path:bn}_{_regions[0]}_{_regions[1]}_{_regions[2]}.sumstats.gz',
        genotype = f'{cwd:a}/{geno_path:bn}_{pheno_path:bn}_{_regions[0]}_{_regions[1]}_{_regions[2]}.genotype.gz',
        ld = f'{cwd:a}/{geno_path:bn}_{pheno_path:bn}_{_regions[0]}_{_regions[1]}_{_regions[2]}.ld.gz'

## FIXME: the code below is what you used for prototyping. It can be removed after the workflow is developed.

In [105]:
# Prototype call: extract two regions (chr16:53800754-53845487 and a +/-500 kb window
# around chr7:28223990).
# NOTE(review): `extract_regions` (plural) and `ukb_geno_path` are not defined in the
# visible part of this notebook (only `extract_region` is) -- confirm before re-running.
rg_infos= extract_regions([[16,53800754,53845487],[7,28223990-500000,28223990+500000]],
                            sumstat_path,ukb_geno_path,unrelated_samples)

Mapping files: 100%|##########| 3/3 [00:42<00:00, 14.28s/it]



If you want to make comments on input and output files, you can always load and show part of them. See my `%preview` example above.

## Summary Stat

### Read GWAS stat

In [21]:
# Load the gzip-compressed, tab-separated INT-BMI summary statistics.
# NOTE(review): `ukb_gwas_path` is not defined in the visible cells; it must be a
# path-like object supporting the `/` operator -- confirm where it is set.
bmi_bolt_path = ukb_gwas_path/'INT-BMI/ukb_imp_v3.UKB_caucasians_BMIwaisthip_AsthmaAndT2D_INT-BMI_withagesex_041720.BoltLMM.stats.all_chr.gz'
bmi_bolt = pd.read_csv(bmi_bolt_path, compression='gzip', header=0, sep='\t', quotechar='"')

In [37]:
# Preview rows 3-6 (positional) of the BMI summary statistics
bmi_bolt.iloc[3:7,]

Unnamed: 0,SNP,CHR,BP,GENPOS,ALLELE1,ALLELE0,A1FREQ,F_MISS,CHISQ_LINREG,P_LINREG,BETA,SE,CHISQ_BOLT_LMM_INF,P_BOLT_LMM_INF,CHISQ_BOLT_LMM,P_BOLT_LMM
3,rs79373928,1,801536,0.005872,G,T,0.014871,0.000596,0.009184,0.92,-0.005915,0.008938,0.43801,0.51,0.358727,0.55
4,rs11240779,1,808631,0.006208,G,A,0.224724,0.006986,1.30599,0.25,0.003471,0.002604,1.77719,0.18,1.59823,0.21
5,rs57181708,1,809876,0.006208,G,A,0.100445,0.000874,6.64697,0.0099,0.008757,0.003594,5.93807,0.015,5.17968,0.023
6,rs4970383,1,838555,0.006208,A,C,0.245613,0.001787,0.43358,0.51,0.000175,0.002512,0.00487,0.94,0.003883,0.95


In [22]:
# Display the BMI summary-statistics frame (the notebook shows only its first and last rows)
bmi_bolt

Unnamed: 0,SNP,CHR,BP,GENPOS,ALLELE1,ALLELE0,A1FREQ,F_MISS,CHISQ_LINREG,P_LINREG,BETA,SE,CHISQ_BOLT_LMM_INF,P_BOLT_LMM_INF,CHISQ_BOLT_LMM,P_BOLT_LMM
0,rs3131962,1,756604,0.004907,A,G,0.129619,0.001543,0.143797,0.700,-0.000596,0.003224,0.034136,0.850,0.008625,0.930
1,rs12562034,1,768448,0.004957,A,G,0.105098,0.001601,3.206650,0.073,0.006521,0.003525,3.423000,0.064,2.742840,0.098
2,rs4040617,1,779322,0.005007,G,A,0.127412,0.002416,0.681178,0.410,-0.001951,0.003249,0.360661,0.550,0.294739,0.590
3,rs79373928,1,801536,0.005872,G,T,0.014871,0.000596,0.009184,0.920,-0.005915,0.008938,0.438010,0.510,0.358727,0.550
4,rs11240779,1,808631,0.006208,G,A,0.224724,0.006986,1.305990,0.250,0.003471,0.002604,1.777190,0.180,1.598230,0.210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529019,rs4824147,22,51173542,0.750810,T,C,0.061359,0.000558,0.011819,0.910,-0.001627,0.004553,0.127749,0.720,0.211562,0.650
529020,rs73174435,22,51174939,0.750825,T,C,0.053825,0.001157,3.009070,0.083,0.009323,0.004843,3.705620,0.054,3.406890,0.065
529021,rs3810648,22,51175626,0.750832,G,A,0.061006,0.002949,0.137834,0.710,-0.000627,0.004571,0.018814,0.890,0.000217,0.990
529022,rs3865764,22,51185848,0.750911,G,A,0.050675,0.001809,1.385920,0.240,0.011019,0.004979,4.896610,0.027,4.455700,0.035


In [6]:
# Location of the asthma summary statistics (uncompressed file).
# NOTE(review): relies on `ukb_gwas_path`, which is not defined in the visible cells.
asthma_fastgwa_path = ukb_gwas_path/'asthma/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.stats.all_chr'

In [7]:
# Read the tab-separated asthma summary statistics; the first line is the header
asthma_fastgwa = pd.read_csv(asthma_fastgwa_path, header=0, sep='\t', quotechar='"')

In [8]:
# Display the asthma summary-statistics frame (the notebook shows only its first and last rows)
asthma_fastgwa

Unnamed: 0,CHR,SNP,POS,A1,A2,N,AF1,BETA,SE,P
0,1,rs3131962,756604,A,G,338837,0.129612,-0.001497,0.001272,0.239246
1,1,rs12562034,768448,A,G,338815,0.105013,-0.000502,0.001391,0.718023
2,1,rs4040617,779322,G,A,338535,0.127406,-0.001131,0.001281,0.377623
3,1,rs79373928,801536,G,T,339152,0.014847,-0.002883,0.003529,0.413952
4,1,rs11240779,808631,G,A,336983,0.224707,-0.001378,0.001027,0.179785
...,...,...,...,...,...,...,...,...,...,...
529019,22,rs4824147,51173542,T,C,339168,0.061450,-0.000804,0.001775,0.650797
529020,22,rs73174435,51174939,T,C,338963,0.053767,-0.000395,0.001891,0.834663
529021,22,rs3810648,51175626,G,A,338362,0.060905,0.000996,0.001785,0.576889
529022,22,rs3865764,51185848,G,A,338737,0.050654,-0.001565,0.001944,0.420748


### Read genotype data

In [81]:
#[global]
# Prototype: load the PLINK genotype data (by file prefix) and the space-separated
# table of unrelated individuals.
# NOTE(review): `read_plink` is not imported in this cell; presumably
# `from pandas_plink import read_plink` was run earlier -- confirm.
ukb_geno_path1='/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv'
ukb_geno = read_plink(str(ukb_geno_path1))
ukb_unrelatedcauc_path='/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620'
ukb_unrelatedcauc = pd.read_csv(ukb_unrelatedcauc_path, header=0, sep=' ', quotechar='"')

# Index of the unrelated individuals: row labels of the sample table (ukb_geno[1])
# whose family ID, cast to int, appears in the FID column of the unrelated table
fid = ukb_geno[1].fid.astype(int)
fid_bool = fid.isin(ukb_unrelatedcauc.FID)
fid_ind = fid_bool[fid_bool].index

Mapping files: 100%|##########| 3/3 [00:43<00:00, 14.66s/it]


Regional LD computed from unrelated individuals only

In [20]:
# Prototype call for a +/-500 kb window around chr7:28223990.
# NOTE(review): this passes four arguments, whereas the `LD_matrix` helper defined
# earlier in this notebook takes a single `bed` argument -- presumably an older
# prototype signature; confirm before re-running.
tmp1 = LD_matrix(asthma_fastgwa,ukb_geno,[7,28223990-500000,28223990+500000],fid_ind)

There is no BP column,trying POS


In [23]:
# Prototype call on two regions using the BMI summary statistics.
# NOTE(review): `extract_regions` (plural) is not defined in the visible part of this
# notebook, and the argument order differs from the earlier prototype call -- confirm.
tmp2 = extract_regions(bmi_bolt,ukb_geno,[[16,53800754,53845487],[7,28223990-500000,28223990+500000]],fid_ind)

## Other

In [2]:
# Location of the gzip-compressed INT-BMI `snp_stats` results on imputed data
bmi_sr_path = "/SAY/dbgapstg/scratch/UKBiobank/results/BOLTLMM_results/results_imputed_data/INT-BMI/ukb_imp_v3.UKB_caucasians_BMIwaisthip_AsthmaAndT2D_INT-BMI_withagesex_041720.BoltLMM.snp_stats.all_chr.gz"

In [3]:
# Read the gzip-compressed, tab-separated file at `bmi_sr_path`; the first line is the header
df = pd.read_csv(bmi_sr_path, compression='gzip', header=0, sep='\t', quotechar='"')

In [4]:
# Show the first five rows to inspect the available columns
df.head()

Unnamed: 0,SNP,CHR,BP,GENPOS,ALLELE1,ALLELE0,A1FREQ,INFO,CHISQ_LINREG,P_LINREG,BETA,SE,CHISQ_BOLT_LMM_INF,P_BOLT_LMM_INF,CHISQ_BOLT_LMM,P_BOLT_LMM
0,1:692794_CA_C,1,692794,0.003338,CA,C,0.88936,0.824483,1.86946,0.17,0.003181,0.003795,0.702462,0.4,0.525906,0.47
1,rs12238997,1,693731,0.003363,A,G,0.884251,0.875969,0.314486,0.57,0.001202,0.003587,0.112243,0.74,0.07003,0.79
2,rs371890604,1,707522,0.003733,G,C,0.902904,0.803693,0.067607,0.79,-0.00016,0.004037,0.00158,0.97,0.018918,0.89
3,rs149887893,1,714596,0.003923,T,C,0.968095,0.844433,0.050703,0.82,-0.002333,0.006792,0.117972,0.73,0.145692,0.7
4,rs12184267,1,715265,0.003941,C,T,0.964573,0.926915,0.019258,0.89,-0.001602,0.006167,0.067493,0.8,0.105143,0.75


## Check files

#### GWAS results

In [1]:
ls /SAY/dbgapstg/scratch/UKBiobank/results/BOLTLMM_results/results_imputed_data

[0m[01;34masthma[0m/  [01;34mINT-BMI[0m/  [01;34mINT-WAIST[0m/  [01;34mINT-WHR[0m/


In [180]:
ls /SAY/dbgapstg/scratch/UKBiobank/results/BOLTLMM_results/results_imputed_data/asthma/

UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.stats.all_chr
UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.stats.all_chr.numgenosnps.txt
ukb_imp_v3.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.log.all_chr
ukb_imp_v3.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.snp_stats.all_chr.gz
ukb_imp_v3.Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols.fastGWA.snp_stats.all_chr.numimpsnps.txt


#### Genotype data and imputed data

In [11]:
ls /SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.*

/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.bed
/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.bim
/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.fam
/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.log


In [129]:
ls /SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620

/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620


In [3]:
ls /SAY/dbgapstg/scratch/UKBiobank/genotype_files/ukb39554_imputeddataset/

[0m[01;34mdownloadscripts[0m/                            [01;32mukb_imp_chr18_v3.bgen[0m*
[01;32mencoding.ukb[0m*                               [01;32mukb_imp_chr18_v3.bgen.bgi[0m*
[01;32mfields.ukb[0m*                                 [01;32mukb_imp_chr19_v3.bgen[0m*
README                                      [01;32mukb_imp_chr19_v3.bgen.bgi[0m*
samples.txt                                 [01;32mukb_imp_chr1_v3.bgen[0m*
subset_sample_variant_ukb_imp_chr1_v3.bgen  [01;32mukb_imp_chr1_v3.bgen.bgi[0m*
subset_ukb_imp_chr1_v3.bgen                 [01;32mukb_imp_chr20_v3.bgen[0m*
[01;32mukb1_1581369705_4814.tmp_bulk[0m*              [01;32mukb_imp_chr20_v3.bgen.bgi[0m*
[01;32mukb1_1581570817_24750.tmp_bulk[0m*             [01;32mukb_imp_chr21_v3.bgen[0m*
[01;32mukb1_1581572048_25854.tmp_bulk[0m*             [01;32mukb_imp_chr21_v3.bgen.bgi[0m*
[01;32mukb32285_imp_chr22_v3_s487297.sample[0m*       [01;32mukb_imp_chr22_v3.bgen[0m*
[01;32mukb32285_imputedi

#### Phenotype data

In [5]:
ls /SAY/dbgapstg/scratch/UKBiobank/phenotype_files/pleiotropy_R01/phenotypesforanalysis

Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_forbolt030720
Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols_forbolt030720
diabetes_casesbyICD10andselfreport_controls_030620
diabetes_casesbyICD10andselfreport_controlswithoutautoiummune_030720
UKB_caucasians_BMI_nopreg_adjagesex_residuals_andstandardized_022720
UKB_caucasians_BMIwaisthip_AsthmaAndT2D_withagesex_033120
UKB_caucasians_waistcircumference_adjbmiagesex_nopreg_residuals_022720
UKB_caucasians_waisthipratio_adjbmiagesex_nopreg_residuals_022720
UKBrelated_AsthmaT2D_casesbyfieldoricd10_noautoincontrols_dataforbothphenotypes_040820


## Install pandas_plink

`pandas-plink` is a Python package for reading the PLINK binary file format and PLINK and GCTA realized relationship matrices. Files are read lazily, which saves memory because only the genotypes actually accessed by the user are loaded (https://pypi.org/project/pandas-plink/).

In [None]:
pip install pandas-plink