# Extracting data for genomic regions of interest

## Aim

To extract the summary statistics and genotype on specific genomic regions and calculate their LD matrix.

## Input and Output

### Input

- list of regions, e.g. SNP rs849335 is at chr 7:28223990, and we extract a 1Mb window (500kb on each side) around it.
    - The locus will be represented by one line in the region list with 3 columns `7 27723990 28723990`
- list of all genotypes, in `bgen` format and in `plink` format.
    - The list is a file with 2 columns: `chr genotype_file_chr.ext`.
    - The first column is the chromosome ID; the 2nd column is the genotype file for that chromosome.
    - When chromosome ID is 0, it implies that the genotype file contains all the genotypes.
- all summary statistics (eg, $\hat{\beta}$, $SE(\hat{\beta})$ and p-values)
    - These summary statistics should contain at least these columns: 
    - `chrom, pos, a0, a1, snp_id, bhat, sbhat, z, p`
    - If the SNP id is not available we make it up as `chrom_pos_a0_a1`

- list of unrelated individuals (from which LD matrices will be computed)

**FIXME: explain the format of the summary stats file and the format of the unrelated individual list**

### Output

- regional genotype
- regional LD accounting for related individuals
- regional summary stats

## Workflow usage

Using our minimal working example data-set,

```
sos run Region_Extraction.ipynb \
    --cwd output \
    --region-file data/regions.txt \
    --pheno-path data/phenotypes.txt \
    --geno-path data/genotype_inventory.txt \
    --sumstats-path /SAY/dbgapstg/scratch/UKBiobank/results/BOLTLMM_results/results_imputed_data/INT-BMI/ukb_imp_v3.UKB_caucasians_BMIwaisthip_AsthmaAndT2D_INT-BMI_withagesex_041720.BoltLMM.stats.all_chr.gz \
    --unrelated-samples data/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620 
```

## Workflow codes

In [2]:
[global]
# Work directory where output will be saved to
parameter: cwd = path
# Region specifications
parameter: region_file = path
# Genotype file inventory
parameter: geno_path = path
# Phenotype path
parameter: pheno_path = path
# Path to summary stats file
parameter: sumstats_path = path
# Path to samples of unrelated individuals
parameter: unrelated_samples = path

fail_if(not region_file.is_file(), msg = 'Cannot find regions to extract. Please specify them using ``--region-file`` option.')
# Load all regions of interest. Each item in the list will be a region: [chr, start, end]
# (whitespace-separated fields kept as strings; blank lines are skipped).
# The file is read inside a `with` block so the handle is closed right away.
with open(region_file) as region_handle:
    regions = [line.split() for line in region_handle if line.strip()]

## Some utility functions

- `plink_slice`: extracts a subset of variants and/or samples from PLINK genotype data
   - p: tuple of (bim, fam, bed)
   - pb: indices of the variants (rows of bim) to keep
   - pf: indices of the samples (rows of fam) to keep

In [2]:
[utils]
output: f'{cwd:a}/utils.py'
report: expand = '${ }', output=_output
    import numpy as np
    import pandas as pd

    def read_sumstat(file):
        """Load a summary-statistics table and standardize its column names.

        Two layouts are tried in order:

        1. a gzip-compressed, tab-separated file with the BOLT-LMM columns
           ``CHR, BP, SNP, ALLELE1, ALLELE0, BETA, SE, P_BOLT_LMM``;
        2. an uncompressed, tab-separated file with the columns
           ``CHR, POS, SNP, A1, A2, BETA, SE, P`` (presumably fastGWA output,
           going by the error message below).

        Any other columns in the file are dropped.

        Parameters:
            file: path of the summary-statistics file.

        Returns:
            A DataFrame whose columns are exactly
            ``CHR, BP, SNP, ALT, REF, BETA, SE, P`` (in that order).

        Raises:
            ValueError: if the file can be read in neither layout; the
                underlying error is attached as ``__cause__``.
        """
        bolt_columns = ['CHR','BP','SNP','ALLELE1','ALLELE0','BETA','SE','P_BOLT_LMM']
        plain_columns = ['CHR','POS','SNP','A1','A2','BETA','SE','P']
        try:
            sumstats = pd.read_csv(file, compression='gzip', header=0, sep='\t', quotechar='"')
            sumstats = sumstats.loc[:, bolt_columns]
        # `Exception` rather than a bare `except`, so that KeyboardInterrupt and
        # SystemExit are not swallowed. The set of possible failures is wide
        # (not gzip, parse error, missing columns), hence no narrower class.
        except Exception:
            print("This is not a BOLT result.")
            try:
                sumstats = pd.read_csv(file, header=0, sep='\t', quotechar='"')
                sumstats = sumstats.loc[:, plain_columns]
            except Exception as err:
                # Keep the root cause visible in the traceback.
                raise ValueError('Please provide the files from BOLT or fastGWA') from err
        sumstats.columns = ['CHR','BP','SNP','ALT','REF','BETA','SE','P']
        return sumstats

    def region_index(bim,region, chrom_col=0, pos_col=3,allchrom=True):
        """Return the row labels of `bim` that belong to `region`.

        Parameters:
            bim: DataFrame with one row per variant.
            region: sequence ``(chrom, start, end)``; the chromosome column is
                cast to int before being compared with ``region[0]``.
            chrom_col: positional index of the chromosome column.
            pos_col: positional index of the position column.
            allchrom: when true, keep only the variants on the chromosome whose
                position lies strictly between ``start`` and ``end``.

        Returns:
            A pandas Index of row labels.

        NOTE(review): with ``allchrom=False`` the position window is not
        applied - every label on the chromosome is returned. Confirm that this
        is intended.
        """
        on_chrom = bim.iloc[:, chrom_col].astype(int) == region[0]
        chrom_labels = on_chrom[on_chrom].index
        positions = bim.iloc[:, pos_col][chrom_labels]
        # Both bounds are exclusive.
        inside = (positions > region[1]) & (positions < region[2])
        if not allchrom:
            return inside.index
        return chrom_labels[inside]

    from scipy.stats import norm
    def p2z(pval,beta,twoside=True):
        """Convert p-values to z-scores that carry the sign of the effect.

        Parameters:
            pval: array-like of p-values (Series, ndarray or list).
            beta: array-like of effect estimates, same length as `pval`.
            twoside: when true the p-values are treated as two-sided and are
                halved before being passed to the normal quantile function.

        Returns:
            A list of floats: ``|norm.ppf(p)|`` where ``beta > 0`` and
            ``-|norm.ppf(p)|`` otherwise (so a zero or NaN beta gives a
            non-positive z).
        """
        # Convert once so that plain lists work too, and so the quantile
        # function runs as a single vectorized call instead of once per SNP.
        pval = np.asarray(pval, dtype=float)
        beta = np.asarray(beta, dtype=float)
        if twoside:
            pval = pval / 2
        magnitude = np.abs(norm.ppf(pval))
        return np.where(beta > 0, magnitude, -magnitude).tolist()

    def regional_stat(ss,ind):
        """Return the summary statistics of the selected rows plus a `Z` column.

        Parameters:
            ss: summary-statistics DataFrame with (at least) `P` and `BETA`
                columns.
            ind: positions of the rows to keep; they are passed to ``iloc``.
                NOTE(review): the caller passes row *labels* from
                `region_index`, which only match positions when `ss` has the
                default 0..n-1 index - confirm.

        Returns:
            A new DataFrame (the input is not modified) with the selected rows
            and an extra column `Z` computed by `p2z` from `P` and `BETA`.
        """
        # Explicit copy: adding a column to an `iloc` slice triggers pandas'
        # SettingWithCopyWarning and leaves it unclear whether `ss` is touched.
        rg_ss = ss.iloc[ind,:].copy()
        rg_ss['Z'] = p2z(rg_ss.P,rg_ss.BETA)
        return rg_ss

    def plink_slice(p, pb=None,pf=None):
        """Subset PLINK data by variants and/or samples.

        Parameters:
            p: triple ``(bim, fam, bed)``; `bed` is indexed as
                ``bed[variant, sample]``.
            pb: positions of the variants to keep (rows of `bim`, first axis
                of `bed`), or None to keep all of them.
            pf: positions of the samples to keep (rows of `fam`, second axis
                of `bed`), or None to keep all of them.

        Returns:
            The triple ``(bim, fam, bed)`` after subsetting.
        """
        bim, fam, bed = p
        if pb is not None:
            bim, bed = bim.iloc[pb], bed[pb, :]
        if pf is not None:
            fam, bed = fam.iloc[pf], bed[:, pf]
        return bim, fam, bed

    def LD_matrix(bed):
        """Return the correlation matrix between the rows of `bed`.

        `bed` is transposed so that each of its rows becomes a DataFrame
        column, then the pairwise column correlations are computed with
        ``DataFrame.corr()``.

        Parameters:
            bed: 2-D array; the caller passes genotypes with one row per
                variant and one column per sample.

        Returns:
            A square DataFrame with one row/column per row of `bed`.
        """
        one_column_per_snp = pd.DataFrame(bed.transpose())
        return one_column_per_snp.corr()

    def extract_region(region,gwas,geno,pheno,unr,chroms=True,plink=True):
        """Extract summary statistics, genotypes and LD for one region.

        Parameters:
            region: ``(chrom, start, end)``.
            gwas: standardized summary statistics (chromosome in column 0,
                position in column 1, plus `BP`, `P` and `BETA`).
            geno: for PLINK, the ``(bim, fam, bed)`` triple where `bed` has a
                ``compute`` method; for BGEN, ``[reader, sample_table]`` where
                `reader` provides ``iter_variants_in_region``. In both cases
                the first column of ``geno[1]`` is read as the integer FID.
            pheno: phenotype table with an `FID` column.
            unr: table of unrelated individuals with an `FID` column.
            chroms: forwarded to `region_index` as `allchrom`.
            plink: True for PLINK input, False for BGEN input.

        Returns:
            dict with keys `stats` (regional summary statistics), `geno`
            (genotypes of the samples whose FID is in `pheno`) and `ld`
            (LD matrix computed on the samples whose FID is in `unr`).

        Raises:
            ValueError: if the variant positions of the regional genotypes
                differ from those of the regional summary statistics.
        """

        # Extract the summary stat
        ss_ind = region_index(gwas,region, chrom_col=0, pos_col=1,allchrom=chroms)
        rg_stat = regional_stat(gwas,ss_ind)
        # Positions are compared as plain arrays. Comparing two Series whose
        # indexes differ raises inside pandas instead of comparing values, and
        # `np.alltrue` no longer exists in NumPy >= 2.0.
        stat_pos = np.asarray(rg_stat.BP)
        if plink:
            rg_geno = plink_slice(geno,pb=list(ss_ind))
            if not np.array_equal(stat_pos, np.asarray(rg_geno[0].pos)):
                raise ValueError('The regional genotype file and the regional result file do not match with each other 1')
            rg_bed = rg_geno[2].compute(num_workers=1)
        else:
            print("This is bgen format")
            # NOTE(review): `region_index` uses exclusive bounds; confirm that
            # `iter_variants_in_region` selects the same variants at the edges.
            rg_geno = list(geno[0].iter_variants_in_region(region[0],region[1],region[2]))
            rg_pos = np.array([i[0].pos for i in rg_geno])
            if not np.array_equal(stat_pos, rg_pos):
                raise ValueError('The regional genotype file and the regional result file do not match with each other 2')
            rg_bed=np.array([i[1] for i in rg_geno])

        fid = geno[1].iloc[:,0].astype(int)
        # Calculate the LD matrix based on unrelated individuals
        # FIXME: can we have both versions of LD -- from unrelated in the population and from unrelated in the sample? 
        # https://github.com/dianacornejo/pleiotropy_UKB/issues/1#issuecomment-643251513
        rg_ld = LD_matrix(rg_bed[:,fid.isin(unr.FID)])

        # genotypes in the sample of a specific phenotype
        # FIXME: do we have a guarantee that the sample ordering of genotypes and phenotypes match? 
        # I understand in PLINK format they do, but maybe BGEN also has that guarantee already? 
        # you can remove this comment if you are sure what you have done is correct about sample orders being consistent
        rg_bed = rg_bed[:,fid.isin(pheno.FID)]
        return dict(stats=rg_stat,geno=rg_bed,ld=rg_ld)

## Extract data

This step runs in parallel for all loci listed in the region file (via `for_each`).

In [1]:
[default (extract genotypes)]
depends: Py_Module('pandas_plink')
input: geno_path, pheno_path, sumstats_path, unrelated_samples, output_from('utils'), for_each = 'regions'
output: sumstats = f'{cwd:a}/{sumstats_path:bn}_{_regions[0]}_{_regions[1]}_{_regions[2]}.sumstats.gz',
        genotype = f'{cwd:a}/{geno_path:bn}_{pheno_path:bn}_{_regions[0]}_{_regions[1]}_{_regions[2]}.genotype.gz',
        ld = f'{cwd:a}/{geno_path:bn}_{pheno_path:bn}_{_regions[0]}_{_regions[1]}_{_regions[2]}.ld.gz'
python: expand = '${ }', input = _input[4], stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    
    # `np`, `pd` and the helper functions used below are not imported here;
    # they are expected to come from the `utils` step output (`_input[4]`).
    import os

    # Load the file of summary statistics and standardize it.
    sumstats = read_sumstat(${_input[2]:r})
    # Load phenotype file
    pheno=pd.read_csv(${_input[1]:r},header=0,sep=' ', quotechar='"')
    # Load unrelated sample file
    unr = pd.read_csv(${_input[3]:r}, header=0, sep=' ', quotechar='"')
    
    # Load genotype file for the region of interest.
    # The inventory maps chromosome ID -> genotype file; ID '0' is the
    # fallback entry for a file that holds all chromosomes.
    with open(${_input[0]:r}) as inventory:
        geno_inventory = dict(line.split() for line in inventory if line.strip())
    if "${_regions[0]}" not in geno_inventory:
        geno_file = geno_inventory['0']
    else:
        geno_file = geno_inventory["${_regions[0]}"]
    if not os.path.exists(geno_file):
        # relative path: resolve it against the directory of the inventory file
        relative_to_inventory = '${_input[0]:ad}/' + geno_file
        if not os.path.exists(relative_to_inventory):
            raise ValueError(f"Cannot find genotype file {geno_file}")
        geno_file = relative_to_inventory
    if geno_file.endswith('.bed'):
        plink = True
        from pandas_plink import read_plink
        geno = read_plink(geno_file)
        if len(sumstats)==len(geno[0]):
            # FIXME: why do we need `all_chrom`? 
            # a region is chr, start, end. It will have only one chrom to match
            all_chrom=True
        elif sum(sumstats.CHR==${_regions[0]})==len(geno[0]):
            all_chrom=False
        else:
            # FIXME: they dont always have to match each other -- in many cases they don't, when we 
            # only have reference genotype panel not the original genotype that generates the summary stats.
            # So, can we give a warning here instead, and only take the overlap between sumstats and 
            # genotype variants?
            raise ValueError('The PLINK genotypes and the result file do not match with each other')
    elif geno_file.endswith('.bgen'):
        plink = False
        from pybgen import PyBGEN
        bgen = PyBGEN(geno_file)
        # The sample table sits next to the BGEN file: same path with the
        # `.bgen` extension replaced by `.sample`. (The previous
        # `geno_file.split('.')+'.sample'` added a str to a list and always
        # raised TypeError.)
        sample_file = os.path.splitext(geno_file)[0] + '.sample'
        bgen_fam = pd.read_csv(sample_file,header=0, sep='\t', quotechar='"')
        geno = [bgen,bgen_fam]
        if len(sumstats)==bgen.nb_variants:
            all_chrom=True
        elif sum(sumstats.CHR==${_regions[0]})==bgen.nb_variants:
            all_chrom=False
        else:
            raise ValueError('The BGEN genotypes and the result file do not match with each other')
    else:
        raise ValueError('Please provide the genotype files with PLINK binary format or BGEN format')
    
    rg_info = extract_region((${_regions[0]}, ${_regions[1]}, ${_regions[2]}), sumstats, geno, pheno, unr, all_chrom, plink)
    rg_info['stats'].to_csv(${_output[0]:r}, sep = "\t", header = True, index = False)
    np.savetxt(${_output[1]:r}, rg_info['geno'])
    np.savetxt(${_output[2]:r}, rg_info['ld'])