# Liftover summary stats from hg19 to hg38

This notebook lifts over the genomic coordinates of summary statistics from the hg19 genome build to hg38.

## Example of how to run this notebook

```
sos run liftover_hg19tohg38.ipynb \
    --input_path sumstats.snp_stats.gz \
    --cwd output \
    --container_lmm lmm.sif
```

In [None]:
[global]
# Work directory where all outputs (the exported utils.py and the lifted summary statistics) are saved
parameter: cwd = path
# Input summary statistics file to lift over (tab-separated; read as gzip first, then as plain text if that fails)
parameter: input_path = path
# If True (default), unmapped variants are written to a separate file from the mapped ones; if False, all variants are written to a single file
parameter: remove_missing = True
# Container image used by the export and liftover steps
parameter: container_lmm = 'statisticalgenetics/lmm:3.0'

In [None]:
[default_1 (export utils script)]
# Export the Python helpers defined in this step (read_regenie, sumstat_liftover, main) to utils.py in the work directory.
# The next step passes that file to its python action and calls main().
# NOTE(review): the report action sets expand = '${ }', so a dollar sign followed by an opening brace inside the script body would presumably be interpolated by SoS - keep that sequence out of the helpers.
depends:  Py_Module('pandas'), Py_Module('liftover')
output: f'{cwd:a}/utils.py'
report:container=container_lmm, expand = '${ }', output=f'{cwd:a}/utils.py'

    import pandas as pd
    from liftover import get_lifter
    
    def read_regenie(file):
        """Load a tab-separated summary statistics table and (re)build its SNP ids.

        The file is first read as gzip; if it turns out not to be a gzip
        stream it is read again as plain text.

        Parameters
        ----------
        file : path of the table; it must provide the columns CHR, POS, REF and ALT.

        Returns
        -------
        pandas.DataFrame with CHR and POS cast to int and a SNP column holding
        'chr<CHR>:<POS>:<REF>:<ALT>'. An existing SNP column is overwritten;
        a missing one is appended.
        """
        try:
            sumstats = pd.read_csv(file, compression='gzip', header=0, sep='\t', quotechar='"')
        except (OSError, ValueError):
            # gzip.BadGzipFile is an OSError; decoding/parsing failures are ValueErrors.
            # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
            sumstats = pd.read_csv(file, header=0, sep='\t', quotechar='"')
        # Item assignment instead of attribute assignment: `sumstats.SNP = ...` only
        # sets a Python attribute when the table has no SNP column yet, so the ids
        # would never reach the written output.
        sumstats['SNP'] = (
            'chr' + sumstats['CHR'].astype(str)
            + ':' + sumstats['POS'].astype(str)
            + ':' + sumstats['REF'].astype(str)
            + ':' + sumstats['ALT'].astype(str)
        )
        sumstats['CHR'] = sumstats['CHR'].astype(int)
        sumstats['POS'] = sumstats['POS'].astype(int)
        return sumstats

    def sumstat_liftover(ss, fr='hg19', to='hg38', chainmap=None):
        """Lift the CHR/POS coordinates of a summary statistics table to another build.

        Parameters
        ----------
        ss : pandas.DataFrame with the columns CHR, POS, REF and ALT. It is
            modified in place.
        fr, to : source and target genome builds handed to `get_lifter`.
        chainmap : optional pre-built lifter to reuse instead of calling
            `get_lifter(fr, to)`. It must support `chainmap[chrom][pos]` and
            return a sequence whose first item is a 3-tuple
            (chromosome name, position, ignored third value).

        Returns
        -------
        The same DataFrame with CHR, POS and SNP replaced. Variants that could
        not be lifted, or whose target chromosome name is not 'chr' followed by
        an integer, get CHR = 0 and POS = 0 and keep an id built from their
        original coordinates.

        NOTE(review): `get_lifter` is called without a `one_based` argument; the
        liftover package documents such an option - confirm the coordinate
        convention matches the POS column.
        NOTE(review): the third tuple element (presumably the strand) is
        discarded and REF/ALT are copied unchanged - confirm this is intended.
        """
        lifter = get_lifter(fr, to) if chainmap is None else chainmap
        lchr, lpos, lsnp = [], [], []
        for c, p, a0, a1 in zip(ss['CHR'], ss['POS'], ss['REF'], ss['ALT']):
            try:
                # Only the first hit is used.
                new_c, new_p, _ = lifter[int(c)][p][0]
                snp = ':'.join([new_c, str(new_p), a0, a1])
                # Strip the 3-character 'chr' prefix; non-numeric names raise here.
                new_c = int(new_c[3:])
            except Exception:
                # Best effort: any lookup/parsing failure marks the variant as
                # unmapped. Unlike a bare `except:`, this no longer swallows
                # KeyboardInterrupt/SystemExit during a long run.
                new_c, new_p = 0, 0
                snp = ':'.join(['chr' + str(c), str(p), a0, a1])
            lchr.append(new_c)
            lpos.append(new_p)
            lsnp.append(snp)
        # Item assignment so that SNP is created as a column when it is absent
        # (attribute assignment would only set a Python attribute).
        ss['CHR'] = lchr
        ss['POS'] = lpos
        ss['SNP'] = lsnp
        return ss

    def main(input_path, output_path, output_unmapped, output_mapped, remove_missing):
        """Read the summary statistics, lift them over and write the result.

        With `remove_missing` set, rows whose lifted CHR is 0 go to
        `output_unmapped` and all other rows go to `output_mapped`;
        `output_path` is not written. Otherwise the whole table is written to
        `output_path` only. Every file is a gzip-compressed, tab-separated
        table with a header and no index column.
        """
        lifted = sumstat_liftover(read_regenie(input_path))
        csv_options = dict(compression='gzip', sep="\t", header=True, index=False)

        if not remove_missing:
            lifted.to_csv(output_path, **csv_options)
            return

        # CHR == 0 is the marker sumstat_liftover uses for unmapped variants.
        is_unmapped = lifted.CHR == 0
        lifted[is_unmapped].to_csv(output_unmapped, **csv_options)
        lifted[~is_unmapped].to_csv(output_mapped, **csv_options)


In [None]:
[default_2 (do liftover)]
# Run the liftover on the input summary statistics by calling main() from the exported utils.py.
# NOTE(review): three output files are declared below, but main() writes only the mapped/unmapped pair when remove_missing is True and only the combined file otherwise - confirm that the targets left unwritten do not make this step fail.
depends: f'{cwd:a}/utils.py'
input: input_path
output: sumstats_lifted = f'{cwd}/{_input:bnn}.hg38.sumstats.gz',
        sumstats_unmapped = f'{cwd}/{_input:bnn}.hg38.sumstats_unmapped.gz',
        sumstats_mapped = f'{cwd}/{_input:bnn}.hg38.sumstats_mapped.gz'
task: trunk_workers = 1, job_size=1, walltime = '1h', mem = '8G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'
python:container=container_lmm, input = f'{cwd:a}/utils.py', expand = '${ }', stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    
    # The paths and the flag below are filled in by SoS before this script runs.
    input_path=${_input:r}
    output_path=${_output[0]:r}
    output_unmapped=${_output[1]:r}
    output_mapped=${_output[2]:r}
    remove_missing=${remove_missing}

    # main() comes from utils.py, which is handed to this action through its `input` option.
    main(input_path,output_path,output_unmapped,output_mapped,remove_missing)