## Code for the LDstore pipeline

## Create the Z file

This file is created from the mfi files. To check the column description please look [here](https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=531)

The z-file for the 1725 blocks needs to be created from the variant_list file stored here `/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen`

In [None]:
import pandas as pd
# Path of the chromosome 1 mfi file that the following cells turn into a z file.
file = "/mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_mfi_chr1_v3.txt"
# The file is read as tab-separated with header=None, so the eight column names are supplied here.
z_file = pd.read_csv(file,header=None,sep="\t", names=["alternate_id", "rsid", "position", "allele1", "allele2", "maf", "minor_allele", "info_score"])

In [None]:
# Preview the first rows of the mfi table that was just loaded.
z_file.head()

In [None]:
# Work on a copy so the frame read from the mfi file stays as loaded.
z_copy = z_file.copy()
# Split alternate_id on the first ':' only: column 0 becomes 'chromosome', the
# remainder becomes 'other'. `n` is passed by keyword because pandas 2.x accepts
# only `pat` positionally in Series.str.split.
# NOTE(review): an alternate_id without ':' ends up whole in 'chromosome' —
# check the values before using this column.
z_copy=z_copy.join(z_copy['alternate_id'].str.split(':', n=1, expand=True).rename(columns={0:'chromosome', 1:'other'}))

In [None]:
# Inspect the result of the split: 'chromosome' and 'other' are now extra columns.
z_copy.head()

In [None]:
# Left-pad the chromosome label with zeros to a width of two characters (e.g. "1" -> "01").
z_copy["chromosome"]=z_copy['chromosome'].str.zfill(2)

In [None]:
# The remainder of the split alternate_id is not written to the z file, so drop it.
z_copy=z_copy.drop(columns=['other'])

In [None]:
# Check the table after padding the chromosome and dropping the helper column.
z_copy.head()

In [None]:
# None of the cells above imports `os`, so import it here; without it this cell
# raises NameError on a fresh kernel.
import os

# File name of the mfi file without its directory; reused to name the z file.
basename = os.path.basename(file)

In [None]:
# Show the file name that the z file will be written under.
basename + '.z'

In [None]:
# Write the chromosome-wide z file. The delimiter is a single space: a note
# further down in this notebook states the z file must be space-delimited, and
# the looped version of this export already writes (and is read back) with ' '.
z_copy[["rsid","chromosome", "position", "allele1", "allele2", "alternate_id", "minor_allele", "maf", "info_score"]].to_csv(
    '/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/ukb39554_imputeddataset/' + basename + ".z",
    sep=' ',
    index=False,
    header=True,
)

## 1725 LD blocks

In [None]:
import pandas as pd
# Variant listing of one LD block on chromosome 22.
file = "/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/ukb_imp_chr22_v3.22:16050408-17674295.variants"
# skiprows=1 drops the first line, header=0 replaces the file's own header row
# with `names`, and skipfooter=1 drops the last line. skipfooter is not
# supported by the C parser, so the python engine is requested explicitly
# instead of relying on pandas' fallback (which emits a ParserWarning).
z_file = pd.read_csv(
    file,
    header=0,
    sep="\t",
    skiprows=1,
    skipfooter=1,
    engine='python',
    names=['alternate_ids', 'rsid', 'chromosome', 'position', 'number_of_alleles', 'allele1', 'allele2'],
)

In [None]:
# Display the variant table of the block.
z_file

In [None]:
# Path of the chromosome 22 mfi file, which carries the maf and info_score columns.
chr22 = "/mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_mfi_chr22_v3.txt"

In [None]:
# Read the tab-separated chr22 mfi file; it is read with header=None, so column names are given here.
chr22_file = pd.read_csv(chr22,header=None,sep="\t", names=["alternate_id", "rsid", "position", "allele1", "allele2", "maf", "minor_allele", "info_score"])

In [None]:
# Display the chromosome 22 mfi table.
chr22_file

In [None]:
# Attach the mfi columns (maf, minor_allele, info_score) to the variants of the block.
# The block table is indexed by its five identifying columns and matched against the
# equally named mfi columns; how='right' keeps every row of the block table, so a
# block variant with no mfi match is kept with missing mfi values.
subset=chr22_file.join(z_file.set_index(['alternate_ids', 'rsid', 'position', 'allele1', 'allele2']), on=['alternate_id', 'rsid', 'position', 'allele1', 'allele2'], how='right')

In [None]:
# Display the joined block/mfi table.
subset

In [None]:
# Keep only variants whose MAF exceeds the threshold. Both files written from
# `rslt_df` below are named "...mafabove0.01", so the cut-off is 0.01; the
# previous literal 0.1 did not match those file names.
maf_threshold = 0.01
rslt_df = subset[subset['maf'] > maf_threshold]

In [None]:
# Display the MAF-filtered variants.
rslt_df

In [None]:
# Show the alternate_id column of the filtered variants (the IDs written out in the next cell).
rslt_df['alternate_id']

In [None]:
# Write one alternate_id per line, without header or index, for the MAF-filtered variants of the block.
rslt_df[["alternate_id"]].to_csv('/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/ukb_imp_chr22_v3.22:16050408-17674295.variants.mafabove0.01' , sep=' ', index=False, header=False)

In [None]:
# Write the space-delimited z file for the block, with a header row and the columns in the order listed here.
rslt_df[["rsid","chromosome", "position", "allele1", "allele2", "alternate_id"]].to_csv('/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/ukb_imp_chr22_v3.22:16050408-17674295.variants.mafabove0.01.z' , sep=' ', index=False, header=True)

### Automating creation of the z files

In [None]:
import glob, os, re
import pandas as pd

# Columns of the header-less, tab-separated mfi files, in file order.
mfi_columns = ["alternate_id", "rsid", "position", "allele1", "allele2", "maf", "minor_allele", "info_score"]
# Column order of the z file that is written for every chromosome.
z_columns = ["rsid","chromosome", "position", "allele1", "allele2", "alternate_id", "minor_allele", "maf", "info_score"]
out_dir = "/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/ukb39554_imputeddataset/"

# glob returns files in arbitrary order; sorting makes the run order reproducible.
flist=sorted(glob.glob("/mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_mfi_chr*_v3.txt"))
for filename in flist:
    print(filename)
    basename=os.path.basename(filename)
    print(basename)
    f_copy=pd.read_csv(filename,header=None,sep="\t", names=mfi_columns)

    # Chromosome = text before the first ':' of alternate_id. `n` must be passed
    # by keyword: pandas 2.x accepts only `pat` positionally.
    chromosome = f_copy["alternate_id"].str.split(":", n=1, expand=True)[0]

    # An alternate_id without ':' would otherwise put the whole id into the
    # chromosome column (the "incorrect chromosome column" problem recorded in
    # the troubleshooting notes). For those rows only, fall back to the
    # chromosome encoded in the file name (ukb_mfi_chr<chrom>_v3.txt).
    has_chrom = f_copy["alternate_id"].str.contains(":", regex=False, na=False)
    file_chrom = re.search(r"chr([^_]+)_v3", basename)
    if file_chrom:
        chromosome = chromosome.where(has_chrom, file_chrom.group(1))

    # Zero-pad to two characters, e.g. "1" -> "01".
    f_copy["chromosome"] = chromosome.str.zfill(2)
    f_copy[z_columns].to_csv(out_dir + basename + ".z", sep=" ", index=False, header=True)

### Create the z files with the subset of variants in a region

Additionally if you want to filter for MAF this needs to be done using the mfi file for the UKB case and providing the variants that you would like to analyze in the z file. 

In [None]:
# Report, for every region of chromosome directory 01, how many variants it
# holds and how many remain after the MAF filter. Nothing is written here (the
# export at the bottom is commented out).
regions=sorted(glob.glob('/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/01/*.variants'))
mfi_files=glob.glob("/mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_mfi_chr*[0-9]_v3.txt")
maf=0.01
# Path of the mfi file currently held in `chrom_file`; lets consecutive regions
# of the same chromosome reuse the parsed table instead of re-reading it.
loaded_mfi_file=None
for filename in regions:
    print(filename)
    basename=os.path.basename(filename)
    # The text before the first '_' of the file name, stripped of leading
    # zeros, is used to pick the matching "chr<chrom>_" mfi file.
    sp = basename.split("_")[0]
    chrom= sp.lstrip("0")
    print(basename)
    # skipfooter is only supported by the python parser engine.
    fullregion = pd.read_csv(filename,header=0,sep="\t", skiprows=1, skipfooter=1, engine='python', names=['alternate_ids', 'rsid', 'chromosome', 'position', 'number_of_alleles', 'allele1', 'allele2'])
    print('The number of variants in the region is:',fullregion.shape[0])
    mfi_file = [file for file in mfi_files if "chr" + chrom + "_" in file][0]
    if mfi_file != loaded_mfi_file:
        chrom_file = pd.read_csv(mfi_file,header=None,sep="\t", names=["alternate_id", "rsid", "position", "allele1", "allele2", "maf", "minor_allele", "info_score"])
        loaded_mfi_file = mfi_file
    # Print the mfi file path, not the DataFrame itself.
    print('The number of variants in',mfi_file,'is:',chrom_file.shape[0])
    # Right join: keep every variant of the region and attach its mfi columns.
    subset_region=chrom_file.join(fullregion.set_index(['alternate_ids', 'rsid', 'position', 'allele1', 'allele2']), on=['alternate_id', 'rsid', 'position', 'allele1', 'allele2'], how='right')
    subset_maf=subset_region[subset_region['maf'] > maf]
    print('The number of variants in the maf',maf,'filtered z_file is:',subset_maf.shape[0])
    #subset_maf[["rsid","chromosome", "position", "allele1", "allele2", "alternate_ids"]].to_csv('/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/' + basename + ".z" , sep='\t', index=False, header=True)
    

In [None]:
### Creation of the masterfile
import glob
import pandas as pd
from datetime import datetime
date = datetime.now().strftime('%Y%m%d')
cwd='/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22'
incl_samples='/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.092821_ldprun_unrelated.filtered.incl'
number_of_samples=487409

# glob returns paths in arbitrary order. The three lists are placed side by
# side below, so each is sorted to make row i of every column refer to the same
# region (this assumes the z, bgen and bgi files share their region prefixes).
z_files = sorted(glob.glob(cwd+'/22/*.z'))
bgen = sorted(glob.glob(cwd+'/22/*.bgen'))
bgi = sorted(glob.glob(cwd+'/22/*.bgen.bgi'))
if not (len(z_files) == len(bgen) == len(bgi)):
    # Columns of different length are padded with NaN by pd.concat, which
    # would leave incomplete rows in the masterfile.
    print('WARNING: file counts differ - z:', len(z_files), 'bgen:', len(bgen), 'bgi:', len(bgi))

# Output paths are derived from the bgen paths by replacing 'bgen'.
bcor=[i.replace('bgen','bcor') for i in bgen]
ld=[i.replace('bgen','ld') for i in bgen]

df_final=pd.concat(
    [
        pd.DataFrame({'z':z_files}),
        pd.DataFrame({'bgen':bgen}),
        pd.DataFrame({'bgi':bgi}),
        pd.DataFrame({'bcor':bcor}),
        pd.DataFrame({'ld':ld}),
    ],
    axis=1,
)
# Same number of samples and same sample-inclusion file on every row.
df_final['n_samples'] = number_of_samples
df_final['incl'] = incl_samples
df_final.to_csv(cwd +'/masterfile22_'+ date, sep=";", index=False, header=True)

In [None]:
# Display the masterfile table that was just written.
df_final

In [None]:
# This cell used a `result` frame that no cell defines (NameError) and built
# `df` twice without using it. It now starts from the masterfile table
# `df_final` created above and writes it under the generic masterfile name.
# NOTE(review): confirm that a copy of the chr22 table is what this file
# should contain.
result = df_final.copy()
# Same number of samples and same sample-inclusion file on every row.
result['n_samples'] = number_of_samples
result['incl'] = incl_samples
result.to_csv(cwd +'/masterfile_'+ date, sep=";", index=False, header=True)

# Test using the alternate_id instead of rsid

First, use plink2 to read the bgen file and write its SNP IDs to a file, so they can be compared with the mfi file. Please read this [document](https://biobank.ndph.ox.ac.uk/ukb/refer.cgi?id=531) to understand why the `ref-first` modifier is used.

In [None]:
# Load plink2, read the chr22 bgen file (with the 'ref-first' modifier) together with its
# sample file, and write the variant IDs with --write-snplist under the --out prefix.
# The resulting chr22_snplist.snplist is read back in a later cell.
module load PLINK/2.0
plink2 --bgen /mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr22_v3.bgen 'ref-first' \
       --sample /mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb32285_imputedindiv.sample \
       --write-snplist \
       --out /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/chr22_snplist

In [None]:
import pandas as pd
# Chromosome 22 z file; it is space-delimited and its first row is the header.
file= '/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/ukb39554_imputeddataset/ukb_mfi_chr22_v3.txt.z'
z = pd.read_csv(file,header=0,sep=" ")

In [None]:
# Preview the chromosome 22 z table.
z.head()

In [None]:
# Variant ID list written by plink2 --write-snplist; read as a single column named alternate_id.
file= '/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/chr22_snplist.snplist'
snplist = pd.read_csv(file,header=None,sep=" ", names=["alternate_id"])

In [None]:
 # Write a space-delimited copy of the chr22 z table with alternate_id as the first column and rsid moved to sixth.
 z[["alternate_id","chromosome", "position", "allele1", "allele2", "rsid", "minor_allele", "maf", "info_score"]].to_csv("/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/chr22_snplist_alternate_id.z", sep=" ", index=False, header=True)

In [None]:
# Preview the IDs that plink2 extracted from the bgen file.
snplist.head()

In [None]:
# Number of rows and columns of the z table.
z.shape

In [None]:
# Number of rows and columns of the plink2 ID list, for comparison with the z table.
snplist.shape

In [None]:
# Count how many rsid values of the z table occur among the plink2 IDs (True) and how many do not (False).
z['rsid'].isin(snplist['alternate_id']).value_counts()

In [None]:
# The reverse check: how many plink2 IDs occur among the rsid values of the z table.
snplist['alternate_id'].isin(z['rsid']).value_counts()

In [None]:
# Creation of the z file
# For every *.variants region file, attach the mfi columns, keep the variants
# above the MAF threshold and write "<region file>_<maf>.z" next to the input.
import glob, os
import pandas as pd

maf=0.001
# Sorting by path keeps files of the same directory together, so consecutive
# regions usually need the same mfi file and it is parsed only once.
regions=sorted(glob.glob('/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/**/*.variants', recursive=True))
mfi_files=glob.glob('/mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_mfi_chr*[0-9]_v3.txt')
# Path of the mfi file currently held in `chrom_file`.
loaded_mfi_file=None
for filename in regions:
    print(filename)
    basename=os.path.basename(filename)
    # The text before the first '_' of the file name, stripped of leading
    # zeros, is used to pick the matching "chr<chrom>_" mfi file.
    sp = basename.split("_")[0]
    chrom= sp.lstrip("0")
    print(basename)
    # skipfooter is only supported by the python parser engine.
    fullregion = pd.read_csv(filename,header=0,sep="\t", skiprows=1, skipfooter=1, engine='python', names=['alternate_ids', 'rsid', 'chromosome', 'position', 'number_of_alleles', 'allele1', 'allele2'])
    print('The number of variants in the region is:',fullregion.shape[0])
    mfi_file = [file for file in mfi_files if "chr" + chrom + "_" in file][0]
    if mfi_file != loaded_mfi_file:
        chrom_file = pd.read_csv(mfi_file,header=None,sep="\t", names=["alternate_id", "rsid", "position", "allele1", "allele2", "maf", "minor_allele", "info_score"])
        loaded_mfi_file = mfi_file
    print('The number of variants in',mfi_file,'is:',chrom_file.shape[0])
    # Right join: keep every variant of the region and attach its mfi columns.
    subset_region=chrom_file.join(fullregion.set_index(['alternate_ids', 'rsid', 'position', 'allele1', 'allele2']), on=['alternate_id', 'rsid', 'position', 'allele1', 'allele2'], how='right')
    subset_maf=subset_region[subset_region['maf'] > maf]
    print('The number of variants in the maf',maf,'filtered z_file is:',subset_maf.shape[0])
    # The z file has to be space-delimited (see the note on the [zfile]
    # workflow step at the end of this notebook), so write with sep=' '.
    subset_maf[["rsid","chromosome", "position", "allele1", "allele2", "alternate_id"]].to_csv(filename + '_' + str(maf) + '.z', sep=' ', index=False, header=True)

# Scripts

In [None]:
## Set the bash variables 
# Working directory, passed to the workflow as --cwd.
cwd=/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/
# Job script to generate; its name carries today's date.
ldstore_sbatch=$cwd/ldstore_test_$(date +"%Y-%m-%d").sbatch
masterfile=/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/master_file_chr22
# NOTE(review): `jobsize` is not referenced anywhere in this cell; `job_size` below is the value that is passed on.
jobsize=1
# Workflow notebook to run and the template file handed to the job-script generator.
ldstore_sos=~/project/UKBB_GWAS_dev/workflow/111722_LDstore.ipynb
tpl_file=~/project/bioworkflows/admin/csg.yml
mem='80G'
job_size=1
numThreads=4
#rsid_file=/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/chr22_snplist.snplist

# Workflow name ("default") followed by its arguments, kept as one multi-line string.
ldstore_args="""default 
    --cwd $cwd
    --masterfile $masterfile
    --numThreads $numThreads 
    --mem $mem 
    --job_size $job_size
"""

# Generate the job script from the template, the workflow notebook and the argument string.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $ldstore_sos \
    --to-script $ldstore_sbatch \
    --args "$ldstore_args"

In [None]:
# Bash variables to create the z_files
# NOTE(review): the arguments below run the `default` workflow with a masterfile, the same as
# the previous cell; only `cwd` differs. Confirm this is the intended z-file step.
# Working directory, passed to the workflow as --cwd.
cwd=/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/
# Job script to generate; its name carries today's date.
ldstore_sbatch=$cwd/ldstore_test_$(date +"%Y-%m-%d").sbatch
masterfile=/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/master_file_chr22
# NOTE(review): `jobsize` is not referenced anywhere in this cell; `job_size` below is the value that is passed on.
jobsize=1
# Workflow notebook to run and the template file handed to the job-script generator.
ldstore_sos=~/project/UKBB_GWAS_dev/workflow/111722_LDstore.ipynb
tpl_file=~/project/bioworkflows/admin/csg.yml
mem='80G'
job_size=1
numThreads=4
#rsid_file=/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/chr22_snplist.snplist

# Workflow name ("default") followed by its arguments, kept as one multi-line string.
ldstore_args="""default 
    --cwd $cwd
    --masterfile $masterfile
    --numThreads $numThreads 
    --mem $mem 
    --job_size $job_size
"""

# Generate the job script from the template, the workflow notebook and the argument string.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $ldstore_sos \
    --to-script $ldstore_sbatch \
    --args "$ldstore_args"

In [None]:
# Put the miniconda tools first on PATH and load Singularity before starting SoS.
export PATH=$HOME/miniconda3/bin:$PATH
module load Singularity/3.5.3

# Run the `default` workflow of the LDstore notebook with the alternate_id
# masterfile; stdout and stderr are both captured in the log file.
sos run /home/dmc2245/project/UKBB_GWAS_dev/workflow/111722_LDstore.ipynb default \
    --cwd /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/ \
    --masterfile /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/masterfile_chr22_alternateid \
    --numThreads 4 \
    --mem 80G \
    --job_size 1 \
    -c /home/dmc2245/project/bioworkflows/admin/csg.yml \
    -q csg \
    -s force \
    &> /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/ldstore_test_2022-11-21.log

In [None]:
#master file with alternate_id chr22
z;bgen;bgi;bcor;ld;n_samples
/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/ukb39554_imputeddataset/chr22_test.z;/mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr22_v3.bgen;/mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr22_v3.bgen.bgi;/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/ukb39554_imputeddataset/ukb_imp_chr22_v3.bcor;/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/ukb39554_imputeddataset/ukb_imp_chr22_v3.ld;487409

## Error troubleshooting

* Using the alternate_id from the bgen file creates even more errors
* Running chr22 with the `--rsid` and `--write-text` options finds duplicated rsids, which stops the run
* The issue definitely arises from the rsid, which is what is used in the bgen file, and not from the alternate_id, which is not duplicated

### Find duplicated rsids

In [None]:
import pandas as pd
# Read the plink2 variant ID list as a one-column table named "rsid".
file= '/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/chr22_snplist.snplist'
rsid = pd.read_csv(file,header=None,sep=" ",names=["rsid"])

In [None]:
# Display the ID table.
rsid

In [None]:
# Collect every ID that has already been seen earlier in the list. Iterating
# the DataFrame itself yields its column labels (just "rsid"), so the values of
# the 'rsid' column are iterated instead. `seen.add` returns None, so the
# `or` branch only records the value and never selects it.
seen = set()
dupes = [x for x in rsid['rsid'] if x in seen or seen.add(x)]

In [None]:
# Rows whose ID repeats an earlier row (the first occurrence of each ID is not included).
duplicate = rsid[rsid.duplicated()]

In [None]:
# Display the repeated IDs.
duplicate

### Find the list of variants present in the bgen file

In [None]:
# List the variants of the chr22 bgen file with bgenix and save the listing to chr22_variants.txt.
bgenix -g /mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr22_v3.bgen -list > /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/chr22_variants.txt

In [None]:
# List the variants of the chr22 bgen file, save the listing and show the rows
# for rs999224. `tee` writes the file while passing the listing on; the earlier
# `> file | grep` form sent everything to the file and left grep with no input.
bgenix -g /mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr22_v3.bgen -list \
| tee /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/chr22_variants.txt \
| grep rs999224

In [None]:
# Show the rows of the saved variant listing that mention rs999224.
# Note: the number of rows in that file does not equal the number of variants,
# because multiallelic variants are present.
grep "rs999224" /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/chr22_variants.txt

* The error arises since there are some variants in the mfi file that are not present in the bgen file 

* Next step is to remove the variants not present and re-run the analysis

* Error was on the z file with some variants not having a correct chromosome column

In [None]:
# From the LDstore stdout log, take every line containing "Error", print the
# text between the first pair of single quotes, and save it to rsid_not_found.txt.
grep "Error" /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/master_file_chr22.stdout \
| awk -F"'" '{print $2 }' \
> /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/rsid_not_found.txt

In [None]:
# Look up the not-found IDs in the bgen variant listing. -f reads the patterns
# from rsid_not_found.txt (before, the path itself was used as the pattern),
# -F treats them as fixed strings, and -w matches whole words, because a
# listing row holds more than just the ID and so can never match with -x.
grep -wFf /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/rsid_not_found.txt \
    /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/chr22_variants.txt

In [None]:
# Change into the LDstore results directory; the following cells refer to files by relative name.
cd /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/
#grep -f rsid_not_found.txt chr22_variants.txt

In [None]:
# Number of lines (reported IDs) in rsid_not_found.txt.
wc -l < rsid_not_found.txt

In [None]:
# Number of distinct IDs in rsid_not_found.txt.
sort -u rsid_not_found.txt | wc -l

In [None]:
# 11117 -  45 variants have same rsid but different alleles
# Count the rows of the variant listing that contain any of the not-found IDs as a whole word.
grep -wf rsid_not_found.txt chr22_variants.txt | wc -l 

In [None]:
# Save the rows of the chr22 z file that contain any of the not-found IDs as a whole word.
grep -wf rsid_not_found.txt /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/ukb39554_imputeddataset/ukb_mfi_chr22_v3.txt.z \
> /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/chr22_notfound.z

## New run with the block data

In [None]:
## Set the bash variables 
# Working directory, passed to the workflow as --cwd.
cwd=/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22
# Job script to generate; its name carries today's date.
ldstore_sbatch=$cwd/ldstore_blocks_test$(date +"%Y-%m-%d").sbatch
# Masterfile of the chr22 blocks, named with the date it was created on.
masterfile=/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/masterfile22_20221222
# NOTE(review): `jobsize` is not referenced anywhere in this cell; `job_size` below is the value that is passed on.
jobsize=1
# Workflow notebook to run and the template file handed to the job-script generator.
ldstore_sos=~/project/UKBB_GWAS_dev/workflow/111722_LDstore.ipynb
tpl_file=~/project/bioworkflows/admin/csg.yml
mem='40G'
job_size=1
numThreads=4

# Workflow name ("bcor") followed by its arguments, kept as one multi-line string.
ldstore_args="""bcor 
    --cwd $cwd
    --masterfile $masterfile
    --numThreads $numThreads 
    --mem $mem 
    --job_size $job_size
"""

# Generate the job script from the template, the workflow notebook and the argument string.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $ldstore_sos \
    --to-script $ldstore_sbatch \
    --args "$ldstore_args"

In [None]:
# Put the miniconda tools first on PATH and load Singularity before starting SoS.
export PATH=$HOME/miniconda3/bin:$PATH
module load Singularity/3.5.3

# Run the `bcor` workflow of the LDstore notebook on the chr22 block masterfile.
sos run /home/dmc2245/project/UKBB_GWAS_dev/workflow/111722_LDstore.ipynb bcor \
    --cwd /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22 \
    --masterfile /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/masterfile22_20221222 \
    --numThreads 4 \
    --mem 40G \
    --job_size 1

In [None]:
#master file with a block of chr22
z;bgen;bgi;bcor;ld;n_samples
/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/ukb_imp_chr22_v3.22:16050408-17674295.variants.z;/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/ukb_imp_chr22_v3.22:16050408-17674295.bgen;/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/ukb_imp_chr22_v3.22:16050408-17674295.bgen.bgi;/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/ukb_imp_chr22_v3.22:16050408-17674295.bgen.bcor;/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/113022_regions_bgen/ukb_imp_chr22_v3.22:16050408-17674295.bgen.ld;487409

## Use tabix to retrieve region

In [None]:
# Query region 22:16050408-17674295 of the chr22 mfi file with tabix.
# NOTE(review): the path points at the plain .txt file; confirm that tabix can query it
# without a compressed, indexed copy.
tabix /mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_mfi_chr22_v3.txt 22:16050408-17674295

In [None]:
# Show the first lines of the local copy of the chr22 mfi file.
head ~/output/ldstore/ukb_mfi_chr22_v3.txt

In [None]:
# NOTE(review): presumably an attempt to index the local mfi copy; the option string `-csi`
# is kept as written — confirm it against the tabix usage text.
tabix ~/output/ldstore/ukb_mfi_chr22_v3.txt -csi

In [None]:
# Creation of the z file: it has to be space delimited and not tab delimited
# One substep per row of `region_file`: read "<cwd>/<chr>/<chr>_<start>_<stop>.variants",
# attach the mfi columns of the matching chromosome, keep variants with maf above
# `maf`, and write the space-delimited z file declared in `output`.
[zfile]
# Directory that holds the per-chromosome folders with the *.variants files
parameter: cwd = path
# Tab-separated region table with a header row and three columns (chr, start, stop)
parameter: region_file = path
# Glob pattern (one string) matching the per-chromosome mfi files
parameter: mfi_files = ''
# Variants with maf above this value are kept
parameter: maf = 0.0001
parameter: numThreads = 4
parameter: job_size = 5
parameter: mem = '40G'
parameter: walltime ='2h'
import pandas as pd
import glob
df=pd.read_csv(region_file,header=0,sep="\t", names=["chr", "start", "stop"], dtype=str)
df.columns = df.columns.str.strip()
df1 = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# "chr1" -> "1" -> "01"
df1['chr'] = df1['chr'].str.replace('chr','')
df1['chr'] = df1['chr'].str.zfill(2)
region=df1.values.tolist()
mfi=glob.glob(mfi_files)
input:for_each='region'
# `_region` is the row of the current substep; `region` is the whole list and
# would give every substep the same output name.
output:f"{cwd}/test/{_region[0]}/{_region[0]}_{_region[1]}_{_region[2]}.{maf}.z"
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
python: expand = "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import glob, os
    import pandas as pd
    # Workflow values are only visible inside this script through the ${ } sigil;
    # plain {name} (or an f-string) would refer to names the script never defines.
    in_file="${cwd}/${_region[0]}/${_region[0]}_${_region[1]}_${_region[2]}.variants"
    basename=os.path.basename(in_file)
    sp = basename.split("_")[0]
    chrom= sp.lstrip("0")
    print(basename)
    # ${mfi!r} pastes the list of mfi paths into the script as a Python list literal.
    mfi_file = [file for file in ${mfi!r} if "chr" + chrom + "_" in file][0]
    # skipfooter is only supported by the python parser engine.
    fullregion = pd.read_csv(in_file,header=0,sep="\t", skiprows=1, skipfooter=1, engine='python', names=['alternate_ids', 'rsid', 'chromosome', 'position', 'number_of_alleles', 'allele1', 'allele2'])
    print('The number of variants in the region is:',fullregion.shape[0])
    chrom_file = pd.read_csv(mfi_file,header=None,sep="\t", names=["alternate_id", "rsid", "position", "allele1", "allele2", "maf", "minor_allele", "info_score"])
    print('The number of variants in',mfi_file,'is:',chrom_file.shape[0])
    # Right join: keep every variant of the region and attach its mfi columns.
    subset_region=chrom_file.join(fullregion.set_index(['alternate_ids', 'rsid', 'position', 'allele1', 'allele2']), on=['alternate_id', 'rsid', 'position', 'allele1', 'allele2'], how='right')
    subset_maf=subset_region[subset_region['maf'] > ${maf}]
    print('The number of variants in the maf',${maf},'filtered z_file is:',subset_maf.shape[0])
    subset_maf[["rsid","chromosome", "position", "allele1", "allele2", "alternate_id"]].to_csv(${_output:r}, sep=' ', index=False, header=True)

In [None]:
# Put the miniconda tools first on PATH and load Singularity before starting SoS.
export PATH=$HOME/miniconda3/bin:$PATH
module load Singularity/3.5.3

# Run the `zfile` step. The --mfi_files pattern is quoted so that it reaches the
# workflow as ONE string (the step expands it itself with glob); unquoted, the
# shell would replace it with the list of matching files.
sos run /home/dmc2245/project/UKBB_GWAS_dev/code/python/LD_Store_test.ipynb \
    zfile \
    --cwd /mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/ \
    --region_file /mnt/vast/hpc/csg/UKBiobank/data/ldblocks/EUR/fourier_ls-all.bed \
    --mfi_files '/mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_mfi_chr*[0-9]_v3.txt' \
    --numThreads 4 \
    --mem 40G \
    --job_size 1