# LDstore

See the LDstore tutorial [here](http://www.christianbenner.com/#).


# Installation

```
pip3 install https://files.pythonhosted.org/packages/a8/fd/f98ab7dea176f42cb61b80450b795ef19b329e8eb715b87b0d13c2a0854d/ldstore-0.1.9.tar.gz
```

# Create master file

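The master file is a semicolon-separated text file that must not contain any spaces. Its first line holds the mandatory column names, and every following line describes one dataset. The `masterfile` step below writes the columns `z;bgen;bgi;bcor;ld;n_samples;incl`.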

The Z file must list its columns in the order rsid, chromosome, position, allele1, allele2 (`rsid:chrom:pos:a1:a2`). Chromosome codes must be zero-padded: 01, 02, 03, etc.

For the sample files, remember to use only unrelated individuals from the 500K genotyped participants.

# Run LDstore

In [None]:
[global]
# Working directory (required, SoS `path` type): change accordingly.
# The masterfile step searches its immediate sub-directories for .z, .bgen and
# .bgen.bgi files, and the ld_2 step writes its .npz archives here.
parameter: cwd = path
# Master file: the masterfile step uses this value as the name of the file it
# creates (a date is appended there); the bdose, bcor, ld_1 and ld_2 steps read
# the file found at this path.
parameter: masterfile = ''
# Memory requested for every task
parameter: mem = '80G'
# Wall-clock limit of the masterfile task (the other steps hard-code '48h')
parameter: walltime = '36h'
# Cores requested for every task; the bdose and bcor steps also pass it to
# LDstore as --n-threads
parameter: numThreads = 4
# Handed to every task as trunk_size
parameter: job_size = 1

## Create the z file the long way

In [None]:
# Creation of the z file: it has to be space delimited and not tab delimited.
#
# For every region listing (*.variants) found below regions_dir this script
#   1. reads the variants of the region,
#   2. looks them up in the MFI table of the matching chromosome to get their MAF,
#   3. keeps the variants whose MAF is above the threshold, and
#   4. writes them, space separated, to '<variants file>_<threshold>.z'.
# Every message is printed and also written to regions1_22.log.
import glob, os, sys
import pandas as pd

regions_dir = '/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22'
# Variants with a MAF at or below this threshold are left out of the z file
maf = 0.0001
regions = glob.glob(regions_dir + '/**/*.variants', recursive=True)
mfi_files = glob.glob('/mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_mfi_chr*[0-9]_v3.txt', recursive=True)

# Only the most recently used MFI table is kept in memory; regions are processed
# in basename order (the basename starts with the chromosome) so that regions of
# the same chromosome follow each other and the table is not re-read for each one.
loaded_mfi = None
chrom_file = None

# The context manager guarantees that the log is flushed and closed, also on errors
with open(regions_dir + '/regions1_22.log', "w") as log_file:

    def report(*parts):
        """Print a progress message and write the same line to the log file."""
        print(*parts)
        print(*parts, file=log_file)

    for filename in sorted(regions, key=os.path.basename):
        basename = os.path.basename(filename)
        report(filename)
        report(basename)
        # The text before the first "_" is the chromosome; leading zeros are removed
        # because the MFI file names are matched as "chr<chrom>_"
        chrom = basename.split("_")[0].lstrip("0")

        # The first and the last line of the listing are skipped and the header line
        # is replaced by `names` (presumably this is `bgenix -list` output - confirm).
        # skipfooter is only supported by the python parser, so it is requested
        # explicitly instead of relying on the fallback (and its warning).
        # The chromosome is read as text: parsed as a number, "01" would become 1,
        # whereas LDstore needs the zero-padded code.
        fullregion = pd.read_csv(
            filename,
            header=0,
            sep="\t",
            skiprows=1,
            skipfooter=1,
            names=['alternate_ids', 'rsid', 'chromosome', 'position', 'number_of_alleles', 'allele1', 'allele2'],
            dtype={'chromosome': str},
            engine='python',
        )
        report('The number of variants in the region is:', fullregion.shape[0])

        matching_mfi = [path for path in mfi_files if "chr" + chrom + "_" in path]
        if not matching_mfi:
            raise FileNotFoundError('No MFI file found for chromosome ' + repr(chrom) + ' (region file ' + filename + ')')
        mfi_file = matching_mfi[0]
        if mfi_file != loaded_mfi:
            chrom_file = pd.read_csv(mfi_file, header=None, sep="\t", names=["alternate_id", "rsid", "position", "allele1", "allele2", "maf", "minor_allele", "info_score"])
            loaded_mfi = mfi_file
        report('The number of variants in', mfi_file, 'is:', chrom_file.shape[0])

        # Right join: every variant of the region is kept and annotated with the MFI
        # columns; variants missing from the MFI table get a NaN maf and are removed
        # by the filter below
        subset_region = chrom_file.join(
            fullregion.set_index(['alternate_ids', 'rsid', 'position', 'allele1', 'allele2']),
            on=['alternate_id', 'rsid', 'position', 'allele1', 'allele2'],
            how='right',
        )
        subset_maf = subset_region[subset_region['maf'] > maf]
        report('The number of variants in the maf', maf, 'filtered z_file is:', subset_maf.shape[0])
        subset_maf[["rsid", "chromosome", "position", "allele1", "allele2", "alternate_id"]].to_csv(filename + '_' + str(maf) + '.z', sep=' ', index=False, header=True)

## Automatic creation of the masterfile

In [None]:
# Creation of the masterfile
[masterfile (Creation of the master file)]
# Number of samples present in the bgen file
parameter: number_of_samples=int
# Parameter for samples to be included the name has to end in .incl
parameter: incl_samples = path('.')
from datetime import datetime
date = datetime.now().strftime('%Y%m%d')
output: f'{cwd}/{masterfile}{date}'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
python: expand = "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import glob
    import pandas as pd
    z_files = glob.glob(${cwd:r}+'/*/*.z')
    df = pd.DataFrame({'z':z_files})
    bgen=glob.glob(${cwd:r}+'/*/*.bgen')
    df1 = pd.DataFrame({'bgen':bgen})
    bgi=glob.glob(${cwd:r}+'/*/*.bgen.bgi')
    df2 = pd.DataFrame({'bgi':bgi})
    df3=pd.concat([df,df1,df2], axis=1)
    bcor=[i.replace('bgen','bcor') for i in bgen]
    df4 = pd.DataFrame({'bcor':bcor})
    ld=[i.replace('bgen','ld') for i in bgen]
    df5 = pd.DataFrame({'ld':ld})
    df_final=pd.concat([df3,df4,df5], axis=1)
    
    print(df_final)
    #Add a constant number to every row in the sample column (the number of samples to analyze)
    df_final['n_samples'] = df_final.apply(lambda x:${number_of_samples}, axis=1)
    df_final['incl'] = df_final.apply(lambda x:${incl_samples:r}, axis=1)
    df_final.to_csv(${cwd}+'/'+${masterfile}+'_'+${date}, sep=";", index=False, header=True)


## Running LDStore for BCOR file creation

In [None]:
# Create bdose file
[bdose]
input: masterfile
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = mem, cores = numThreads, tags = f'{_input:bn}'
bash: expand= "${ }", stderr = f'{_input}.stderr', stdout = f'{_input}.stdout'   
    ~/ldstore_v2.0_x86_64/./ldstore_v2.0_x86_64  \
    --in-files ${_input:n} \
    --write-bcor --write-bdose --bdose-version 1.1 \
    --n-threads ${numThreads} 

In [None]:
# Create bcor files
[bcor]
# Runs LDstore on the master file with --write-bcor and --read-only-bgen, using
# numThreads threads. Unlike the bdose step, no bdose file is requested.
# stdout and stderr of the run are saved next to the master file.
# NOTE(review): --read-only-bgen presumably makes LDstore take its data from the
# bgen files only - confirm against the LDstore documentation.
input: masterfile
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = mem, cores = numThreads, tags = f'{_input:bn}'
bash: expand= "${ }", stderr = f'{_input}.stderr', stdout = f'{_input}.stdout'   
    ~/ldstore_v2.0_x86_64/./ldstore_v2.0_x86_64  \
    --in-files ${_input} \
    --write-bcor \
    --read-only-bgen \
    --n-threads ${numThreads} 

In [None]:
# Calculate LD
[ld_1]
# First step of the `ld` workflow: runs LDstore on the master file with
# --bcor-to-text (presumably this writes the correlations stored in the bcor
# files as the text files named in the `ld` column - confirm against the LDstore
# documentation). No thread count is passed here.
# stdout and stderr of the run are saved next to the master file.
input: masterfile
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = mem, cores = numThreads, tags = f'{_input:bn}'
bash: expand= "${ }", stderr = f'{_input}.stderr', stdout = f'{_input}.stdout'   
    ~/ldstore_v2.0_x86_64/./ldstore_v2.0_x86_64 \
    --in-files ${_input} \
    --bcor-to-text

In [None]:
[ld_2]
import pandas as pd
master_list = pd.read_csv(masterfile,";")
input_list = master_list.iloc[:,[0,4]].values.tolist()
input:input_list, group_by = 2
output:  f'{cwd:a}/{_input[1]:b}.npz'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = mem, cores = numThreads, tags = f'{_input[0]:bn}'
python: expand= "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import pandas as pd
    import numpy as np
    z_file=pd.read_csv(${_input[0]:r}, " ", skiprows=1, header=None)[5].to_numpy()
    np_ld = np.loadtxt(${_input[1]:r}, dtype = "float16")
    np.savez_compressed("${_output}", np_ld, z_file, allow_pickel = True)