# LDstore

Check tutorial [here](http://www.christianbenner.com/#)


# Installation

```
pip3 install https://files.pythonhosted.org/packages/a8/fd/f98ab7dea176f42cb61b80450b795ef19b329e8eb715b87b0d13c2a0854d/ldstore-0.1.9.tar.gz 

pip install pyliblzma
```

# Create master file

The master file is a semicolon-separated text file that contains no spaces. It has one dataset per line and, as written by this workflow, the following column names: `z;bgen;bgi;bcor;ld;n_samples;sample;incl`.

The Z file must provide the fields rsid, chromosome, position, allele1 and allele2 (rsid:chrom:pos:a1:a2). Chromosomes should be formatted as 01, 02, 03, etc.

For the sample files, remember to use only unrelated individuals from the 500K genotyped participants.

# Minimal working example

### Step 1. Create the bgen_list_file

This file is semicolon-separated, starts with a header line, and contains 2 columns: the path to the bgen file and the path to the sample file.

The code I used to get the info for the UKB specific data was

```
import glob

import pandas as pd

# All chromosome-1 region bgen files prepared for LDstore.
region_dir = '/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/'
bgen_paths = glob.glob(region_dir + '/01/*.bgen')

# Every bgen file is paired with the same UKB sample file.
sample_path = '/mnt/vast/hpc/csg/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb32285_imputedindiv.sample'
bgen_list = pd.DataFrame({
    'bgen': bgen_paths,
    'sample': [sample_path] * len(bgen_paths),
})

# Semicolon-separated with a header, as expected by the subset_bgen step.
bgen_list.to_csv('/mnt/vast/hpc/csg/UKBiobank/results/pleiotropy_AD_ARHI/111822_LDstore_files/regions_chr1_22/01/chr1_bgen_list.txt', sep=';', header=True, index=False)
```


### Step 2. File formatting and subsetting in PLINK

```
sos run LDstore.ipynb \
    subset_bgen \
    --cwd test \
    --masterfile masterfile_name \
    --bgen_list_file test/data/chr1_bgen_list.txt \
    --maf_filter 0.001 \
    --numThreads 10 \
    --mem 10G \
    --job_size 1 \
    --container ~/containers/lmm.sif
```

### Step 3. Create the z (variant list) file to run LDstore based on snplist file generated in previous step

```
sos run LDstore.ipynb \
    z_file \
    --cwd test \
    --masterfile masterfile_name \
    --numThreads 10 \
    --mem 10G \
    --job_size 1
```

### Step 4. Create the new masterfile based on subset bgen files and newly generated z files


In this step you need to provide a string for the masterfile name. The format of the samples-to-include file is described in the [LDstore tutorial](http://www.christianbenner.com/#).

```
sos run ~/project/UKBB_GWAS_dev/workflow/111722_LDstore.ipynb \
    masterfile \
    --cwd test \
    --masterfile masterfile_name\
    --number_of_samples 351430 \
    --incl_samples test/samples_to_include.incl \
    --numThreads 10 \
    --mem 1G \
    --job_size 1
```

### Step 5. Get the bcor (variant correlation) files using LDstore
```
sos run LDstore.ipynb \
    bcor\
    --cwd test \
    --masterfile masterfile_name\
    --numThreads 10 \
    --mem 100G \
    --job_size 1
```

### Step 6. Generate ld text files and save output with xz compression format

```
sos run LDstore.ipynb \
    ld\
    --cwd test \
    --masterfile masterfile_name\
    --numThreads 10 \
    --mem 100G \
    --job_size 1
```


### Step 7. Optional - LiftOver the coordinates from hg19 to hg38 using the bim file

This step is accomplished using some awk code in combination with the [UCSC liftOver](https://genome.ucsc.edu/FAQ/FAQdownloads.html#liftOver) command line tool.

```
sos run LDstore.ipynb \
     liftover \
    --cwd test \
    --masterfile 'masterfile_name'\
    --bim_name `echo test/*_hg19.bim` \
    --to_build hg38 \
    --chain_file test/hg19ToHg38.over.chain.gz
```


# Run LDstore

In [None]:
[global]
# Working directory where all outputs are written (required, no default)
parameter: cwd = path
# Name (prefix) of the masterfile: steps read/write {cwd}/{masterfile}.masterfile
parameter: masterfile = ''
# Memory requested for each task
parameter: mem = '80G'
# Walltime for the job (only the masterfile step uses it; the other steps request '48h' directly)
parameter: walltime = '36h'
# Number of threads: requested as task cores and passed to ldstore as --n-threads
parameter: numThreads = 4
# Job size: number of substeps grouped into one submitted task (trunk_size)
parameter: job_size = 1
# The container with the specific software (used by the subset_bgen step)
parameter: container = 'statisticalgenetics/lmm:2.4'
# The container used by the liftover steps
parameter: container_liftover = ''

## Step 2. Subset bgen files


In [None]:
# Subset bgen files to a specific maf, change the variant id, write snplist to create *z file and output bim files for downstream liftover
[subset_bgen]
import pandas as pd
# Semicolon-separated file with a header line; column 0 is the bgen path, column 1 the sample path
parameter: bgen_list_file = path
bgen_list = pd.read_csv(bgen_list_file,sep=";")
# This creates a list of [bgen, sample] pairs: [0] is the original *.bgen file and [1] is the *.sample file
input_list = bgen_list.iloc[:,[0,1]].values.tolist()
# Minor allele frequency threshold passed to plink2 --maf
parameter: maf_filter = 0.001
# group_by = 2 makes every substep receive one bgen/sample pair
input:input_list, group_by = 2
output: f'{cwd:a}/{_input[0]:bn}.{maf_filter}.subset.bgen', 
        f'{cwd:a}/{_input[0]:bn}.{maf_filter}.subset.sample', 
        f'{cwd:a}/{_input[0]:bn}.{maf_filter}.subset.snplist',
        f'{cwd:a}/{_input[0]:bn}.{maf_filter}.subset.bim',
        f'{cwd:a}/{_input[0]:bn}.{maf_filter}.subset.bgen.bgi'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = mem, cores = numThreads, tags = f'{_input[0]:bn}'
bash: container = container, expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    # One plink2 call writes every output that shares the ${_output[0]:n} prefix:
    # the MAF-filtered bgen (+ sample), the snplist used to build the z file and
    # the bim used for liftover. Variant IDs are rewritten with the template
    # '@:#:$r:$a' (chromosome:position:ref:alt in plink2's template syntax).
    plink2 --bgen ${_input[0]} 'ref-first' \
    --sample ${_input[1]} \
    --write-snplist \
    --out ${_output[0]:n} \
    --maf ${maf_filter} \
    --export bgen-1.2 'bits=8' 'ref-first' \
    --set-all-var-ids '@:#:$r:$a' \
    --new-id-max-allele-len 100 \
    --make-just-bim

    # Index the subset bgen; this produces the .bgen.bgi declared as the last output
    bgenix -g ${_output[0]} -index
    

## Step 3. Create the z file using the snplist from previous step

In [None]:
[z_file (Creation of the variant list for LDStore2)]
import glob
# One substep per snplist found directly under cwd
input: glob.glob(f'{cwd}/*.snplist', recursive=True), group_by = 1
output: f'{cwd:a}/{_input:bn}.z'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = mem, cores = numThreads, tags = f'{_input:bn}'
python: expand = "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import pandas as pd

    # The snplist holds one variant ID per line, formatted chrom:pos:a1:a2
    variants = pd.read_csv('${_input}', sep='\t', header=None, names=["rsid"])

    # Split the ID into the four remaining z-file columns
    fields = variants['rsid'].str.split(':', expand=True)
    fields.columns = ['chromosome', 'position', 'allele1', 'allele2']

    # Space-separated z file with header: rsid chromosome position allele1 allele2
    z_table = pd.concat([variants[["rsid"]], fields], axis=1)
    z_table.to_csv('${_output}', sep=' ', header=True, index=False)

## Step 4. Create the new masterfile with subsetted bgen

In [None]:
# Creation of the masterfile
[masterfile (Creation of the masterfile): provides=f'{cwd}/{masterfile}.masterfile']
# Number of samples present in the bgen file
parameter: number_of_samples=int
# File listing the samples to include in the analysis; its name has to end in .incl
parameter: incl_samples = path('.')
output: f'{cwd}/{masterfile}.masterfile'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
python: expand = "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'

    import glob
    import os
    import pandas as pd

    # glob returns files in arbitrary order. Sort both listings so that row i of
    # the masterfile pairs the z file and the bgen file of the same dataset.
    bgen = sorted(glob.glob(${cwd:r}+'/*.bgen'))
    z = sorted(glob.glob(${cwd:r}+'/*.z'))
    if len(z) != len(bgen):
        raise ValueError('Found %d .z files but %d .bgen files in %s; '
                         'each bgen needs exactly one z file'
                         % (len(z), len(bgen), ${cwd:r}))

    def with_ext(bgen_path, ext):
        # Swap only the trailing extension: str.replace('bgen', ...) would also
        # rewrite any "bgen" found in a directory name or in the file stem.
        return os.path.splitext(bgen_path)[0] + ext

    masterfile = pd.DataFrame({
        "z": z,
        "bgen": bgen,
        "bgi": [i + '.bgi' for i in bgen],
        "bcor": [with_ext(i, '.bcor') for i in bgen],
        "ld": [with_ext(i, '.ld') for i in bgen],
        "sample": [with_ext(i, '.sample') for i in bgen],
    })

    # Same number of samples to analyze on every row
    masterfile['n_samples'] = ${number_of_samples}
    # Same samples-to-include file on every row
    masterfile['incl'] = ${incl_samples:r}
    # Column order written to the semicolon-separated masterfile
    masterfile = masterfile[['z', 'bgen', 'bgi', 'bcor', 'ld', 'n_samples', 'sample', 'incl']]
    masterfile.to_csv(${_output:r}, sep=";", index=False, header=True)

## Step 5 and 6 Running LDStore for BCOR and LD file creation

In [None]:
# Create bdose file
[bdose]
input: f'{cwd}/{masterfile}.masterfile'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = mem, cores = numThreads, tags = f'{_input:bn}'
bash: expand= "${ }", stderr = f'{_input}.{step_name}.stderr', stdout = f'{_input}.{step_name}.stdout'   
    # Pass the masterfile itself: ${_input:n} would drop the ".masterfile"
    # extension and point ldstore at a file this workflow never creates.
    ~/ldstore_v2.0_x86_64/./ldstore_v2.0_x86_64  \
    --in-files ${_input} \
    --write-bcor --write-bdose --bdose-version 1.1 \
    --n-threads ${numThreads} 

In [None]:
# Create bcor files
[bcor]
#FIXME track the generation of the bcor files
import pandas as pd
master_list = pd.read_csv(f'{cwd}/{masterfile}.masterfile',sep=";")
# Column 1 of the masterfile holds the bgen paths; one substep per bgen file
input_list = master_list.iloc[:,[1]].values.tolist()
input: input_list, group_by=1
output: f'{cwd}/{_input:bn}.bcor'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = mem, cores = numThreads, tags = f'{_input:bn}'
bash: expand= "${ }", stderr = f'{_input}.{step_name}.stderr', stdout = f'{_input}.{step_name}.stdout'   
    # The masterfile path is interpolated by SoS. A Python f'...' literal must not
    # be used here: bash would glue the leading "f" onto the path.
    # NOTE(review): every substep passes the whole masterfile to ldstore, not only
    # the bgen of this substep -- confirm this is intended (see FIXME above).
    ~/ldstore_v2.0_x86_64/./ldstore_v2.0_x86_64  \
    --in-files ${cwd}/${masterfile}.masterfile \
    --write-bcor \
    --read-only-bgen \
    --n-threads ${numThreads} \
    --compression 'high'

In [None]:
# Calculate LD
[ld_1]
import pandas as pd
master_list = pd.read_csv(f'{cwd}/{masterfile}.masterfile',sep=";")
# Column 3 of the masterfile holds the bcor paths; one substep per bcor file
input_list = master_list.iloc[:,[3]].values.tolist()
input: input_list, group_by=1
output: f'{cwd}/{_input:bn}.ld'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = mem, cores = numThreads, tags = f'{_input:bn}'
bash: expand= "${ }", stderr = f'{_input}.{step_name}.stderr', stdout = f'{_input}.{step_name}.stdout'   
    # Convert the binary bcor file into a text LD matrix.
    # Exactly one backslash per continued line: a "\ " before the continuation
    # would hand ldstore an extra argument consisting of a single space.
    ~/ldstore_v2.0_x86_64/./ldstore_v2.0_x86_64 \
    --bcor-to-text \
    --bcor-file ${_input} \
    --ld-file ${_output}

In [None]:
# Output LD matrix as compressed xz format
[ld_2]
import pandas as pd
master_list = pd.read_csv(f'{cwd}/{masterfile}.masterfile',sep=";")
# Columns 0 and 4 of the masterfile are the z file and the ld text file
input_list = master_list.iloc[:,[0,4]].values.tolist()
# Each substep receives one (z, ld) pair; the z file only names the output
input:input_list, group_by = 2
output:  f'{cwd:a}/{_input[0]:bn}.xz'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = mem, cores = numThreads, tags = f'{_input[0]:bn}'
python: expand= "${ }", stderr = f'{_output}.{step_name}.stderr', stdout = f'{_output}.{step_name}.stdout'
    import numpy as np
    import xz

    # ndmin=2 keeps a single-variant (1x1) matrix two-dimensional; without it
    # loadtxt returns a 0-d array and np.tril fails.
    ld_matrix = np.loadtxt(${_input[1]:r}, dtype = "float16", ndmin = 2)

    # Keep the lower triangle and the diagonal; everything above the diagonal is 0.0
    lower_with_diag = np.tril(ld_matrix, k=0)

    # One space-separated line per matrix row, six decimals per value
    with xz.open('${_output}', "w+", preset=9) as f:
        for matrix_row in lower_with_diag:
            f.write(" ".join(["{:.6f}".format(x) for x in matrix_row]).encode())
            f.write(b"\n")

# Run LiftOver

In [None]:
# Run liftover
[liftover_1]
# Genome build to which you like to liftover your data
parameter: to_build = 'hg38'
# Chain file handed to liftOver
parameter: chain_file = path
# One or more bim files (e.g. those written by subset_bgen); one substep per file
parameter: bim_name = paths
input: bim_name, group_by=1
# Outputs, in order: bed built from the bim, lifted bed, bed of unmapped variants
output: f'{cwd:a}/{_input:bn}.bed',
        f'{cwd:a}/{_input:bn}.{to_build}.bed',
        f'{cwd:a}/{_input:bn}.{to_build}.unmapped.bed'
        #f'{cwd:a}/{_input:bn}.{to_build}.bim'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = mem, cores = numThreads, tags = f'{_input:bn}'
bash: container=container_liftover, expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'

    # First take the bim file generated in the subset_bgen step and re-format it to match a bed file:
    # "chr"+chromosome, position (used as both start and end), variant id, then the remaining bim columns
    # NOTE(review): start and end are the same 1-based position -- confirm liftOver handles this as intended
    awk '{print "chr"$1,$4,$4,$2,$3,$5,$6}' ${_input} > ${_output[0]}
    # Now run liftover: mapped records go to the 2nd output, unmapped ones to the 3rd
    /home/dmc2245/liftover_ucsc/liftOver ${_output[0]} ${chain_file} ${_output[1]} ${_output[2]}
    # Report the line count of the unmapped file
    echo "The number of unmapped variants is:"
    wc -l ${_output[2]}

In [None]:
# Organize bim file to account for unmapped variants
[liftover_2]
# Genome build to which you like to liftover your data
parameter: to_build = 'hg38'
# The same bim files that were given to liftover_1; one substep per file
parameter: bim_name = paths
input: bim_name, group_by=1
output: f'{cwd:a}/{_input:bn}.{to_build}.bim'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = mem, cores = numThreads, tags = f'{_input:bn}'
python: container=container_liftover, expand= "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import pandas as pd

    def merge_dataframes(df1, df2):
        """Build the lifted bim table, keeping the row order of df1.

        df1 is the original bim and df2 the lifted bed; they are left-joined on
        'id'. A variant found in df2 gets the lifted chromosome and position and
        a new id "chr:pos:major:minor". A variant missing from df2 (unmapped)
        keeps its original id, position and alleles, with chromosome set to 0.

        Column-wise operations are used instead of a per-row loop, which is very
        slow on genome-scale bim files.
        """
        merged = pd.merge(df1, df2, on='id', how='left', suffixes=('_df1', '_df2'))
        # A variant is mapped when the lifted bed supplied a position for it
        mapped = merged['pos_hg38'].notna()

        # Take the lifted value where mapped, otherwise fall back to the bim value
        chrom = merged['chr_df2'].where(mapped, '0').astype(int)
        pos = merged['pos_hg38'].where(mapped, merged['pos_hg19']).astype(int)
        cm = merged['cm_df2'].where(mapped, merged['cm_df1']).astype(int)
        minor = merged['minor_allele_df2'].where(mapped, merged['minor_allele_df1'])
        major = merged['major_allele_df2'].where(mapped, merged['major_allele_df1'])

        # New ids are only used for mapped variants; unmapped ones keep their id
        lifted_id = (chrom.astype(str) + ':' + pos.astype(str) + ':'
                     + major.astype(str) + ':' + minor.astype(str))
        variant_id = lifted_id.where(mapped, merged['id'])

        # bim column order: chromosome, id, cM, position, allele 1, allele 2
        return pd.DataFrame({
            'chr_df2': chrom,
            'id': variant_id,
            'cm_df2': cm,
            'pos_hg38': pos,
            'minor_allele_df2': minor,
            'major_allele_df2': major,
        })

    df1=pd.read_csv('${_input}',sep='\t', header=None, names=["chr","id", "cm", "pos_hg19","minor_allele","major_allele"])
    # Lifted bed written by liftover_1; located next to the input bim
    # NOTE(review): liftover_1 writes it under cwd -- confirm the bim files live in cwd
    df2=pd.read_csv('${_input:n}.${to_build}.bed',sep='\t', header=None, names=["chr","pos_hg38", "end", "id", "cm","minor_allele","major_allele"])
    # Drop the "chr" prefix that was added when building the bed file
    df2['chr'] = df2['chr'].str.replace('chr', '')
    df_final= merge_dataframes(df1,df2)
    df_final.to_csv('${_output}', header=False, sep=' ', index=False)
    