# SuSiE RSS Overlap

Using the intermediate fine-mapping results produced by the SuSiE RSS fine-mapping pipeline, this pipeline determines, for each region of interest, whether the credible-set variants of two or more phenotypes overlap.

To run this notebook, follow the example below:

```
sos run SuSiE_RSS.ipynb \
    --cwd /gpfs/gibbs/pi/dewan/data/UKBiobank/results/fine_mapping/f3393_hearing_aid \
    --finemapped_region_dirs_file /gpfs/gibbs/pi/dewan/data/UKBiobank/results/region_extraction/f3393_hearing_aid \
    --region_file /gpfs/gibbs/pi/dewan/data/UKBiobank/results/region_extraction/f3393_hearing_aid/regions.txt \
    --sumstats_path /gpfs/gibbs/pi/dewan/data/UKBiobank/results/FastGWA_results/results_imputed_data/f3393_hearing_aid/*.snp_stats.gz \
    --container_lmm /home/dc2325/scratch60/lmm_v_1_4.sif \
    --container_marp /gpfs/gibbs/pi/dewan/data/UKBiobank/marp.sif -s build
```

In [1]:
[global]
# Text file listing, one per line, the directories that hold the finemapped results (one directory per phenotype)
parameter: finemapped_region_dirs_file = path
# The region file after LD clumping
parameter: region_file = path
# parameter: sumstats_file = path
# The directory for output files
parameter: cwd = path
## The container with the lmm/marp software. Can be either a dockerhub image or a singularity `sif` file.
parameter: container_lmm = 'statisticalgenetics/lmm:2.0'
parameter: container_marp = 'gaow/marp'
# Specific number of threads to use
parameter: numThreads = 2

fail_if(not region_file.is_file(), msg = 'Cannot find regions to fine map. Please specify them using ``--region-file`` option.')
# Load all regions of interest, one per line. Spaces inside a line are turned into
# underscores so a region can be matched against result file names.
# Blank lines are skipped: an empty region string would match every file name.
# read_text() is used so that no file handle is left open.
regions = [x.strip().replace(' ', '_') for x in region_file.read_text().splitlines() if x.strip()]

fail_if(not finemapped_region_dirs_file.is_file(), msg = 'Cannot find directories of finemapped regions. Please specify them using ``--finemapped_region_dirs_file`` option.')
# One finemapped-results directory per non-empty line
finemapped_dirs = [x.strip() for x in finemapped_region_dirs_file.read_text().splitlines() if x.strip()]

# Default analysis name: the last path component of every directory joined with "_"
parameter: name = "_".join([ d.split("/")[-1] for d in finemapped_dirs ])

In [None]:
[default_1 (export utils script)]
# Write the helper functions that follow (load_rds and f7) to utils.py inside cwd using the report action.
depends: Py_Module('os'), Py_Module('pandas'), Py_Module('numpy')
output: f'{cwd:a}/utils.py'
report: expand = '${ }', output=f'{cwd:a}/utils.py'

# will load an rds file into python
def load_rds(filename, types=None):
    """Read an R ``.rds`` file through rpy2 and return its content as Python objects.

    Parameters
    ----------
    filename : path of the RDS file to read.
    types : optional type (or tuple of types) passed to ``isinstance``; a leaf value
        that is not an instance of it is replaced by an empty numpy array.

    Returns
    -------
    A (possibly nested) dict when the file holds an R list, otherwise the single
    value produced by the inner ``load`` helper.

    Raises
    ------
    IOError when ``filename`` is not an existing file.
    """
    import os
    import pandas as pd, numpy as np
    import rpy2.robjects as RO
    import rpy2.robjects.vectors as RV
    import rpy2.rinterface as RI
    # switch on rpy2's numpy and pandas converters before anything is read
    from rpy2.robjects import numpy2ri
    numpy2ri.activate()
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    def load(data, types, rpy2_version=3):
        """Convert one non-list R value; see the branch comments for the rules."""
        # values of an unwanted type are replaced by an empty array
        if types is not None and not isinstance(data, types):
            return np.array([])
        # FIXME: I'm not sure if I should keep two versions here
        # rpy2_version 2.9.X is more tedious but it handles BoolVector better
        # rpy2 version 3.0.1 converts bool to integer directly without dealing with
        # NA properly. It gives something like (0,1,-234235).
        # Possibly the best thing to do is to open an issue for it to the developers.
        # NOTE(review): every call in this function uses the default rpy2_version=3,
        # so the branch below is never taken from here; it also spells the NULL type
        # RI.RNULLType while the other branch uses RI.NULLType.
        if rpy2_version == 2:
            # below works for rpy2 version 2.9.X
            if isinstance(data, RI.RNULLType):
                res = None
            elif isinstance(data, RV.BoolVector):
                data = RO.r['as.integer'](data)
                res = np.array(data, dtype=int)
                # Handle c(NA, NA) situation
                # anything that is neither 0 nor 1 is treated as NA and becomes nan
                if np.sum(np.logical_and(res != 0, res != 1)):
                    res = res.astype(float)
                    res[res < 0] = np.nan
                    res[res > 1] = np.nan
            elif isinstance(data, RV.FactorVector):
                data = RO.r['as.character'](data)
                res = np.array(data, dtype=str)
            elif isinstance(data, RV.IntVector):
                res = np.array(data, dtype=int)
            elif isinstance(data, RV.FloatVector):
                res = np.array(data, dtype=float)
            elif isinstance(data, RV.StrVector):
                res = np.array(data, dtype=str)
            elif isinstance(data, RV.DataFrame):
                res = pd.DataFrame(data)
            elif isinstance(data, RV.Matrix):
                res = np.matrix(data)
            elif isinstance(data, RV.Array):
                res = np.array(data)
            else:
                # I do not know what to do for this
                # But I do not want to throw an error either
                res = str(data)
        else:
            # default path: R NULL becomes None, everything else is passed through unchanged
            if isinstance(data, RI.NULLType):
                res = None
            else:
                res = data
        # a numpy array holding exactly one element is collapsed to that element
        if isinstance(res, np.ndarray) and res.shape == (1, ):
            res = res[0]
        return res
    def load_dict(res, data, types):
        '''Fill the dict res from the R list data and return it.

        Keys are the list names, or 1-based positions when the list has no names;
        nested lists become nested dicts and other values go through load.
        '''
        names = data.names if not isinstance(data.names, RI.NULLType) else [
            i + 1 for i in range(len(data))
        ]
        for name, value in zip(names, list(data)):
            if isinstance(value, RV.ListVector):
                res[name] = {}
                res[name] = load_dict(res[name], value, types)
            else:
                res[name] = load(value, types)
        return res
    #
    if not os.path.isfile(filename):
        raise IOError('Cannot find file ``{}``!'.format(filename))
    rds = RO.r['readRDS'](filename)
    # an R list is unpacked recursively; any other object is converted directly
    if isinstance(rds, RV.ListVector):
        res = load_dict({}, rds, types)
    else:
        res = load(rds, types)
    return res


# get a unique list of the input sequence
def f7(seq):
    """Return the items of seq as a list without duplicates, in first-seen order."""
    # dict keys are unique and keep insertion order, so the first occurrence wins
    return list(dict.fromkeys(seq))

In [None]:
[default_2 (running multi pheno analysis)]
# Compare the credible sets of all phenotypes region by region; write a markdown summary and a CSV of the shared variants.
# Both outputs use the absolute form of cwd, like every other path built in this step.
depends: f'{cwd:a}/utils.py'
output: overlap_analysis_summary = f'{cwd:a}/{name}.overlapped.md', variants_csv = f'{cwd:a}/{name}_overlapped_variants.csv'
python: container=container_lmm, expand = "${ }"
    # Front matter (theme name plus CSS rules) kept as a multi-line string.
    # NOTE(review): `theme` is not referenced again in the visible script, so this
    # front matter does not appear to reach the summary file - confirm that is intended.
    theme = '''---
    theme: base-theme
    style: |
     p {
       font-size: 24px;
       height: 900px;
       margin-top:1cm;
      }
      img {
        height: 70%;
        display: block;
        margin-left: auto;
        margin-right: auto;
      }
      body {
       margin-top: auto;
       margin-bottom: auto;
       font-family: verdana;
      }
    ---    
    '''
    import numpy as np
    import pandas as pd
    import os
    import csv
    import matplotlib.pyplot as plt
    import seaborn

    # region identifiers and finemapped-result directories, substituted into the
    # script text by SoS from the values computed in the [global] section
    regions = ${regions}
    finemapped_dirs = ${finemapped_dirs}

    # separator written after each section of the markdown summary
    sep = '\n\n---\n'

    def fail_if(b, msg):
        """Raise ValueError carrying msg when b is truthy; otherwise do nothing."""
        if not b:
            return
        raise ValueError(msg)
    
    # will load an rds file into python
    def load_rds(filename, types=None):
        """Read an R ``.rds`` file through rpy2 and return its content as Python objects.

        Parameters
        ----------
        filename : path of the RDS file to read.
        types : optional type (or tuple of types) passed to ``isinstance``; a leaf value
            that is not an instance of it is replaced by an empty numpy array.

        Returns
        -------
        A (possibly nested) dict when the file holds an R list, otherwise the single
        value produced by the inner ``load`` helper.

        Raises
        ------
        IOError when ``filename`` is not an existing file.
        """
        import os
        import pandas as pd, numpy as np
        import rpy2.robjects as RO
        import rpy2.robjects.vectors as RV
        import rpy2.rinterface as RI
        # switch on rpy2's numpy and pandas converters before anything is read
        from rpy2.robjects import numpy2ri
        numpy2ri.activate()
        from rpy2.robjects import pandas2ri
        pandas2ri.activate()
        def load(data, types, rpy2_version=3):
            """Convert one non-list R value; see the branch comments for the rules."""
            # values of an unwanted type are replaced by an empty array
            if types is not None and not isinstance(data, types):
                return np.array([])
            # FIXME: I'm not sure if I should keep two versions here
            # rpy2_version 2.9.X is more tedious but it handles BoolVector better
            # rpy2 version 3.0.1 converts bool to integer directly without dealing with
            # NA properly. It gives something like (0,1,-234235).
            # Possibly the best thing to do is to open an issue for it to the developers.
            # NOTE(review): every call in this function uses the default rpy2_version=3,
            # so the branch below is never taken from here; it also spells the NULL type
            # RI.RNULLType while the other branch uses RI.NULLType.
            if rpy2_version == 2:
                # below works for rpy2 version 2.9.X
                if isinstance(data, RI.RNULLType):
                    res = None
                elif isinstance(data, RV.BoolVector):
                    data = RO.r['as.integer'](data)
                    res = np.array(data, dtype=int)
                    # Handle c(NA, NA) situation
                    # anything that is neither 0 nor 1 is treated as NA and becomes nan
                    if np.sum(np.logical_and(res != 0, res != 1)):
                        res = res.astype(float)
                        res[res < 0] = np.nan
                        res[res > 1] = np.nan
                elif isinstance(data, RV.FactorVector):
                    data = RO.r['as.character'](data)
                    res = np.array(data, dtype=str)
                elif isinstance(data, RV.IntVector):
                    res = np.array(data, dtype=int)
                elif isinstance(data, RV.FloatVector):
                    res = np.array(data, dtype=float)
                elif isinstance(data, RV.StrVector):
                    res = np.array(data, dtype=str)
                elif isinstance(data, RV.DataFrame):
                    res = pd.DataFrame(data)
                elif isinstance(data, RV.Matrix):
                    res = np.matrix(data)
                elif isinstance(data, RV.Array):
                    res = np.array(data)
                else:
                    # I do not know what to do for this
                    # But I do not want to throw an error either
                    res = str(data)
            else:
                # default path: R NULL becomes None, everything else is passed through unchanged
                if isinstance(data, RI.NULLType):
                    res = None
                else:
                    res = data
            # a numpy array holding exactly one element is collapsed to that element
            if isinstance(res, np.ndarray) and res.shape == (1, ):
                res = res[0]
            return res
        def load_dict(res, data, types):
            '''Fill the dict res from the R list data and return it.

            Keys are the list names, or 1-based positions when the list has no names;
            nested lists become nested dicts and other values go through load.
            '''
            names = data.names if not isinstance(data.names, RI.NULLType) else [
                i + 1 for i in range(len(data))
            ]
            for name, value in zip(names, list(data)):
                if isinstance(value, RV.ListVector):
                    res[name] = {}
                    res[name] = load_dict(res[name], value, types)
                else:
                    res[name] = load(value, types)
            return res
        #
        if not os.path.isfile(filename):
            raise IOError('Cannot find file ``{}``!'.format(filename))
        rds = RO.r['readRDS'](filename)
        # an R list is unpacked recursively; any other object is converted directly
        if isinstance(rds, RV.ListVector):
            res = load_dict({}, rds, types)
        else:
            res = load(rds, types)
        return res

    # get a unique list of the input sequence
    def f7(seq):
        """Return the items of seq as a list without duplicates, in first-seen order."""
        # dict keys are unique and keep insertion order, so the first occurrence wins
        return list(dict.fromkeys(seq))
    
    # using this fxn we can find the overlap of any number of phenotypes
    def find_overlap(regions, region_rds):
        """Find, per region, the credible-set variants shared by every phenotype.

        Works for any number of phenotypes.

        Parameters
        ----------
        regions : iterable of region identifiers, used as keys of region_rds.
        region_rds : dict mapping a region to a list of fine-mapping results, one
            per phenotype. Each result is indexed as result["sets"]["cs"] (None, or
            a dict of credible sets holding 1-based variant indices) and
            result["pos"] (variant positions).

        Returns
        -------
        A list aligned with regions. Each entry is a dict mapping a variant position
        to the list of its 0-based indices, one per phenotype in input order. The
        dict is empty when a phenotype has no credible set or nothing is shared.

        Errors caused by malformed results (for example a missing "pos" entry) are
        raised to the caller instead of being reported as "no overlap".
        """
        def _cs_positions(rd):
            # Map position -> 0-based index for every variant found in any credible
            # set of one phenotype; None when that phenotype has no credible set.
            cs = rd["sets"]["cs"]
            if cs is None:
                return None
            positions = dict()
            for members in cs.values():
                # a credible set with a single variant is stored as a scalar, not an array
                for snp_ind in np.atleast_1d(members):
                    snp_ind = snp_ind.item() - 1  # R indices are 1-based
                    positions[rd["pos"][snp_ind]] = snp_ind
            return positions

        tot_overlap = []
        for r in regions:
            rds = region_rds[r]

            # start with every credible variant of the first phenotype
            first = _cs_positions(rds[0])
            if first is None:
                print("one phenotype has no CS")
                tot_overlap.append(dict())
                continue
            overlap = {pos: [snp_ind] for pos, snp_ind in first.items()}

            # keep only the positions that are also credible in each later phenotype
            for rd in rds[1:]:
                current = _cs_positions(rd)
                if current is None:
                    # without a credible set nothing can be shared in this region
                    print("one phenotype has no CS")
                    overlap = dict()
                    break
                overlap = {
                    pos: inds + [current[pos]]
                    for pos, inds in overlap.items()
                    if pos in current
                }

            if not overlap:
                print(f"no overlap in region {r}")
            tot_overlap.append(overlap)

        return tot_overlap

    # return the total number of credible variants
    def total_CVariants(rd):
        """Return the total number of credible variants of one fine-mapping result.

        rd["sets"]["cs"] is either None (no credible set, count 0) or a dict of
        credible sets. A set stored as a numpy array contributes its length; any
        other value is a single variant and contributes 1.
        """
        cs = rd["sets"]["cs"]
        if cs is None:
            return 0
        return sum(
            len(members) if isinstance(members, np.ndarray) else 1
            for members in cs.values()
        )

    # every listed fine-mapping location has to be an existing directory
    for dir_path in finemapped_dirs:
        fail_if(not os.path.isdir(dir_path), msg = f'{dir_path} is not a directory')

    # region -> loaded SuSiE RSS results, gathered directory by directory
    region_rds = dict()
    for region in regions:
        loaded = [
            load_rds(os.path.join(dir_path, fname))
            for dir_path in finemapped_dirs
            for fname in os.listdir(dir_path)
            # a result file is a regular file whose name contains the region and the SuSiE RSS suffix
            if os.path.isfile(os.path.join(dir_path, fname)) and region in fname and ".SuSiE_RSS.rds" in fname
        ]
        # the number of files found must equal the number of directories
        fail_if(len(loaded) != len(finemapped_dirs), msg = f'region {region} has an error when obtaining rsd files')
        fail_if(len(loaded) <= 1, msg = 'Do not have at least two phenotypes to compare')
        region_rds[region] = loaded


    with open(${_output["overlap_analysis_summary"]:r}, "w") as f:
        tot_overlap = find_overlap(regions, region_rds)

        # first want to make a bar graph of the total number of credible variants in each phenotype for each region

        # each region will be one group
        # per group we'll have the number of variants per phenotype, and then the overlapped number

        phenos = [ d.split("/")[-1] for d in finemapped_dirs ]
    
        data = []
        col = ["Region"]
        col.extend([ "full " + p for p in phenos])
        col.append("Overlap")

        for en, r in enumerate(regions):
            temp = [r]

            # for each phenotype collect the total number of 
            for rd in region_rds[r]:
                temp.append(total_CVariants(rd))
            temp.append(len(tot_overlap[en]))
            data.append(temp)

        df = pd.DataFrame(data, columns=col)

        df.plot(x="Region", kind='bar', stacked=False, figsize=(14,5))
        plt.ylabel("Count of Credible Variants")
        plt.savefig("${cwd:a}/combo1.png", bbox_inches='tight')
    
        text_temp = ""
        text_temp += f"#\n\n Numerical Summary of Overlap \n"
        text_temp += f"![](${cwd:a}/combo1.png){sep} \n \n"
        f.write(text_temp)

        # this will be descriptive of the distribution
        data = []
        col = ["Region"]
        col.extend([ "only " + p for p in phenos])
        col.append("Overlap")

        for en, r in enumerate(regions):
            total = 0
            temp = []
            # for each phenotype collect the total number of 
            for rd in region_rds[r]:
                i = total_CVariants(rd) - len(tot_overlap[en])
                total += i
                temp.append(i)
            i = len(tot_overlap[en])
            total += i
            temp.append(i)
            temp = [t/total if total != 0 else 0 for t in temp]
            temp.insert(0, r)
            data.append(temp)

        df = pd.DataFrame(data, columns=col)

        df.plot(x="Region", kind='bar', stacked=True, title='Numerical Summary of Overlap', figsize=(14,5))
        plt.ylabel("Fraction of Credible Variants in Categories")
        plt.savefig("${cwd:a}/combo2.png", bbox_inches='tight')
    
        text_temp = ""
        text_temp += f"#\n\n Numerical Summary of Overlap \n"
        text_temp += f"![](${cwd:a}/combo2.png){sep} \n \n"
        f.write(text_temp)


        seaborn.set(style='ticks')
        variant_info = []


        for en, r in enumerate(regions):

            overlap = tot_overlap[en]
            if not overlap:
                continue # because we have no overlap in this

    
            data = dict()
            data["SNP"] = []
            data["PIP"] = []
            data["Pheno"] = []

            overlap = tot_overlap[en]
            col = ["SNP", "PIP", "Pheno"]

            for var in overlap.keys():
                for pheno_ind, rd in enumerate(region_rds[r]):
                    data["SNP"].append(var)
                    data["PIP"].append(rd["pip"][overlap[var][pheno_ind]])
                    data["Pheno"].append(phenos[pheno_ind])

            df = pd.DataFrame(data, columns=col)
            fg = seaborn.FacetGrid(data=df, hue='Pheno', hue_order=phenos, height=5, aspect=2.5)
            fg.map(plt.scatter, "SNP", 'PIP').add_legend()
            plt.title("Overlapped Variants for region "+r)
            plt.ticklabel_format(style='plain', axis='x')
            plt.ylim(0,1)
            plt.savefig(f"${cwd:a}/{r}_scatter.png", bbox_inches='tight')

            text_temp = ""
            text_temp += f"#\n\n Finemapping Overlap {r} Scatter\n"
            text_temp += f"![](${cwd:a}/{r}_scatter.png){sep} \n \n"
            f.write(text_temp)

            rds = region_rds[r]
            rd = rds[0]
            text_temp = ""
            text_temp += f"#\n\n Finemapping Overlap {r} \n"
            text_temp += "| chr number | pos | region id | \n"
            text_temp += "| --- | --- | --- | \n"

            for snp in overlap.keys():
                text_temp += f'| {rd["chr"][0]} | {snp} | {r} | \n'
                i = overlap[snp][0] # index for the first pheno. w the assumption that all ref and alts are the same
                temp = [rd["chr"][i], rd["pos"][i], rd["ref"][i], rd["alt"][i], r]
                for en, phen_rd in enumerate(region_rds[r]):
                    temp.append(phen_rd["pip"][overlap[snp][en]])
                variant_info.append(temp)
        
            text_temp += sep
            f.write(text_temp)

        col = ["chr", "pos", "ref", "alt", "rid"]
        col.extend([f"{p}_pip" for p in phenos])
        print(col)
        df = pd.DataFrame(variant_info, columns=col)
        print(df)
        df.to_csv(${_output["variants_csv"]:r}, sep = ",", header = True, index = False)



In [None]:
# Generate analysis report: an HTML file and a PPTX file, both rendered from the markdown summary with marp-cli
[default_3]
output: f"{_input['overlap_analysis_summary']:n}.html"
sh: container=container_marp, expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    node /opt/marp/.cli/marp-cli.js ${_input['overlap_analysis_summary']} -o ${_output:a} \
        --title '${region_file:bnn} overlap fine mapping analysis' \
        --allow-local-files
    node /opt/marp/.cli/marp-cli.js ${_input['overlap_analysis_summary']} -o ${_output:an}.pptx \
        --title '${region_file:bnn} overlap fine mapping analysis' \
        --allow-local-files