### Notebook used to QC SEA imputed data

Allocate computing resources on Slurm cluster for imputation data QC


```bash
$ sdev -m 120 -J jNotebook
$ jupyter-notebook --no-browser --ip=0.0.0.0 --port=8010
```

IMPORTANT: Tested only with kernel version <code>Python 3.6.8</code>

Import required libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import seaborn as sns
import numpy as np
import os, glob, psutil
from zipfile import ZipFile
import subprocess
import dotenv
import re

Define helper functions used throughout the notebook

In [None]:
# Define a function to sort chromosomes as expected by humans (used during plotting)
def natural_sort(l):
    """Return the items of ``l`` sorted so that embedded numbers compare numerically.

    Each string is split into runs of digits and non-digits; digit runs are
    compared as integers and everything else case-insensitively, so "chr2"
    sorts before "chr10".
    """
    def _as_sortable(fragment):
        # Digit runs become ints, any other text is compared lower-cased
        if fragment.isdigit():
            return int(fragment)
        return fragment.lower()

    def _split_key(item):
        # The capturing group keeps the digit runs in the split result
        return list(map(_as_sortable, re.split('([0-9]+)', item)))

    return sorted(l, key=_split_key)

# Functions used to add vertical and horizontal mean/median lines during plotting
def add_mean_horizontal_line(data, var=None, **kws):
    """Draw a dashed maroon horizontal line at the mean of ``data[var]``.

    The line and a ``mean=<value>`` label are added to the current matplotlib
    axes. Nothing is drawn when ``var`` is empty or None. Extra keyword
    arguments are accepted and ignored.

    NOTE(review): the label is positioned with ``transAxes``, so its y
    coordinate (derived from the mean, a data value) is read as an axes
    fraction. It lines up with the line only when the y axis spans roughly
    0-1 - confirm this holds for every plotted variable.
    """
    # Without a column name there is nothing to summarise
    if not var:
        return

    mean_value = np.mean(data[var])
    axis = plt.gca()

    # Dashed line at the group mean
    line_style = dict(color='maroon', lw=2, ls='--')
    axis.axhline(mean_value, **line_style)

    # Put the label above the line unless that would reach the top of the panel
    if (mean_value + 0.1) < 0.9:
        label_y = mean_value + 0.1
    else:
        label_y = mean_value - 0.125

    axis.text(
        0.7,
        label_y,
        f'mean={mean_value:.2f}',
        # positions are given in axes coordinates, (0,0) to (1,1)
        transform=axis.transAxes,
        color='maroon',
        fontweight='bold',
        fontsize=8,
    )

def add_median_horizontal_line(data, var=None, **kws):
    """Draw a dashed maroon horizontal line at the median of ``data[var]``.

    The line and a ``median=<value>`` label are added to the current
    matplotlib axes. Nothing is drawn when ``var`` is empty or None. Extra
    keyword arguments are accepted and ignored, which lets the function be
    passed to ``FacetGrid.map_dataframe``.

    NOTE(review): the label is positioned with ``transAxes``, so its y
    coordinate (derived from the median, a data value) is read as an axes
    fraction. It lines up with the line only when the y axis spans roughly
    0-1 - confirm this holds for every plotted variable.
    """
    # Without a column name there is nothing to summarise
    if not var:
        return

    median_value = np.median(data[var])
    axis = plt.gca()

    # Dashed line at the group median
    line_style = dict(color='maroon', lw=2, ls='--')
    axis.axhline(median_value, **line_style)

    # Put the label above the line unless that would reach the top of the panel
    if (median_value + 0.1) < 0.9:
        label_y = median_value + 0.1
    else:
        label_y = median_value - 0.125

    axis.text(
        0.6,
        label_y,
        f'median={median_value:.2f}',
        # positions are given in axes coordinates, (0,0) to (1,1)
        transform=axis.transAxes,
        color='maroon',
        fontweight='bold',
        fontsize=8,
    )
    
def add_mean_vertical_line(data, var=None, **kws):
    """Draw a dashed maroon vertical line at the mean of ``data[var]``.

    The line and a ``mean=<value>`` label are added to the current matplotlib
    axes. Nothing is drawn when ``var`` is empty or None. Extra keyword
    arguments are accepted and ignored.

    NOTE(review): the label is positioned with ``transAxes``, so its x
    coordinate (derived from the mean, a data value) is read as an axes
    fraction. It sits next to the line only when the x axis spans roughly
    0-1 - confirm this holds for every plotted variable.
    """
    # Without a column name there is nothing to summarise
    if not var:
        return

    mean_value = np.mean(data[var])
    axis = plt.gca()

    # Dashed line at the group mean
    line_style = dict(color='maroon', lw=2, ls='--')
    axis.axvline(mean_value, **line_style)

    # Label goes to the right of the line, or to the left when the line is near the right edge
    if (mean_value + 0.1) < 0.9:
        label_x = mean_value + 0.05
    else:
        label_x = mean_value - 0.4

    axis.text(
        label_x,
        0.65,
        f'mean={mean_value:.2f}',
        # positions are given in axes coordinates, (0,0) to (1,1)
        transform=axis.transAxes,
        color='maroon',
        fontweight='bold',
        fontsize=8,
    )
    
def add_median_vertical_line(data, var=None, **kws):
    """Draw a dashed maroon vertical line at the median of ``data[var]``.

    The line and a ``median=<value>`` label are added to the current
    matplotlib axes. Nothing is drawn when ``var`` is empty or None. Extra
    keyword arguments are accepted and ignored, which lets the function be
    passed to ``FacetGrid.map_dataframe``.

    NOTE(review): the label is positioned with ``transAxes``, so its x
    coordinate (derived from the median, a data value) is read as an axes
    fraction. It sits next to the line only when the x axis spans roughly
    0-1 - confirm this holds for every plotted variable.
    """
    # Without a column name there is nothing to summarise
    if not var:
        return

    median_value = np.median(data[var])
    axis = plt.gca()

    # Dashed line at the group median
    line_style = dict(color='maroon', lw=2, ls='--')
    axis.axvline(median_value, **line_style)

    # Label goes to the right of the line, or to the left when the line is near the right edge
    if (median_value + 0.1) < 0.9:
        label_x = median_value + 0.05
    else:
        label_x = median_value - 0.4

    axis.text(
        label_x,
        0.65,
        f'median={median_value:.2f}',
        # positions are given in axes coordinates, (0,0) to (1,1)
        transform=axis.transAxes,
        color='maroon',
        fontweight='bold',
        fontsize=8,
    )



Extract all imputed zip files

In [None]:
# Extract every password-protected imputation archive found under
# ./imputed_genotypes/<imputation_server>/<genome_build>/.
# ZipFile is too slow for archives this large, so the system `unzip` binary
# is called instead.
# NOTE(review): the original comment said only info.gz files are extracted,
# but no file filter is passed to unzip, so each archive is extracted in full.
# NOTE(review): `unzip -P` puts the password on the command line, where other
# users of the machine may be able to see it - confirm this is acceptable.
dotenv.load_dotenv("./imputed_genotypes/imputation_passwords.env")

for imputation_zip_file in glob.glob( "./imputed_genotypes/*/*/*.zip"):
    print("Extracting file: {0}".format(imputation_zip_file) )

    # The glob pattern fixes the layout: ./imputed_genotypes/<server>/<build>/<file>.zip
    path_parts = imputation_zip_file.split( "/" )
    imputation_zip_file_info = {
        "root_folder": path_parts[0],
        "imputation_root_folder": path_parts[1],
        "imputation_server": path_parts[2],
        "genome_build": path_parts[3],
        "filename": path_parts[4]
    }

    # Passwords are stored in the .env file under the key <server>_<build>
    password_key = imputation_zip_file_info["imputation_server"] + "_" + imputation_zip_file_info["genome_build"]
    zip_password = os.getenv( password_key )

    # A missing key would put None in the argument list and crash subprocess.run
    if zip_password is None:
        print("WARNING: no password found for key '{0}'; skipping {1}".format(password_key, imputation_zip_file) )
        continue

    # Extract next to the archive itself
    zip_files_folder = os.path.dirname( imputation_zip_file )

    unzip_result = subprocess.run(
        ["unzip", "-P", zip_password, "-d", zip_files_folder, imputation_zip_file],
        stdout=subprocess.PIPE
    )

    # Report failures (e.g. wrong password) instead of ignoring the exit status
    if unzip_result.returncode != 0:
        print("WARNING: unzip exited with status {0} for {1}".format(unzip_result.returncode, imputation_zip_file) )

Read <code>info.gz</code> files with metrics from imputation

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import seaborn as sns
import numpy as np
import os, glob, psutil
from zipfile import ZipFile
import subprocess
import dotenv
import re

# All relative paths below are resolved against the TOPMed imputation folder
os.chdir("/labs/tassimes/rodrigoguarischi/projects/sea/imputed_data/topmed/")

li = []

file_list = glob.glob( "./**/*.info.gz", recursive = True )

# Read all info.gz files printing memory stats during loading
for n_file, current_info_file in enumerate(file_list, start=1):
    # Resident memory of this process, in MB
    memory_used = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
    print( "[{0} of {1}; {2:,.2f} MB] Reading file: {3}".format(n_file, len(file_list), memory_used, current_info_file ) )
    # Paths look like ./<race>/<chr>.info.gz, so element 1 is the cohort folder
    file_info = current_info_file.split( sep = "/" )
    current_race = file_info[1]
    current_imputation_info = pd.read_table( current_info_file )
    # Keep only variants with MAF >= 0.005 and tag them with their cohort
    info_snps_above_005 = current_imputation_info[ current_imputation_info["MAF"] >= 0.005 ]
    info_snps_above_005 = info_snps_above_005.assign( race=current_race )
    # Delete info table to save memory
    del current_imputation_info
    li.append( info_snps_above_005 )

# pd.concat raises an uninformative error on an empty list; say what is missing
if not li:
    raise ValueError( "No *.info.gz files found under {0}".format( os.getcwd() ) )

# Concatenate everything on pandas dataframe named imputation_info
imputation_info = pd.concat(li, axis=0, ignore_index=True)

# Delete li to save memory
del li

# Print memory status
memory_used = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
print( "Total memory used during loading: {0:,.2f} MB".format(memory_used) )

imputation_info

[1 of 46; 142.29 MB] Reading file: ./blacks/chr19.info.gz
[2 of 46; 142.29 MB] Reading file: ./blacks/chr18.info.gz
[3 of 46; 142.29 MB] Reading file: ./blacks/chr9.info.gz
[4 of 46; 142.29 MB] Reading file: ./blacks/chr17.info.gz
[5 of 46; 142.29 MB] Reading file: ./blacks/chr5.info.gz
[6 of 46; 142.29 MB] Reading file: ./blacks/chr4.info.gz
[7 of 46; 142.29 MB] Reading file: ./blacks/chr16.info.gz
[8 of 46; 142.29 MB] Reading file: ./blacks/chr1.info.gz
[9 of 46; 142.29 MB] Reading file: ./blacks/chr12.info.gz
[10 of 46; 142.29 MB] Reading file: ./blacks/chr15.info.gz
[11 of 46; 142.29 MB] Reading file: ./blacks/chr8.info.gz
[12 of 46; 142.29 MB] Reading file: ./blacks/chr3.info.gz
[13 of 46; 142.29 MB] Reading file: ./blacks/chr7.info.gz
[14 of 46; 142.29 MB] Reading file: ./blacks/chr2.info.gz
[15 of 46; 142.29 MB] Reading file: ./blacks/chr14.info.gz
[16 of 46; 142.29 MB] Reading file: ./blacks/chr6.info.gz
[17 of 46; 142.29 MB] Reading file: ./blacks/chr21.info.gz
[18 of 46; 663.

Unnamed: 0,SNP,REF(0),ALT(1),ALT_Frq,MAF,AvgCall,Rsq,Genotyped,LooRsq,EmpR,EmpRsq,Dose0,Dose1,race
0,chr21:10035213:G:A,G,A,0.00630,0.00630,0.99435,0.22704,Imputed,-,-,-,-,-,blacks
1,chr21:10039257:T:C,T,C,0.00579,0.00579,0.99479,0.26245,Imputed,-,-,-,-,-,blacks
2,chr21:10041326:AT:A,AT,A,0.00716,0.00716,0.99365,0.18958,Imputed,-,-,-,-,-,blacks
3,chr21:10043906:A:AT,A,AT,0.02261,0.02261,0.97908,0.14589,Imputed,-,-,-,-,-,blacks
4,chr21:10055730:A:G,A,G,0.00892,0.00892,0.99186,0.20103,Imputed,-,-,-,-,-,blacks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795407,chr22:50803709:T:A,T,A,0.01478,0.01478,0.98522,0.00295,Imputed,-,-,-,-,-,whites
795408,chr22:50803843:C:G,C,G,0.06840,0.06840,0.93160,0.08471,Imputed,-,-,-,-,-,whites
795409,chr22:50804129:A:T,A,T,0.04074,0.04074,0.95926,0.02850,Imputed,-,-,-,-,-,whites
795410,chr22:50805809:C:T,C,T,0.00542,0.00542,0.99458,0.00573,Imputed,-,-,-,-,-,whites


In [18]:
def percentile(n):
    """Build an aggregation callable that returns the ``n``-th percentile.

    The returned function is named ``Q_<n>`` so that it shows up under that
    label when used in a pandas ``agg`` list.
    """
    def _nth_percentile(values):
        return np.percentile(values, n)

    _nth_percentile.__name__ = 'Q_%s' % n
    return _nth_percentile

# Summarise imputation Rsq per race for the variants at or above each MAF threshold.
# The previous filter combined `>=` and `<=` on the same threshold, which kept
# only variants whose MAF was exactly equal to it.
for MAF_threshold in [0.01, 0.05]:
    print( "MAF threshold >= " + str(MAF_threshold) )
    snps_above_threshold = imputation_info[ imputation_info["MAF"] >= MAF_threshold ]
    rsq_summary = snps_above_threshold.groupby( [ "race" ] )["Rsq"].agg(
        [ "count", "min", percentile(25), "mean", "median", percentile(75), "max" ]
    ).round(4)
    print( rsq_summary )
    print( "\n" )


MAF threshold >= 0.01
         count     min    Q_25    mean  median    Q_75  max
race                                                       
blacks  403044  0.0040  0.4777  0.6237  0.6670  0.8058  1.0
whites  251991  0.0018  0.2894  0.5196  0.5338  0.7467  1.0


MAF threshold >= 0.05
         count     min    Q_25    mean  median    Q_75  max
race                                                       
blacks  226229  0.0213  0.4649  0.6201  0.6613  0.8064  1.0
whites  170944  0.0100  0.3250  0.5457  0.5705  0.7760  1.0


MAF threshold >= 0.1
         count     min    Q_25    mean  median    Q_75  max
race                                                       
blacks  161300  0.0213  0.4524  0.6128  0.6532  0.8025  1.0
whites  134202  0.0150  0.3330  0.5476  0.5751  0.7747  1.0


MAF threshold >= 0.25
        count     min    Q_25    mean  median    Q_75  max
race                                                      
blacks  77576  0.0213  0.4200  0.5947  0.6366  0.7920  1.0
whites  74

In [None]:
imputation_root_folder = "./imputed_genotypes/"

li = []

# Recursively collect every info.gz file below the imputation root folder.
# os.path.join avoids the doubled slashes produced by plain string concatenation.
file_list = glob.glob( os.path.join( imputation_root_folder, "**", "*.info.gz" ), recursive = True )

selected_chrs = ["chr19.info.gz", "chr20.info.gz", "chr21.info.gz", "chr22.info.gz", "chrX.info.gz"]

# Read all info.gz files printing memory stats during loading
for n_file, current_imputation_file in enumerate(file_list, start=1):

    # Resident memory of this process, in MB
    memory_used = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
    print( "[{0} of {1}; {2:,.2f} MB] Reading file: {3}".format(n_file, len(file_list), memory_used, current_imputation_file ) )

    # Expected layout: ./imputed_genotypes/<reference panel>/<genome build>/<file>
    file_info = current_imputation_file.split( sep = "/" )
    imputation_ref = file_info[2]
    genome_build = file_info[3]
    file_name = file_info[4]

    # Skip some chromosomes for testing purposes
    if file_name not in selected_chrs:
        continue

    # Tag every variant with the panel and build it was imputed against
    current_imputation_info = pd.read_table( current_imputation_file )
    current_imputation_info["Reference_Panel"] = imputation_ref
    current_imputation_info["Genome_Build"] = genome_build

    li.append( current_imputation_info )

# pd.concat raises an uninformative error on an empty list; say what is missing
if not li:
    raise ValueError( "No selected *.info.gz files found under {0}".format( imputation_root_folder ) )

# Concatenate everything on pandas dataframe named imputation_info
imputation_info = pd.concat(li, axis=0, ignore_index=True)

# Delete li to save memory
del li

# Print memory status
memory_used = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
print( "Total memory used during loading: {0:,.2f} MB".format(memory_used) )

Create <code>chr</code> and <code>pos</code> attributes in the dataframe

In [None]:
# SNP identifiers are colon-separated; the first two fields are used as
# chromosome and position
snp_id_fields = imputation_info["SNP"].str.split(":", expand = True)
imputation_info_coords = snp_id_fields[[0,1]].rename( columns = {0: "chr", 1: "pos"} )

# Attach the new columns, aligned on the row index
imputation_info = imputation_info.join( imputation_info_coords )

# Print the first 5 lines
imputation_info.head()

In [None]:
# Report the resident memory (RSS) of this kernel process, converted to MB
current_process = psutil.Process(os.getpid())
memory_used = current_process.memory_info().rss / 1024 ** 2
print( "Memory used {0:,.2f} MB".format(memory_used) )

Small adjustments to the pandas dataframes to enable proper interpretation/plotting

In [None]:
# Drop the leading "chr" so chromosome names are uniform downstream
# (per the original note, info files differ between hg19 and hg38)
imputation_info["chr"] = imputation_info["chr"].str.replace("^chr","", regex=True)

# Keep a separate dataframe holding only the variants flagged as genotyped
is_genotyped = imputation_info.Genotyped == "Genotyped"
imputation_info_genotyped = imputation_info[ is_genotyped ]

# Positions must be integers to be plotted on a numeric axis
imputation_info = imputation_info.astype( {'pos': 'int'} )

print( "Object imputation_info:\n", imputation_info.dtypes , "\n" )

# Same cast for the genotyped subset, plus the metric columns cast to float
genotyped_dtypes = {
    **dict.fromkeys( ['LooRsq', 'EmpR', 'EmpRsq', 'Dose0', 'Dose1'], 'float' ),
    'pos': 'int'
}
imputation_info_genotyped = imputation_info_genotyped.astype( genotyped_dtypes )

print( "Object imputation_info_genotyped:\n", imputation_info_genotyped.dtypes , "\n" )

In [None]:
# Bin every variant by MAF; include_lowest keeps MAF == 0 in the first bin.
# NOTE(review): the third edge here is 0.001, while the later cells of this
# notebook bin with 0.01 - confirm which value is intended.
maf_categories = [0, 0.0001, 0.001, 0.05, 0.5]

imputation_info["MAF_categories"] = pd.cut(
    x=imputation_info["MAF"],
    bins=maf_categories,
    include_lowest=True,
)

In [None]:
# Box plot of imputation Rsq per MAF bin, one box per reference panel
boxplot_options = dict(
    data=imputation_info,
    x="MAF_categories",
    y="Rsq",
    hue="Reference_Panel",
    hue_order=['michigan_hrc','topmed'],
    width=0.7,
    fliersize=2,
)
sns.boxplot(**boxplot_options)

# Anchor the legend outside the axes, to the right of the plot
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Strip plot of imputation Rsq per MAF bin; dodge separates the two panels
stripplot_options = dict(
    data=imputation_info,
    x="MAF_categories",
    y="Rsq",
    hue="Reference_Panel",
    hue_order=['michigan_hrc','topmed'],
    size=1,
    dodge=True,
)
sns.stripplot(**stripplot_options)

# Optional: move the legend outside the axes
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Overlay the individual variants (strip plot) on top of the per-bin box plot.
# `size` is a stripplot option only: boxplot forwards unknown keywords to
# matplotlib, which rejects `size` with a TypeError, so it is not passed here.
# Tip: whis=np.inf on the boxplot stops it drawing outlier markers that the
# strip plot already shows.
ax = sns.boxplot(
    x = "MAF_categories",
    y = "Rsq",
    data = imputation_info,
    hue="Reference_Panel",
    hue_order=['michigan_hrc','topmed'],
    dodge=True
    )

# Draw the points on the same axes as the boxes
ax = sns.stripplot(
    x = "MAF_categories",
    y = "Rsq",
    data = imputation_info,
    hue="Reference_Panel",
    hue_order=['michigan_hrc','topmed'],
    size = 1,
    dodge=True,
    ax = ax
    )

In [None]:
# Line plot of Rsq per MAF bin for each reference panel.
# NOTE: imputation_info_subset is created in the cell that follows this one,
# so that cell has to be executed first.
relplot_options = dict(
    data=imputation_info_subset,
    kind='line',
    x="MAF_categories",
    y="Rsq",
    hue="Reference_Panel",
    hue_order=['michigan_hrc','topmed'],
)
sns.relplot(**relplot_options)

# Optional: move the legend outside the axes
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Draw a random subset of at most 100,000 rows. sample() raises a ValueError
# when asked for more rows than the frame holds, so the size is capped.
subset_size = min( 100000, len(imputation_info) )
imputation_info_subset = imputation_info.sample( subset_size )

# Store the MAF bins as plain strings instead of intervals
imputation_info_subset["MAF_categories"] = imputation_info_subset["MAF_categories"].astype(str)

In [None]:
# Show the dtype of every column in the sampled subset
imputation_info_subset.dtypes

In [None]:
# Show the first rows of the MAF bin column. head must be called: without the
# parentheses the notebook displays the bound method instead of the rows.
imputation_info_subset["MAF_categories"].head()

In [None]:
# Histogram of Rsq in 0.1-wide bins over [0, 1], with the two reference
# panels drawn side by side in each bin
histplot_options = dict(
    data=imputation_info,
    # data=imputation_info.sample(1000),  # smaller input for quick tests
    x="Rsq",
    hue="Reference_Panel",
    hue_order=['michigan_hrc','topmed'],
    binwidth=0.1,
    binrange=(0,1),
    multiple='dodge',
    shrink=0.8,
)
g = sns.histplot(**histplot_options)

# Logarithmic count axis
g.set_yscale("log")

# Optional: move the legend outside the axes
# plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
sns.set_style('white')

# Column plotted on the y axis of every panel: Rsq
y_atribute = "Rsq"


def _format_megabases(x, p):
    # Tick label: position in whole megabases, e.g. 25000000 -> "25M"
    return f'{round(x/1000000)}M'


# One panel per chromosome (in natural order, four per row), coloured by the
# Genotyped column; every panel gets its own x range
g = sns.FacetGrid(
    data=imputation_info,
    col="chr",
    col_wrap=4,
    col_order=natural_sort( imputation_info.chr.unique() ),
    hue="Genotyped",
    sharex=False,
)

# Scatter of position against the chosen metric, then the median line on top
g.map_dataframe(sns.scatterplot, x="pos", y=y_atribute, alpha=0.4, s=6)
g.map_dataframe(add_median_horizontal_line, var=y_atribute)
g.set_axis_labels("chromosome position", y_atribute)

# Leave room at the top for a figure-level title
g.fig.subplots_adjust(top=0.90)
figure_title = "Genotyped and imputed SNPs from Michigan HRC hg19 (N={0:,})".format( len(imputation_info) )
g.fig.suptitle( figure_title )

# Reduce each panel title to the bare chromosome name, then rebuild it with
# the number of SNPs on that chromosome
imputation_info_counts_by_chr = imputation_info["chr"].value_counts()
g.set_titles( col_template="{col_name}" )
for subfig in g.axes:
    chr_id = str( subfig.get_title() )
    number_of_snps = imputation_info_counts_by_chr[ chr_id ]
    panel_title = "chr{0} (N={1:,})".format(chr_id, number_of_snps)
    subfig.set_title( panel_title )

# Show x tick labels in megabases
for ax in g.axes.flat:
    ax.xaxis.set_major_formatter( tkr.FuncFormatter(_format_megabases) )

g.add_legend()

In [None]:
# sns.set_style('white')

# # Define which attribute to use on y axis: Rsq, LooRsq, EmpR, EmpRsq, Dose0, Dose1
# y_atribute = "LooRsq"

# g = sns.FacetGrid(
#     imputation_info_genotyped,
# #   row="Reference_Panel",
#     col="chr",
#     col_wrap=4,
#     col_order=natural_sort( imputation_info_genotyped.chr.unique() ),
#     sharex=False
#     )
# g.map_dataframe(
#     sns.scatterplot,
#     x="pos",
#     y=y_atribute,
#     alpha=0.5,
#     s=10
#     )

# g.map_dataframe(
#     add_median_horizontal_line, 
#     var=y_atribute
#     )
# g.set_axis_labels(
#     "chromosome position",
#     y_atribute
#     )

# # Add a general title to the figure
# g.fig.subplots_adjust(top=0.90)
# g.fig.suptitle( "Genotyped SNPs (N={0:,})".format( len(imputation_info_genotyped) ) )

# # Edit title for the subplots
# imputation_info_genotyped_counts_by_chr = imputation_info_genotyped["chr"].value_counts()
# g.set_titles( col_template="{col_name}" )
# for subfig in g.axes:
#     chr_id = str( subfig.get_title() )
#     number_of_snps = imputation_info_genotyped_counts_by_chr[ chr_id ]
#     subfig.set_title("chr{0} (N={1:,})".format(chr_id, number_of_snps) )

# # format the labels with f-strings
# for ax in g.axes.flat:
#     ax.xaxis.set_major_formatter(tkr.FuncFormatter(lambda x, p: f'{round(x/1000000)}M'))

In [None]:
sns.set_style('white')

# Define which attribute to use on y axis: Rsq, LooRsq, EmpR, EmpRsq, Dose0, Dose1
y_atribute = "LooRsq"

# One row per chromosome and one column per reference panel
g = sns.FacetGrid(
    imputation_info_genotyped,
    row="chr",
    col="Reference_Panel",
    sharex=False
    )
g.map_dataframe(
    sns.scatterplot,
    x="pos",
    y=y_atribute,
    alpha=0.5,
    s=10
    )

g.map_dataframe(
    add_median_horizontal_line,
    var=y_atribute
    )
g.set_axis_labels(
    "chromosome position",
    y_atribute
    )

# Add a general title to the figure
g.fig.subplots_adjust(top=0.90)
g.fig.suptitle( "Genotyped SNPs (N={0:,})".format( len(imputation_info_genotyped) ) )

# Title every facet with its chromosome, panel and SNP count.
# With both `row` and `col` set, g.axes is a 2-D array, so iterating it
# directly yields rows of axes rather than single axes. The facets are
# therefore addressed by (row, column) index, and the counts are taken per
# (chr, Reference_Panel) pair.
genotyped_counts_by_facet = imputation_info_genotyped.groupby( ["chr", "Reference_Panel"] ).size()
for row_index, chr_id in enumerate(g.row_names):
    for col_index, panel_name in enumerate(g.col_names):
        # Combinations with no data have no entry in the counts
        number_of_snps = int( genotyped_counts_by_facet.get( (chr_id, panel_name), 0 ) )
        g.axes[row_index, col_index].set_title(
            "chr{0} | {1} (N={2:,})".format(chr_id, panel_name, number_of_snps)
        )

# Show x tick labels in megabases
for ax in g.axes.flat:
    ax.xaxis.set_major_formatter(tkr.FuncFormatter(lambda x, p: f'{round(x/1000000)}M'))

In [None]:
maf_categories = [0, 0.0001, 0.01, 0.05, 0.5]

# Bin the variants by MAF and keep the bin labels as plain strings
maf_bins = pd.cut( imputation_info.MAF, maf_categories )
imputation_info["MAF_categories"] = maf_bins.astype(str)

# (A variant that first computes the median Rsq per bin and plots that
# summary table is implemented in a later cell of this notebook.)

# Rsq per MAF bin, drawn straight from the full dataframe
lineplot_options = dict(
    data=imputation_info,
    x="MAF_categories",
    y="Rsq",
    color="maroon",
)
sns.lineplot(**lineplot_options)

In [None]:
sns.set_style('white')


def _format_thousands(x, p):
    # Tick label: count in whole thousands, e.g. 12000 -> "12k"
    return f'{round(x/1000)}k'


# One panel per chromosome (in natural order, four per row), coloured by the
# Genotyped column; every panel gets its own x range
g = sns.FacetGrid(
    data=imputation_info,
    col="chr",
    col_wrap=4,
    col_order=natural_sort( imputation_info.chr.unique() ),
    hue="Genotyped",
    sharex=False,
)

# Rsq histogram in 0.1-wide bins over [0, 1], then the median line on top
g.map_dataframe(sns.histplot, x="Rsq", binwidth=0.1, binrange=(0,1))
g.map_dataframe(add_median_vertical_line, var="Rsq")

# Leave room at the top for a figure-level title
g.fig.subplots_adjust(top=0.90)
figure_title = "Genotyped and imputed SNPs from Michigan HRC hg19 (N={0:,})".format( len(imputation_info) )
g.fig.suptitle( figure_title )

# Reduce each panel title to the bare chromosome name, then rebuild it with
# the number of SNPs on that chromosome
imputation_info_counts_by_chr = imputation_info["chr"].value_counts()
g.set_titles( col_template="{col_name}" )
for subfig in g.axes:
    chr_id = str( subfig.get_title() )
    number_of_snps = imputation_info_counts_by_chr[ chr_id ]
    panel_title = "chr{0} (N={1:,})".format(chr_id, number_of_snps)
    subfig.set_title( panel_title )

# Show y tick labels in thousands
for ax in g.axes.flat:
    ax.yaxis.set_major_formatter( tkr.FuncFormatter(_format_thousands) )
    # ax.set_yscale("log")

# g.set_yscale("log")
g.add_legend()

In [None]:
maf_categories = [0, 0.0001, 0.01, 0.05, 0.5]

# Bin the variants by MAF (kept as interval categories for the groupby)
imputation_info["MAF_categories"] = pd.cut(
    x=imputation_info.MAF,
    bins=maf_categories,
)

# Median Rsq inside every MAF bin
median_rsq_by_maf_tmp = imputation_info.groupby("MAF_categories")["Rsq"].median()

# Flatten the result into a two-column table with string bin labels
median_rsq_by_maf = pd.DataFrame(
    {
        "MAF_categories": median_rsq_by_maf_tmp.index.astype(str),
        "Median_observed_imput_rsq": median_rsq_by_maf_tmp.values,
    }
)

g = sns.lineplot(
    data=median_rsq_by_maf,
    x="MAF_categories",
    y="Median_observed_imput_rsq",
    color="maroon",
)

plot_title = "Genotyped and imputed SNPs from Michigan HRC hg19\n(N={0:,})".format( len(imputation_info) )
g.set_title( plot_title )
g.set(
    xlabel="Reference panel Minor Allele Frequency",
    ylabel="Median Observed Imput Rsq",
)

In [None]:
    # sns.histplot,
    # x="Rsq",
    # bins=10

sns.set_style('white')

# Bivariate histogram of MAF (x) against Rsq (y).
# A single `bins` vector is applied to both axes, which binned Rsq with the
# MAF edges and dropped every Rsq above 0.5. A pair gives each axis its own
# edges: the MAF categories on x and ten equal-width bins over [0, 1] on y.
maf_bin_edges = [0, 0.0001, 0.01, 0.05, 0.5]
rsq_bin_edges = np.linspace(0, 1, 11)

sns.histplot(
    x = "MAF",
    y = "Rsq",
    # hue="maf_categories",
    data = imputation_info,
    bins = (maf_bin_edges, rsq_bin_edges)
    # stat='probability',
    # cumulative=True,
    # fill=False,
    # element='step'
)


In [None]:
# imputation_info["maf_categories"] = pd.cut(
#     imputation_info,
#     [-np.inf, 0.001, 0.1, 0.2, 0.3, np.inf]
#     )

# Mean Rsq per MAF bin. The column created earlier in this notebook is named
# "MAF_categories"; the lowercase "maf_categories" is never assigned.
print( imputation_info.groupby("MAF_categories")["Rsq"].mean() )

# sns.histplot(
#     x = "Rsq",
#     # y = "Rsq",
#     hue="maf_categories",
#     data = imputation_info
#     # cumulative=True,
#     # kde=True
# )
#     x = "Rsq",
#     data = imputation_info,
#     bins = 10,
#     element='step',
#     col=
#     )


Create a plot of **Rsq** along with the chromosome genomic positions

In [None]:
# Scatter plot of genomic position against the leave-one-out r2 reported by
# minimac4 (LooRsq column).
# NOTE(review): chr21_genotyped is not defined in the visible part of this
# notebook - confirm where it is created.

# Cast once and reuse; the columns are converted to numeric before plotting
chr21_pos = chr21_genotyped.pos.astype(int)
chr21_loo_rsq = chr21_genotyped.LooRsq.astype(float)

plt.scatter(
    x = chr21_pos,
    y = chr21_loo_rsq,
    marker = 'o',
    color='gray',
    s=3,
    alpha=0.75
    )

# Add horizontal line with mean LooRsq.
# The `statistics` module is never imported in this notebook, so the pandas
# mean is used instead.
mean_rsq = chr21_loo_rsq.mean()
plt.axhline(
    y=mean_rsq,
    color='red',
    linestyle='--'
    )
# Label near the right end of the x range, slightly above the line
plt.text(
    x = chr21_pos.max()*0.825,
    y = mean_rsq+0.025,
    s = "Mean R\u00b2 = {:.2f}".format(mean_rsq),
    color='red'
    )

# Adjust labels and axis
plt.title( "chr21 (N = " + str(len(chr21_genotyped)) + ")" )
plt.xlabel("chr21 genomic position")
plt.ylabel("Empirical (Leave-One-Out) R\u00b2")

In [None]:
# Histogram of the leave-one-out r2 (LooRsq) values in ten bins
loo_rsq_values = chr21_genotyped.LooRsq.astype(float)
plt.hist(x=loo_rsq_values, bins=10, color='black', alpha=0.6)

# Title with the number of variants, then axis labels
plot_title = "chr21 (N = " + str(len(chr21_genotyped)) + ")"
plt.title( plot_title )
plt.xlabel("Empirical (Leave-One-Out) R\u00b2")
plt.ylabel("Frequency")

In [None]:
# Get number of passing variants at different RSq levels
rsq_filter = [0, 0.001, 0.1, 0.2, 0.3]

# Cast once outside the loop instead of once per threshold
chr21_rsq = chr21.Rsq.astype(float)
total = len(chr21_rsq)

for c_rsq in rsq_filter:
    # Vectorised count of variants at or above the threshold
    passing = int( (chr21_rsq >= c_rsq).sum() )
    print ("Passing = {0:,}, filtered= {1:,} \t (RSq threshold = {2:,})".format(passing, total-passing, c_rsq) )

Create a plot of **MAF** by **Mean imputation Rsq** 