In [70]:
import pandas as pd

In [71]:
# Base directory of the hla-assoc checkout; every input file in this notebook is opened as path + "<relative file>".
path = "/home/users/jolivier/oak/users/jolivier/repos/hla-assoc/"
# Output directory prefix for the CSVs written below. It is a relative path, so it resolves
# against the kernel's working directory, and it is not created by this notebook.
outpath = "output/frequency_stratification/"

# Create csv of all posterior probabilities

In [72]:
# Map phenotype id codes to phenotype names.
# Both map files are parsed the same way: the first line is skipped as a header,
# then column 0 is used as the key and column 1 as the value of each row.
# Rows from the second file overwrite rows from the first if an id appears in both.
phe_names = {}
for map_file in ("data/highconfidenceqc_map.csv", "data/cancermap.csv"):
    # `with` closes the handle even if a malformed row raises part-way through
    with open(path + map_file, "r") as map_handle:
        map_handle.readline()  # discard the header row
        for line in map_handle:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing empty line at end of file)
                continue
            # Strip the line terminator *before* splitting, instead of chopping the last
            # character of the name field: this keeps the full name when the final row has
            # no newline, when the file uses \r\n, or when there are more than two columns.
            fields = line.rstrip("\r\n").split(",")
            phe_names[fields[0]] = fields[1]


In [73]:
# Flatten the phenotype x allelotype posterior table into long form (one row per
# non-null cell), sort by posterior probability (highest first) and write to csv.
df_post = pd.read_csv(path + "scripts/output/hap_post_plot/phe_hap_table_post_adjp.csv", index_col = 0, header=0)
post_values = df_post.values
# Positions of the non-null cells; nonzero() reports them in row-major order,
# i.e. phenotype by phenotype, and within a phenotype allelotype by allelotype.
row_pos, col_pos = (~pd.isnull(post_values)).nonzero()
kept_phe_ids = list(df_post.index.values[row_pos])
all_post = {
    "phe_ID" : kept_phe_ids,
    # phe_names must contain every phenotype id that has at least one non-null cell
    "phenotype" : [phe_names[phe] for phe in kept_phe_ids],
    "allelotype" : list(df_post.columns.values[col_pos]),
    "posterior_probability" : list(post_values[row_pos, col_pos]),
}
all_post_df = pd.DataFrame(all_post).sort_values(by=["posterior_probability"], ascending=False)
all_post_df.to_csv(outpath + "all_post_BMA.csv", index=False)

## Breakdown of significant BMA findings

In [74]:
# Keep only the (phenotype, allelotype) pairs whose posterior probability is strictly
# above the cutoff, then report how many pairs / distinct phenotypes / distinct
# allelotypes survive.
cutoff = 0.8
above_cutoff = all_post_df["posterior_probability"] > cutoff
cut_all_post_df = all_post_df[above_cutoff]

num_pairs = len(cut_all_post_df)
num_phenotypes = len(set(cut_all_post_df["phe_ID"]))
num_allelotypes = len(set(cut_all_post_df["allelotype"]))

print("Number of post probabilities above {} : {}".format(cutoff, num_pairs))
print("Number of phenotypes with post probabilities above {}: {}".format(cutoff, num_phenotypes))
print("Number of allelotypes with post probabilities above {}: {}".format(cutoff, num_allelotypes))

Number of post probabilities above 0.8 : 60
Number of phenotypes with post probabilities above 0.8: 31
Number of allelotypes with post probabilities above 0.8: 30


# Find frequency breakdown of allelotypes

In [75]:
# create allele frequency dictionary: column 0 of each row is the key, column 1 the value
allele_freq_dict = {}
with open(path + "notebooks/output/check_firth/allele_freq.csv", "r") as allele_freq_file:
    allele_freq_file.readline()  # discard the header row
    for line in allele_freq_file:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing empty line at end of file)
            continue
        fields = line.rstrip("\r\n").split(",")
        # Presumably the counts are stored as floats such as "1234.0" (the previous code
        # sliced off the last three characters) -- TODO confirm. Parsing through float()
        # gives the same integer for that format and also copes with a missing final
        # newline, \r\n line endings, plain integers, or extra trailing columns.
        allele_freq_dict[fields[0]] = int(float(fields[1]))

# the alleles BMA was run on: whitespace-separated names on the first line of the file
with open(path + "notebooks/output/compare_cutoff_pvals/BMA_allele_torun.txt", "r") as BMA_alleles_file:
    BMA_alleles = set(BMA_alleles_file.readline().split())

# only consider alleles that we ran BMA on
# (build a filtered dict rather than deleting while iterating over .keys(), which raises
# "RuntimeError: dictionary changed size during iteration" on Python 3)
allele_freq_dict = {
    allele: count
    for allele, count in allele_freq_dict.items()
    if allele in BMA_alleles
}
        

## Frequency cutoffs being used and number of allelotypes in each level

In [76]:
# Denominator used to turn each allele's count into a frequency; kept as a float so the
# division below is true division.
num_individuals = 337208.
thresh1 = 0.01
thresh2 = 0.05

# bins[0]: frequency < thresh1
# bins[1]: thresh1 <= frequency < thresh2
# bins[2]: everything else
bins = [[],[],[]]

for allele, count in allele_freq_dict.items():
    frequency = count/num_individuals
    if frequency < thresh1:
        level = 0
    elif frequency < thresh2:
        level = 1
    else:
        level = 2
    bins[level].append(allele)

summary = "num less than  {}: {} \nnum [{},{}): {} \nnum greater than {}: {}"
print(summary.format(thresh1, len(bins[0]),
                     thresh1, thresh2, len(bins[1]),
                     thresh2, len(bins[2])))

num less than  0.01: 4 
num [0.01,0.05): 10 
num greater than 0.05: 48


In [77]:
# Load the three phenotype x allelotype tables written by hap_post_plot. All three are
# read the same way: first row as header, first column as index.
hap_table_files = (
    "scripts/output/hap_post_plot/phe_hap_table_post_adjp.csv",
    "scripts/output/hap_post_plot/phe_hap_table_EV_adjp.csv",
    "scripts/output/hap_post_plot/phe_hap_table_SD_adjp.csv",
)
full_post, full_EV, full_SD = [
    pd.read_csv(path + rel_path, header=0, index_col=0)
    for rel_path in hap_table_files
]

In [79]:
# divide into rare, midlevel, and common alleles
def _slice_and_save(table, alleles, prefix):
    """Select the `alleles` columns of `table`, write them to csv, and return the selection.

    The file is written to outpath + prefix + "<thresh1>_<thresh2>.csv", so the thresholds
    used for the binning are recorded in the file name.
    """
    selected = table[alleles]
    selected.to_csv(outpath + prefix + str(thresh1) + "_" + str(thresh2) + ".csv")
    return selected


# bins[0]: alleles below thresh1
rare_post = _slice_and_save(full_post, bins[0], "rare_post_")
rare_EV = _slice_and_save(full_EV, bins[0], "rare_EV_")
rare_SD = _slice_and_save(full_SD, bins[0], "rare_SD_")

# bins[1]: alleles between thresh1 and thresh2 (files are named "uncommon_*")
mid_post = _slice_and_save(full_post, bins[1], "uncommon_post_")
mid_EV = _slice_and_save(full_EV, bins[1], "uncommon_EV_")
mid_SD = _slice_and_save(full_SD, bins[1], "uncommon_SD_")

# bins[2]: the remaining alleles
common_post = _slice_and_save(full_post, bins[2], "common_post_")
common_EV = _slice_and_save(full_EV, bins[2], "common_EV_")
common_SD = _slice_and_save(full_SD, bins[2], "common_SD_")