In [1]:
import pandas as pd
import numpy as np
import rpy2.robjects as robjects
import matplotlib.pyplot as plt
from matplotlib import colors

# Get list of phenotypes and haplotypes

In [2]:
phe_list = []
# get names of cancer phe files (one name per line of cancerphes.txt)

# The `with` block closes the file even if reading raises. rstrip("\n")
# removes only a trailing newline, so a final line that lacks one keeps its
# last character (slicing with [:-1] would cut it off).
with open("cancerphes.txt", "r") as cancerphes_file:
    for line in cancerphes_file:
        phe_list.append(line.rstrip("\n"))

# get names of HC phe files
for i in range(446):
    # after filtering there are no cases for HC65
    if i != 65:
        phe_list.append("HC" + str(i))

In [3]:
# get haplotype names (one name per line of all_haps.txt)

hap_list = []

# The `with` block closes the file even if reading raises. rstrip("\n")
# removes only a trailing newline, so a final line that lacks one is not
# truncated the way line[:-1] would truncate it.
with open("all_haps.txt", "r") as hap_file:
    for line in hap_file:
        hap_list.append(line.rstrip("\n"))


# Create dataframe with p values

In [4]:
# Load the two unadjusted p-value tables; index_col=0 makes the first CSV
# column the row index. adjust_pvals later selects rows of these frames by
# phenotype key and columns by allele key.
p_1_df = pd.read_csv('../manuscript/genotype_assoc_unadj_p_all_1.csv', index_col=0)
p_2_df = pd.read_csv('../manuscript/genotype_assoc_unadj_p_all_2.csv', index_col=0)

# Subset by frequency

In [9]:
# create allele frequency dictionary: first CSV column -> integer count
allele_freq_dict = {}
with open("allele_freq.csv", "r") as allele_freq_file:
    allele_freq_file.readline()  # skip the header row
    for line in allele_freq_file:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing empty line at end of file)
            continue
        fields = line.rstrip("\n").split(",")
        # Counts are parsed via float so a value written like "180969.0" is
        # accepted; this replaces the fixed [:-3] slice, which only worked when
        # every value ended in exactly ".0" plus a newline.
        allele_freq_dict[fields[0]] = int(float(fields[1]))

In [10]:
# Spot-check: display the count stored for the key "A_201".
allele_freq_dict["A_201"]

180969

In [11]:
# create phe frequency dictionary: first CSV column -> integer count
phe_freq_dict = {}
with open("phe_freq.csv", "r") as phe_freq_file:
    phe_freq_file.readline()  # skip the header row
    for line in phe_freq_file:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing empty line at end of file)
            continue
        fields = line.rstrip("\n").split(",")
        # strip() instead of the [:-1] slice: the slice dropped the last digit
        # whenever the final line had no trailing newline.
        phe_freq_dict[fields[0]] = int(fields[1].strip())

In [12]:
# Spot-check: display the count stored for the key "1060".
phe_freq_dict["1060"]

17400

In [13]:
def adjust_pvals(p_df, phe_freq, allele_freq, phe_freq_dict, allele_freq_dict, thresh, method="BY"):
    """Adjust the p-values of sufficiently frequent phenotype/allele pairs.

    Rows and columns of ``p_df`` are first restricted to the phenotypes and
    alleles whose counts strictly exceed the given cutoffs; the remaining
    p-values are flattened and corrected together with R's ``p.adjust``.

    Parameters
    ----------
    p_df : pandas.DataFrame
        Nominal p-values; rows are selected with phenotype keys and columns
        with allele keys.
    phe_freq : number
        Keep phenotypes whose ``phe_freq_dict`` value is greater than this.
    allele_freq : number
        Keep alleles whose ``allele_freq_dict`` value is greater than this.
    phe_freq_dict : dict
        Phenotype key -> count.
    allele_freq_dict : dict
        Allele key -> count.
    thresh : float
        Adjusted p-values strictly below this are counted.
    method : str, optional
        Correction method name handed to R's ``p.adjust``. Defaults to
        ``"BY"``, the value that was previously hard-coded.

    Returns
    -------
    below_thresh : int
        Number of adjusted p-values below ``thresh``.
    total : int
        Number of p-values that were passed to ``p.adjust``.
    adj_p_df : pandas.DataFrame
        Adjusted p-values carrying the row/column labels of the subset.
    """
    phe_subset = [phe for phe, count in phe_freq_dict.items() if count > phe_freq]
    allele_subset = [allele for allele, count in allele_freq_dict.items() if count > allele_freq]

    # subset dataframe by allele_subset (columns), then phe_subset (rows)
    sub_df = p_df[allele_subset]
    sub_df = sub_df.loc[phe_subset]

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same 2-d array and is available in old and new pandas alike.
    nominal = sub_df.values
    flat_p = nominal.flatten()
    total = flat_p.shape[0]

    # run the correction in R on the flattened vector, then restore the 2-d shape
    nom_p = robjects.FloatVector(flat_p)
    rpadjust = robjects.r['p.adjust']
    adj_p = np.array(rpadjust(nom_p, method)).reshape(nominal.shape)

    # find the number of adjusted values below the threshold
    below_thresh = (adj_p < thresh).sum()
    adj_p_df = pd.DataFrame(adj_p, index=sub_df.index.values, columns=sub_df.columns.values)
    return below_thresh, total, adj_p_df

# Create table with different cutoff values

In [14]:
num_individuals = 337208 #from the cohort we have defined

# Cutoffs handed to adjust_pvals, named once instead of repeated as bare literals.
min_phe_freq = 500                         # phenotypes must have a phe_freq_dict count above this
min_allele_freq = num_individuals * 0.001  # alleles must have an allele_freq_dict count above this
p_thresh = 0.05                            # adjusted p-values below this are counted

# Keep the counts of both calls; previously the second call overwrote the first.
below_thresh_1, total_1, adj_p_1_df = adjust_pvals(p_1_df, min_phe_freq, min_allele_freq, phe_freq_dict, allele_freq_dict, p_thresh)
below_thresh_2, total_2, adj_p_2_df = adjust_pvals(p_2_df, min_phe_freq, min_allele_freq, phe_freq_dict, allele_freq_dict, p_thresh)

# Names the original cell left behind (they held the second call's values).
below_thresh, total = below_thresh_2, total_2

  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [15]:
# Write both adjusted p-value tables to CSV files (relative paths, so they
# land in the current working directory).
adj_p_1_df.to_csv("genotype_assoc_adj_p_all_1.csv")
adj_p_2_df.to_csv("genotype_assoc_adj_p_all_2.csv")

## Number of phenotypes and allelotypes with significant entries

In [16]:
# Keep only the rows that contain at least one adjusted p-value below 0.05 ...
sig_1_df = adj_p_1_df.loc[(adj_p_1_df < 0.05).any(axis=1), :]
# ... and, among those rows, only the columns that contain such a value.
sig_1_df = sig_1_df.loc[:, (sig_1_df < 0.05).any(axis=0)]
print("number of phenotypes with significant entry: {}".format(len(sig_1_df.index)))
print("number of allelotypes with significant entry: {}".format(len(sig_1_df.columns)))
sig_1_df.to_csv("genotype_assoc_adj_p_sig_1.csv")

number of phenotypes with significant entry: 52
number of allelotypes with significant entry: 107


In [17]:
# Keep only the rows that contain at least one adjusted p-value below 0.05 ...
sig_2_df = adj_p_2_df.loc[(adj_p_2_df < 0.05).any(axis=1), :]
# ... and, among those rows, only the columns that contain such a value.
sig_2_df = sig_2_df.loc[:, (sig_2_df < 0.05).any(axis=0)]
print("number of phenotypes with significant entry: {}".format(len(sig_2_df.index)))
print("number of allelotypes with significant entry: {}".format(len(sig_2_df.columns)))
sig_2_df.to_csv("genotype_assoc_adj_p_sig_2.csv")

number of phenotypes with significant entry: 58
number of allelotypes with significant entry: 59
