In [12]:
import numpy as np
import os
import pandas as pd
import rpy2.robjects as robjects
import time

In [2]:
outpath = "output/interactions/"

# Get phenotypes with at least two significant associatons

In [3]:
# get list of significant HC phenotypes
f = open("output/compare_cutoff_pvals/sig_phe_HC.txt","r")
sig_phes = f.readline().split()
f.close()

# get list of significant cancer phenotypes
f = open("output/compare_cutoff_pvals/sig_phe_cancer.txt","r")
sig_phes += f.readline().split()
f.close()

len(sig_phes)

48

# Find which analyses have finished

In [34]:
fin_phes = []
for file in os.listdir("../scripts/output/interact/"):
    if file.endswith(".txt"):
        fin_phes.append(file.split("_")[0])
        #print(os.path.join("/mydir", file))
len(fin_phes)

47

# Find missing

In [35]:
set(sig_phes) - set(fin_phes)

{'HC303'}

# Get interaction p values

In [36]:
inter_df = pd.DataFrame(index = fin_phes, columns = ["DRB1_404-DQB1_302","DRB1_404-DQA1_301"])

In [37]:
t0 = time.time()
count = 1
for file in os.listdir("../scripts/output/interact/"):
    if file.endswith(".txt"):
        phe = file.split("_")[0]
        f = open("../scripts/output/interact/" + file)
        print(count)
        count += 1
        print(file)
        print("time: {}\n".format(time.time() - t0))
        for line in f.readlines():
            if line[0] == "$":
                inter = line[2:-2]
                p_line = False
                p_ind = 0
            elif "Pr(>|z|)" in line:
                line = line.replace("Std. Error", "stderr")
                line = line.replace("z value", "zvalue")
                p_ind = line.split().index("Pr(>|z|)")
                p_line = True
            elif line.startswith("as.numeric(gcounts1):as.numeric(gcounts2)"):
                if p_line:
                    p_val = line.split()[p_ind + 1]
                    if p_val == "<2e-16" or p_val == "<":
                        p_val = 2e-16
                    else:
                        p_val = float(p_val)
                    #got_p = True
                    p_line = False
                    inter2 = inter.split("-")[1] + "-" + inter.split("-")[0]
                    if inter in inter_df.columns.values:
                        inter_df[inter][phe] = p_val
                    elif inter2 in inter_df.columns.values:
                        inter_df[inter2][phe] = p_val
                    else:
                        inter_df[inter] = np.nan
                        inter_df[inter][phe] = p_val

        f.close()
print("time: {}".format(time.time() - t0))

1
HC150_rounded_inter.txt
time: 0.00484013557434

2
HC310_rounded_inter.txt
time: 0.0581440925598

3
HC432_rounded_inter.txt
time: 0.170589923859



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


4
HC91_rounded_inter.txt
time: 0.210999965668

5
HC321_rounded_inter.txt
time: 3.93786597252



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


6
HC78_rounded_inter.txt
time: 4.62984490395

7
HC382_rounded_inter.txt
time: 7.16750311852

8
HC422_rounded_inter.txt
time: 35.2265000343

9
1041_rounded_inter.txt
time: 41.9052789211

10
HC22_rounded_inter.txt
time: 43.2125449181

11
HC169_rounded_inter.txt
time: 43.2498350143

12
HC28_rounded_inter.txt
time: 43.2986199856

13
HC38_rounded_inter.txt
time: 43.572206974

14
HC269_rounded_inter.txt
time: 78.5729091167

15
HC179_rounded_inter.txt
time: 79.6602110863

16
HC55_rounded_inter.txt
time: 80.1530580521

17
HC219_rounded_inter.txt
time: 91.4957969189

18
1047_rounded_inter.txt
time: 163.14230895

19
1060_rounded_inter.txt
time: 163.410588026

20
HC322_rounded_inter.txt
time: 165.300184965

21
HC273_rounded_inter.txt
time: 165.345788002

22
1003_rounded_inter.txt
time: 165.66305995

23
HC170_rounded_inter.txt
time: 167.541171074

24
HC243_rounded_inter.txt
time: 171.671494007

25
HC96_rounded_inter.txt
time: 172.0597651

26
1053_rounded_inter.txt
time: 173.651909113

27
HC295_rou

In [38]:
inter_df.columns.values

array(['DRB1_404-DQB1_302', 'DRB1_404-DQA1_301', 'DQA1_501-DQB1_201', ...,
       'A_301-B_2705', 'A_301-C_102', 'A_301-DRB1_1303'], dtype=object)

In [39]:
inter_df.to_csv(outpath + "interact_pvals.csv")
inter_df;

# Adjust pvalues

In [40]:
thresh = 0.05
s = inter_df.as_matrix().shape
flat_df = inter_df.as_matrix().flatten()
total = flat_df.shape[0]
nom_p = robjects.FloatVector(flat_df)
rpadjust = robjects.r['p.adjust']
adj_p = np.array(rpadjust(nom_p, "BY"))
adj_p = np.reshape(adj_p,s)
# find the number of adjusted values below the threshold
below_thresh = (adj_p < thresh).sum()
adj_p_df = pd.DataFrame(adj_p,index=inter_df.index.values, columns=inter_df.columns.values)
print("number below threshold: {}".format(below_thresh))
adj_p_df.to_csv(outpath + "interact_pvals_adj.csv")
adj_p_df;

number below threshold: 191


  # Remove the CWD from sys.path while we load stuff.


In [41]:
sig_df = adj_p_df.loc[(adj_p_df < thresh).sum(axis=1) > 0, :]
sig_df = sig_df[sig_df.columns[(sig_df< thresh).any()]]
print("number of phenotypes with significant entry: {}".format(sig_df.shape[0]))
print("number of allelotypes with significant entry: {}".format(sig_df.shape[1]))
sig_df.to_csv(outpath + "interact_pvals_adj_sig.csv")
sig_df;

number of phenotypes with significant entry: 14
number of allelotypes with significant entry: 159


In [42]:
sig_df[sig_df < thresh].count().sort_values(ascending = False).to_csv(outpath + "allele_sig_counts.csv")
T_sig_df = sig_df.transpose()
T_sig_df[T_sig_df < thresh].count().sort_values(ascending = False).to_csv(outpath + "phe_sig_counts.csv")