In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, sys, h5py, time
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import tensorflow as tf
import scipy

import sys
sys.path.append('../../..')
import mutagenesisfunctions as mf
import bpdev as bd
import helper
from deepomics import neuralnetwork as nn
from deepomics import utils, fit, visualize, saliency

import contacts

from Bio import AlignIO
import time as time
import pandas as pd
np.random.seed(42)

In [30]:
def vis_contacts(fam, comp='atom_dist', min_dist=8.0, num_hidden=None):
    """Plot and score predicted contacts for one family against its annotation.

    Two figures are written to disk (nothing is returned):

    * a 2x2 scatter figure overlaying the top-ranked SoM pairs with (1) the
      first L/2 rows of the EC table, (2) all rows annotated 'cWW', (3) all
      rows whose 'minimum_atom_distance' is <= ``min_dist`` and (4) the
      top-ranked G-test pairs;
    * a 1x2 figure of positive predictive value and true positive rate versus
      the number of predicted contacts for SoM, EC and G-test (MI).

    Parameters
    ----------
    fam : str
        Family identifier; used to build every input and output path.
    comp : {'atom_dist', 'cWW_int'}
        Which rows of the EC table count as true contacts in the second
        figure: rows within ``min_dist`` or rows annotated 'cWW'.
    min_dist : float
        Distance cutoff applied to the 'minimum_atom_distance' column.
    num_hidden : int, optional
        Value inserted in 'Arrays/<fam>_mlp_<num_hidden>.npy'. When omitted
        the notebook-level variable ``numhidden`` is used, so that variable
        must already be defined in the kernel.

    Raises
    ------
    ValueError
        If ``comp`` is not one of the two supported modes. This is checked
        before any file is read or written.

    Notes
    -----
    Pair positions are taken from columns 1 and 2 of the EC table by position
    and used directly as array indices (assumed to be the 0-based
    'Rfam_reduced_position1/2' columns - TODO confirm against the file).
    """
    valid_comps = ('atom_dist', 'cWW_int')
    if comp not in valid_comps:
        raise ValueError("comp must be one of %s, got %r" % (valid_comps, comp))
    if num_hidden is None:
        # Backward compatible default: fall back on the notebook global.
        num_hidden = numhidden

    ######### EXTRACT DATA ###########

    # SoM results, turned into a pair-score matrix with APC denoising
    arrayspath = 'Arrays/%s_mlp_%s.npy'%(fam,num_hidden)
    hol_mut2 = np.load(arrayspath)
    seqlen,_, dims,_ = hol_mut2.shape
    C = bd.get_wc(arrayspath, seqlen, dims, bpugSQ=0, denoise='APC')

    # EC annotation table (whitespace separated); raw string avoids the
    # invalid '\s' escape warning
    ECannotfile = '%s/%s.EC.interaction.txt'%(fam,fam)
    EC_df = pd.read_csv(ECannotfile, delimiter=r'\s+')

    ######### QUALITATIVE CONTACTS ###########

    half = seqlen//2

    # Top-ranked SoM pairs, upper triangle only
    bp_shade = bd.plot_wcrank(C, seqlen, half)
    bp_shade[np.tril_indices(seqlen)] = 0.
    SoM_top = np.where(bp_shade > 0.)
    # First L/2 rows of the EC table. iloc is used because a .loc label slice
    # includes its end point and would select L/2 + 1 rows.
    topEC = EC_df.iloc[:half][['Rfam_reduced_position1', 'Rfam_reduced_position2']]
    # All cWW annotated interactions
    cWW_int = EC_df[EC_df['interactions'] == 'cWW']
    # All annotated pairs within min_dist
    close_cont = EC_df[EC_df['minimum_atom_distance'] <= min_dist]

    # Precomputed G-test matrix for this family
    filename = 'Arrays/marks_gtests.hdf5'
    with h5py.File(filename, 'r') as dataset:
        gtest = np.array(dataset['%s_gtest'%(fam)])
    # Top-ranked G-test pairs, upper triangle only
    g_shade = bd.plot_wcrank(gtest, seqlen, half)
    g_shade[np.tril_indices(seqlen)] = 0.
    gtest_top = np.where(g_shade > 0.)

    # One panel per comparison: (x, y, colour, legend label, title).
    # The overlay is drawn transposed relative to the SoM points.
    overlays = [
        (topEC.iloc[:,1], topEC.iloc[:,0], 'r', 'EC',
         'SoM vs. top L/2 ECs'),
        (cWW_int.iloc[:,2], cWW_int.iloc[:,1], 'm', 'cWW',
         'SoM vs. all cWW contacts'),
        (close_cont.iloc[:,2], close_cont.iloc[:,1], 'k', 'PDB contacts',
         'SoM vs. contacts < %sA'%(min_dist)),
        (gtest_top[1], gtest_top[0], 'g', 'MI with APC',
         'SoM vs. top L/2 MI scores'),
    ]
    fig_vis = plt.figure(figsize=(15,15))
    for panel, (xs, ys, colour, label, title) in enumerate(overlays, start=1):
        ax = fig_vis.add_subplot(2,2,panel)
        ax.scatter(SoM_top[0], SoM_top[1], c='b', label = 'SoM')
        ax.scatter(xs, ys, c=colour, label = label)
        ax.set_title(title)
        ax.legend(loc='center right')
    # As before, equal scaling is applied to the last panel only.
    ax.axis('equal')
    fig_vis.savefig('%s/Images_mlp/%s_contacts_vis_%sA.png'%(fam,fam,min_dist))
    fig_vis.savefig('Contacts_vis/%s_contacts_vis_%sA.png'%(fam,min_dist))

    ######### QUANTITATIVE GRAPHS ###########

    def _pair_mask(table):
        """Return a (seqlen, seqlen) 0/1 array marking the pairs listed in table."""
        mask = np.zeros((seqlen,seqlen))
        mask[table.iloc[:, 1].values, table.iloc[:, 2].values] = 1.
        return mask

    def _ranked_mask(scores, n_top):
        """Return the plot_wcrank selection for n_top pairs, upper triangle only."""
        # NOTE(review): the count is doubled here but not in the qualitative
        # section above; presumably plot_wcrank ranks both triangles of a
        # symmetric matrix - confirm against bpdev.
        selected = bd.plot_wcrank(scores, seqlen, n_top*2)
        selected[np.tril_indices(seqlen)] = 0.
        return selected

    def _summary(pred_pos):
        """Return (PPV, FDR, TPR, TP, FP, TN, FN) for one prediction matrix."""
        # Everything that is not a predicted positive is a predicted negative.
        pred_neg = (~pred_pos.astype('bool'))*1

        TP = np.sum(real_pos*pred_pos)
        FP = np.sum(real_neg*pred_pos)
        TN = np.sum(real_neg*pred_neg)
        FN = np.sum(real_pos*pred_neg)

        # With no predictions (or no true contacts) the ratios are 0/0; keep
        # the resulting NaN, which the line plots skip, but without the
        # RuntimeWarning.
        with np.errstate(divide='ignore', invalid='ignore'):
            PPV = TP/(TP+FP)
            FDR = FP/(TP+FP)
            TPR = TP/(TP+FN)

        return (PPV, FDR, TPR, TP, FP, TN, FN)

    # Ground truth for the selected comparison mode
    truth_table = close_cont if comp == 'atom_dist' else cWW_int
    real_pos = _pair_mask(truth_table)
    real_neg = 1. - real_pos
    extent = seqlen

    EC_ppv = []
    EC_tpr = []
    SoM_ppv = []
    SoM_tpr = []
    g_ppv = []
    g_tpr = []
    for tr in range(extent):
        ecst = _summary(_pair_mask(EC_df.iloc[:tr]))
        somst = _summary(_ranked_mask(C, tr))
        gst = _summary(_ranked_mask(gtest, tr))

        EC_ppv.append(ecst[0])
        EC_tpr.append(ecst[2])

        SoM_ppv.append(somst[0])
        SoM_tpr.append(somst[2])

        g_ppv.append(gst[0])
        g_tpr.append(gst[2])

    if comp == 'atom_dist':
        stats_title = 'Contacts within %sA predicted'%(min_dist)
        quanttitle = '%sA_contacts'%(min_dist)
    else:
        stats_title = 'cWW interactions predicted'
        quanttitle = 'cWW_int'

    fig_stats = plt.figure(figsize=(12,5))
    curves = [
        (1, 'Positive Predictive Value', SoM_ppv, EC_ppv, g_ppv),
        (2, 'True Positive Rate', SoM_tpr, EC_tpr, g_tpr),
    ]
    for panel, ylabel, som_vals, ec_vals, g_vals in curves:
        ax = fig_stats.add_subplot(1,2,panel)
        ax.plot(range(extent), som_vals, '-b', label='SoM')
        ax.plot(range(extent), ec_vals, '-r', label='EC')
        ax.plot(range(extent), g_vals, '-g', label='MI')
        ax.set_xlabel('Number of contacts predicted')
        ax.set_ylabel(ylabel)
        ax.set_title(stats_title)
        ax.legend()

    fig_stats.savefig('%s/Images_mlp/%s_%s_stats.png'%(fam,fam,quanttitle))
    fig_stats.savefig('Stats_graphs/%s_%s_stats.png'%(fam, quanttitle))
    # Release both figures; this function is called in a loop over families.
    plt.close('all')
    
    ######### QUANTITATIVE CONTACTS ###########

In [5]:
# Notebook configuration: the families to process and the hidden-layer
# size that appears in the saved SoM array filenames.
numhidden = 512

# 'RF01960' is currently excluded from the list.
families = [
    'RF00002', 'RF00005', 'RF00010', 'RF00017',
    'RF00023', 'RF00050', 'RF00059', 'RF00162',
    'RF00167', 'RF00169', 'RF00174', 'RF00234',
    'RF00380', 'RF00504', 'RF01734', 'RF01786',
    'RF01831', 'RF01852', 'RF02001',
]

In [32]:
# Generate both figure sets (distance-based and cWW-based ground truth)
# for every family, reporting the wall-clock time spent per family.
for fam in families:
    starttime = time.time()
    print(fam)
    for mode in ('atom_dist', 'cWW_int'):
        vis_contacts(fam, min_dist=5.0, comp=mode)
    elapsed = time.time() - starttime
    print('Graphs made:', mf.sectotime(elapsed))

RF00002




Graphs made: 15.03s
RF00005
Graphs made: 4.71s
RF00010
Graphs made: 1min 29.040000000000006s
RF00017
Graphs made: 59.1s
RF00023
Graphs made: 1min 32.370000000000005s
RF00050
Graphs made: 11.55s
RF00059
Graphs made: 7.61s
RF00162
Graphs made: 7.94s
RF00167
Graphs made: 7.39s
RF00169
Graphs made: 6.88s
RF00174
Graphs made: 21.64s
RF00234
Graphs made: 15.94s
RF00380
Graphs made: 17.82s
RF00504
Graphs made: 6.08s
RF01734
Graphs made: 4.0s
RF01786
Graphs made: 5.61s
RF01831
Graphs made: 7.0s
RF01852
Graphs made: 5.89s
RF02001
Graphs made: 18.36s


In [50]:
# Precompute the APC-corrected G-test matrix for every family and cache them
# in one HDF5 file, so they can be read back later without recomputation.
# NOTE(review): vis_contacts reads 'Arrays/marks_gtests.hdf5', but this cell
# sits below the cell that calls vis_contacts; on a fresh checkout it has to
# be run first.
gtestsave = 'Arrays/marks_gtests.hdf5'
# Mode 'w' creates the file, truncating any existing one, so an interrupted
# run leaves only the families processed so far.
with h5py.File(gtestsave, 'w') as f:
    for fam in families:
        starttime = time.time()
        filename = '../../data_marks/%s/%s_red.hdf5'%(fam, fam)
        with h5py.File(filename, 'r') as dataset:
            X_data = np.array(dataset['X_data'])
        numdata = X_data.shape[0]
        # Keep the first half of the entries along axis 0, index 0 on axis 2
        # and the first four entries of the last axis (presumably the four
        # nucleotide channels - TODO confirm against the data layout).
        X_data = X_data[:numdata//2, :, 0, :4]
        gtest = contacts.g_test(X_data)
        # Replace undefined scores with zero before the APC correction.
        gtest[np.isnan(gtest)] = 0.
        gtest = contacts.apc_correction(gtest)
        # One gzip-compressed float32 dataset per family, named '<fam>_gtest'.
        f.create_dataset('%s_gtest'%(fam), data=gtest.astype(np.float32), compression='gzip')
        print (fam)
        print ('Gtest and saving:', mf.sectotime(time.time()-starttime))

RF00002
Gtest and saving: 13min 12.07000000000005s
RF00005
Gtest and saving: 2min 4.829999999999998s
RF00010
Gtest and saving: 50.03s
RF00017
Gtest and saving: 2min 35.83000000000001s
RF00023
Gtest and saving: 52.39s
RF00050
Gtest and saving: 5.21s
RF00059
Gtest and saving: 8.24s
RF00162
Gtest and saving: 3.66s
RF00167
Gtest and saving: 1.86s
RF00169
Gtest and saving: 3.42s
RF00174
Gtest and saving: 21.87s
RF00234
Gtest and saving: 1.68s
RF00380
Gtest and saving: 3.14s
RF00504
Gtest and saving: 3.51s
RF01734
Gtest and saving: 0.39s
RF01786
Gtest and saving: 0.24s
RF01831
Gtest and saving: 0.52s
RF01852
Gtest and saving: 1.06s
RF02001
Gtest and saving: 4.86s


In [None]:
# NOTE(review): unexecuted scratch cell. EC_df, seqlen, C and SoM_top are only
# assigned as locals inside vis_contacts in the visible notebook, so on a
# fresh kernel this cell raises NameError unless they are defined elsewhere.
# Collects the 'score' column for the leading rows of the EC table; the .loc
# label slice includes its end label.
ec_col = EC_df.loc[:seqlen//2, ['score']]
# Looks up the C value at each (row, column) pair listed in SoM_top.
som_col = [C[ii,jj] for ii,jj in zip(SoM_top[0],SoM_top[1])]