In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, sys, h5py
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import tensorflow as tf
import scipy

import sys
sys.path.append('../../..')
import mutagenesisfunctions as mf
import bpdev as bd
import helper
from deepomics import neuralnetwork as nn
from deepomics import utils, fit, visualize, saliency

from Bio import AlignIO
import time as time
import pandas as pd
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable across runs.
np.random.seed(42)

In [None]:
def _mark_pairs(rows, seqlen):
    """Return a (seqlen, seqlen) float array holding 1. at every pair listed in `rows`.

    Each pair is read positionally from columns 1 and 2 of `rows`, one row at
    a time, exactly as the original per-row loops did.

    NOTE(review): the scatter plots in vis_contacts read columns 1 and 0 of the
    same frames, and `topEC` selects the position columns by name. Confirm
    which columns of the EC file really hold the two positions (and whether
    they are 0- or 1-based) before trusting the statistics.
    """
    mask = np.zeros((seqlen, seqlen))
    for ii in range(len(rows)):
        mask[rows.iloc[ii, 1], rows.iloc[ii, 2]] = 1.
    return mask


def _contact_stats(real_pos, real_neg, pred_pos, pred_neg):
    """Confusion-matrix summary of predicted vs. reference contact masks.

    All four arguments are same-shaped arrays that are multiplied element-wise
    and summed. Returns the tuple (PPV, FDR, TPR, TP, FP, TN, FN). A ratio with
    a zero denominator is reported as nan (the value NumPy's 0/0 produced
    before, minus the RuntimeWarning).
    """
    TP = np.sum(real_pos*pred_pos)
    FP = np.sum(real_neg*pred_pos)
    TN = np.sum(real_neg*pred_neg)
    FN = np.sum(real_pos*pred_neg)

    n_called = TP + FP
    n_real = TP + FN
    PPV = TP/n_called if n_called != 0 else np.nan
    FDR = FP/n_called if n_called != 0 else np.nan
    TPR = TP/n_real if n_real != 0 else np.nan

    return (PPV, FDR, TPR, TP, FP, TN, FN)


def vis_contacts(fam, ec_annot_file=None):
    """Compare SoM contact scores with the EC annotation of one family and save the figures.

    Parameters
    ----------
    fam : str
        Family identifier. Used to build the input path
        'Arrays/<fam>_mlp_<numhidden>.npy' and all output figure paths.
    ec_annot_file : str, optional
        Path of the whitespace-delimited EC annotation table. Defaults to
        '<fam>.EC.interaction.txt'. (The path used to be hard-coded to the
        RF00002 file, so every family was compared against RF00002's contacts.)

    Notes
    -----
    Reads the notebook-level global `numhidden`, which must be defined before
    the call. Writes four PNG files:
    '<fam>/Images_mlp/<fam>_contacts_vis.png', 'Contacts_vis/<fam>_contacts_vis.png',
    '<fam>/Images_mlp/<fam>_stats.png' and 'Stats_graphs/<fam>_stats.png'.
    Returns None.
    """

    ######### EXTRACT DATA ###########

    #extract SoM results after an APC correction
    arrayspath = 'Arrays/%s_mlp_%s.npy'%(fam,numhidden)
    hol_mut2 = np.load(arrayspath)
    seqlen,_, dims,_ = hol_mut2.shape
    C = bd.get_wc(arrayspath, seqlen, dims, bpugSQ=0, denoise='APC')

    #Load in the EC annotation for this family
    if ec_annot_file is None:
        ec_annot_file = '%s.EC.interaction.txt'%(fam)
    # raw string: '\s' in a plain literal is an invalid escape sequence
    EC_df = pd.read_csv(ec_annot_file, delimiter=r'\s+')

    ######### QUALITATIVE CONTACTS ###########

    #Get the top L/2 SoM scores (upper triangle only)
    # NOTE(review): here plot_wcrank is given seqlen//2 for "top L/2", while
    # sumstats below passes thresh*2 for "top thresh" -- confirm which is intended.
    bp_shade = bd.plot_wcrank(C, seqlen, seqlen//2, cmap='viridis')
    bp_shade[np.tril_indices(seqlen)] = 0.
    SoM_top = np.where(bp_shade > 0.)
    #Top L/2 EC scores. iloc is end-exclusive; the former .loc[:seqlen//2]
    #label slice was end-inclusive and returned one row too many.
    #assumes the table is already ordered by EC score -- TODO confirm
    topEC = EC_df[['Rfam_reduced_position1', 'Rfam_reduced_position2']].iloc[:seqlen//2]
    #All cWW annotated interactions
    cWW_int = EC_df[EC_df['interactions'] == 'cWW']
    #All PDB contacts <= 8 angstroms
    close_cont = EC_df[EC_df['minimum_atom_distance'] <= 8.0]

    #plot: one panel per reference set, each overlaid on the SoM picks
    # NOTE(review): cWW_int and close_cont keep every EC_df column, so
    # iloc[:, 1] / iloc[:, 0] are the frame's 2nd / 1st columns here, whereas
    # for topEC they are position2 / position1. Kept as in the original.
    panels = [
        (topEC, 'r', 'EC', 'SoM vs. top L/2 ECs'),
        (cWW_int, 'b', 'cWW', 'SoM vs. all cWW contacts'),
        (close_cont, 'g', 'PDB contacts', 'SoM vs. contacts < 8A'),
    ]
    fig = plt.figure(figsize=(24,7))
    for idx, (ref, color, label, title) in enumerate(panels):
        # Draw through the Axes object; the old `ax = plt.scatter(...)` replaced
        # the Axes handle with the scatter artist, so titles could not be set.
        ax = fig.add_subplot(1,3,idx+1)
        ax.scatter(SoM_top[0], SoM_top[1], c='y', label='SoM')
        ax.scatter(ref.iloc[:,1], ref.iloc[:,0], c=color, label=label)
        ax.set_title(title)
        ax.axis('equal')
        ax.legend()
    fig.savefig('%s/Images_mlp/%s_contacts_vis.png'%(fam,fam))
    fig.savefig('Contacts_vis/%s_contacts_vis.png'%(fam))

    ######### QUANTITATIVE CONTACTS ###########

    #Get the real positive and negative pdb contacts
    real_pos = _mark_pairs(close_cont, seqlen)
    real_neg = 1. - real_pos

    def sumstats(thresh):
        """Stats tuple for the SoM prediction at rank cutoff `thresh`."""
        #Get the top thresh SoM scores
        som_pos = bd.plot_wcrank(C, seqlen, thresh*2)
        som_pos[np.tril_indices(seqlen)] = 0.
        #Everything not called positive: nonzero -> True, invert, back to 0/1
        som_neg = (~som_pos.astype('bool'))*1
        return _contact_stats(real_pos, real_neg, som_pos, som_neg)

    def sumstats_EC(thresh):
        """Stats tuple for the first `thresh` rows of the EC table."""
        EC_pos = _mark_pairs(EC_df.iloc[:thresh, :], seqlen)
        EC_neg = 1. - EC_pos
        return _contact_stats(real_pos, real_neg, EC_pos, EC_neg)

    extent = seqlen

    EC_ppv = []
    EC_tpr = []
    SoM_ppv = []
    SoM_tpr = []
    for tr in range(extent):
        ecst = sumstats_EC(tr)
        somst = sumstats(tr)

        EC_ppv.append(ecst[0])
        EC_tpr.append(ecst[2])

        SoM_ppv.append(somst[0])
        SoM_tpr.append(somst[2])

    fig = plt.figure(figsize=(12,5))
    curves = [
        (SoM_ppv, EC_ppv, 'Positive Predictive Value'),
        (SoM_tpr, EC_tpr, 'True Positive Rate'),
    ]
    for idx, (som_curve, ec_curve, ylabel) in enumerate(curves):
        ax = fig.add_subplot(1,2,idx+1)
        ax.plot(range(extent), som_curve, label='SoM')
        ax.plot(range(extent), ec_curve, label='EC')
        ax.set_xlabel('Number of contacts predicted')
        ax.set_ylabel(ylabel)
        ax.legend()
    fig.savefig('%s/Images_mlp/%s_stats.png'%(fam,fam))
    fig.savefig('Stats_graphs/%s_stats.png'%(fam))

In [None]:
# Run configuration shared by the cells below.
# `numhidden` is substituted into the 'Arrays/<fam>_mlp_<numhidden>.npy'
# file name that vis_contacts loads.
numhidden = 512

# Family identifiers to process, in run order.
families = """
    RF00002 RF00005 RF00010 RF00017 RF00023
    RF00050 RF00059 RF00162 RF00167 RF00169
    RF00174 RF00234 RF00380 RF00504 RF01734
    RF01786 RF01831 RF01852 RF01960 RF02001
""".split()

In [None]:
# Run the contact comparison once per family, in list order. Each call loads
# its inputs from disk and saves its figures to disk; nothing is returned.
for fam in families:
    vis_contacts(fam)