In [1]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef, f1_score

from similarityregression import PairwiseAlignment as pwsaln
from similarityregression import PredictSimilarity as srpred

# Evaluate SR Models on new PBM data

In [122]:
# Score the held-out comparisons of every family with that family's model,
# classify truth and prediction into HSim / Amb / Dis, and collect one
# confusion matrix per family plus the pooled label lists.
CMs = {}            # family ID -> confusion matrix (rows: 'Class', columns: 'Pred')
Running_Truth = []  # 'Class' labels pooled over all families, in processing order
Running_Preds = []  # 'Pred' labels, parallel to Running_Truth

# When True, every comparison involving a construct listed in the family's
# CVTestIndicies_i0.txt file is dropped before scoring.
OmitTrainingConstructs = False

# Per-family score tables are written at the end of each iteration; make sure
# the target directory exists so the first to_csv call cannot fail.
if not os.path.isdir('HeldoutScores'):
    os.makedirs('HeldoutScores')

# sorted(): glob returns paths in filesystem order; sorting keeps the pooled
# lists and the progress log reproducible between runs.
for loc_heldout_Y in sorted(glob.glob('DNA/ByFamily/*/TrainingData/Heldout.Y_Sims_PctID.csv.gz')):
    # The family ID is the directory two levels above the file:
    # DNA/ByFamily/<FID>/TrainingData/<file>
    FID = os.path.basename(os.path.dirname(os.path.dirname(loc_heldout_Y)))
    #Check if model was built; otherwise fall back to the shared NO_THRESHOLD model
    loc_model = 'DNA/SRModels/' + FID + '.json'
    if os.path.isfile(loc_model):
        fSRModel = srpred.ReadSRModel(loc_model)
        # Single-argument print(): valid in Python 2 and 3, same text as before.
        print('Working on: %s' % FID)
    else:
        fSRModel = srpred.ReadSRModel('DNA/SRModels/NO_THRESHOLD.json')
        print('Working on: %s (NO_THRESHOLD)' % FID)

    heldout_Y = pd.read_csv(loc_heldout_Y, index_col=[0,1])
    if OmitTrainingConstructs:
        TrainingConstructs = set()
        with open(loc_heldout_Y.replace('Heldout.Y_Sims_PctID.csv.gz', 'CVTestIndicies_i0.txt')) as cvfile:
            for line in cvfile:
                # First tab-separated field of each line is the construct ID.
                TrainingConstructs.add(line.strip().split('\t')[0])
        #Get rid of comparisons w/ TrainingConstructs: keep a row only when
        #neither member of its two-level index pair is a training construct.
        keeprows = [len(TrainingConstructs.intersection(pair)) == 0 for pair in heldout_Y.index]
        heldout_Y = heldout_Y.loc[keeprows,].copy()

    if fSRModel['Model.Class'] == 'SimilarityRegression':
        heldout_X = pd.read_csv('DNA/ByFamily/' + FID + '/TrainingData/Heldout.X_' + fSRModel['SR.Features']  + '.csv.gz', index_col=[0,1])
        # Align the feature rows with the (possibly filtered) comparisons.
        heldout_X = heldout_X.loc[heldout_Y.index,]
        #Score w/ python
        # Model parameters do not change per row, so read them once.
        feature_mean = fSRModel['SR.FeatureScales.mean']
        feature_sd = fSRModel['SR.FeatureScales.sd']
        intercept = fSRModel['SR.Intercept']
        weights = fSRModel['SR.Weights']
        use_logistic = fSRModel['SR.LogisticTransform'] == True
        Scores = []
        for _, x in heldout_X.iterrows():
            f = (x - feature_mean)/feature_sd
            # NaN scaled features are zeroed so they add nothing to the dot product.
            f[np.isnan(f)] = 0
            score = intercept + np.dot(weights, f)
            if use_logistic:
                score = srpred.logistic(score)
            Scores.append(score)
        heldout_Y['SRScores'] = Scores
    else:
        # Non-SR models score with an existing column named by the model.
        heldout_Y['SRScores'] = heldout_Y[fSRModel['Model.Name']]

    #For some of the NO_THRESHOLD families EClass is missing; drop those rows.
    # .copy() so the columns added below land on an independent frame instead
    # of a slice of the unfiltered one (avoids SettingWithCopyWarning).
    heldout_Y = heldout_Y[heldout_Y['EClass'].notnull()].copy()
    if heldout_Y.shape[0] > 0:
        #Evaluate Predictions
        # Truth: default 'Amb'; EClass == 1 -> 'HSim'; EScoreOverlap < 0.2 -> 'Dis'.
        # The 'Dis' rule is applied last, so it wins where both rules match.
        heldout_Y['Class'] = 'Amb'
        heldout_Y.loc[heldout_Y['EClass'] == 1,'Class'] = 'HSim'
        heldout_Y.loc[heldout_Y['EScoreOverlap'] < 0.2,'Class'] = 'Dis'

        # Prediction: same three classes, from the model's two score thresholds.
        heldout_Y['Pred'] = 'Amb'
        heldout_Y.loc[heldout_Y['SRScores'] > fSRModel['Threshold.HSim'] ,'Pred'] = 'HSim'
        heldout_Y.loc[heldout_Y['SRScores'] < fSRModel['Threshold.Dis'] ,'Pred'] = 'Dis'

        CM = confusion_matrix(heldout_Y['Class'], heldout_Y['Pred'], labels=["HSim", "Amb", "Dis"])
        CMs[FID] = CM
        Running_Truth += list(heldout_Y['Class'])
        Running_Preds += list(heldout_Y['Pred'])

        heldout_Y['SR.Name'] = fSRModel['Model.Class']
        heldout_Y.to_csv('HeldoutScores/' + FID + '.csv')

Working on: F002_1.97d (NO_THRESHOLD)
Working on: F009_1.97d
Working on: F026_1.97d
Working on: F039_1.97d
Working on: F082_1.97d
Working on: F088_1.97d (NO_THRESHOLD)
Working on: F091_1.97d
Working on: F135_1.97d
Working on: F140_1.97d (NO_THRESHOLD)
Working on: F141_1.97d (NO_THRESHOLD)
Working on: F158_1.97d (NO_THRESHOLD)
Working on: F169_1.97d (NO_THRESHOLD)
Working on: F170_1.97d
Working on: F173_1.97d
Working on: F174_1.97d
Working on: F196_1.97d
Working on: F201_1.97d
Working on: F223_1.97d
Working on: F231_1.97d
Working on: F238_1.97d
Working on: F251_1.97d
Working on: F266_1.97d
Working on: F273_1.97d
Working on: F278_1.97d
Working on: F281_1.97d (NO_THRESHOLD)
Working on: F282_1.97d
Working on: F291_1.97d (NO_THRESHOLD)
Working on: F293_1.97d (NO_THRESHOLD)
Working on: F296_1.97d (NO_THRESHOLD)
Working on: F301_1.97d
Working on: F310_1.97d (NO_THRESHOLD)
Working on: F314_1.97d
Working on: F315_1.97d
Working on: F317_1.97d (NO_THRESHOLD)
Working on: F323_1.97d
Working on: F32

In [5]:
# Pool the per-family confusion matrices and report per-class and overall
# performance of the held-out predictions.
if not CMs:
    # Without this guard an empty CMs would surface as a NameError on HPreds.
    raise ValueError('CMs is empty: run the held-out scoring cell before summarising.')

# Element-wise total over all families; np.sum builds a new array, so the
# matrices stored in CMs are left untouched.
HPreds = np.sum(list(CMs.values()), axis=0)

print(HPreds)
print('PERFORMANCE BY CLASS')
# Row/column k of HPreds belongs to the k-th label below.
# Precision divides the diagonal cell by its column total, recall by its row total.
for k, label in enumerate(['HSim', 'Amb', 'Dissimilar']):
    precision = 1.0*HPreds[k,k]/HPreds[:,k].sum()
    recall = 1.0*HPreds[k,k]/HPreds[k,:].sum()
    # %-11s pads the label to the fixed width used by the original report lines.
    print('%-11s| Precision: %s Recall: %s' % (label, precision, recall))
print('COMMON METRICS')
print('Matthews Correlation Coefficient (MCC): %s' % matthews_corrcoef(Running_Truth, Running_Preds))
print('F1: %s' % f1_score(Running_Truth, Running_Preds, average = 'micro'))

[[  898  1338   140]
 [  122  5480  1042]
 [   94 20553 24546]]
PERFORMANCE BY CLASS
HSim       | Precision: 0.806104129264 Recall: 0.377946127946
Amb        | Precision: 0.200211903109 Recall: 0.824804334738
Dissimilar | Precision: 0.954057835821 Recall: 0.543137211515
COMMON METRICS
Matthews Correlation Coefficient (MCC): 0.289570033574
F1: 0.570416689724


In [18]:
# Attach readable class names to both axes of the pooled confusion matrix
# (the same three labels, in the same order, for rows and columns) and save it.
class_labels = ['Highly Similar', 'Ambiguous', 'Dissimilar']
CM_DF = pd.DataFrame(HPreds, index=class_labels, columns=class_labels)
CM_DF.to_csv('ConfusionMatrix_HeldoutPreds.csv')

# Evaluate heldout TFs in-training NNs

In [123]:
# For every family score table, pick for each held-out construct the
# comparison with the highest SRScores value and stack those rows over all
# families into heldout_NN.
family_NNs = []  # one frame of selected rows per family, concatenated once at the end

# Only the .csv tables are read (stray files in the directory are ignored), and
# the paths are sorted so the row order of heldout_NN is reproducible.
for loc_heldout in sorted(glob.glob('HeldoutScores/*.csv')):
    # Strip only the trailing extension; family IDs themselves contain dots.
    FID = os.path.basename(loc_heldout)[:-len('.csv')]
    heldout_Y = pd.read_csv(loc_heldout)
    AllConstructs = set(heldout_Y['MID_x']).union(heldout_Y['MID_y'])

    TrainingConstructs = set()
    with open('DNA/ByFamily/%s/TrainingData/CVTestIndicies_i0.txt'%FID) as cvfile:
        for line in cvfile:
            # First tab-separated field of each line is the construct ID.
            TrainingConstructs.add(line.strip().split('\t')[0])
    HeldoutConstructs = AllConstructs.difference(TrainingConstructs)

    # Row labels of the selected comparisons; a set because one comparison can
    # be the top hit of both of its members.
    NN_rows = set()
    for HeldoutConstruct in HeldoutConstructs:
        involves = (heldout_Y['MID_x'] == HeldoutConstruct) | (heldout_Y['MID_y'] == HeldoutConstruct)
        # Stable sort: among tied top scores the row that comes first in the
        # file is chosen, instead of an arbitrary one.
        ranked = heldout_Y[involves].sort_values('SRScores', ascending=False, kind='mergesort')
        NN_rows.add(ranked.index[0])

    NNs = heldout_Y.loc[sorted(NN_rows)].copy()
    NNs['Family_ID'] = FID
    family_NNs.append(NNs)

if not family_NNs:
    # Fail here with a clear message rather than later with an undefined heldout_NN.
    raise ValueError('No score tables found in HeldoutScores/: run the held-out scoring cell first.')

# Concatenate once instead of growing the frame inside the loop.
heldout_NN = pd.concat(family_NNs)

In [141]:
# Annotate the selected held-out rows with family names and export them,
# together with the HSim / Dis thresholds of the model used for each family.
FIGURE_DIR = '../Figures/FigureSX_HeldoutNNEscore'
# Both CSVs below are written here; create the directory if it is missing.
if not os.path.isdir(FIGURE_DIR):
    os.makedirs(FIGURE_DIR)

fams = pd.read_csv('../CisBP/DNA/DBFiles/tf_families.tab', index_col=0, delimiter='\t')
family_names = fams['Family_Name']
# .get() yields None for a family ID that is absent from the table.
heldout_NN['Family_Name'] = [family_names.get(x) for x in heldout_NN['Family_ID']]
heldout_NN.to_csv(FIGURE_DIR + '/HeldoutNNs.csv', index = False)

#Parse Out Thresholds
threshold_rows = []
# sorted(): iterating a bare set gives an arbitrary order, which would make the
# row order of ModelThresholds.csv differ between runs.
for FID in sorted(set(heldout_NN['Family_ID'])):
    loc_model = 'DNA/SRModels/' + FID + '.json'
    if os.path.isfile(loc_model):
        fSRModel = srpred.ReadSRModel(loc_model)
        # Single-argument print(): valid in Python 2 and 3, same text as before.
        print('Working on: %s' % FID)
        o = [fSRModel['Family_ID'], fSRModel['Family_Name'], fSRModel['Threshold.HSim'],  fSRModel['Threshold.Dis']]
    else:
        # No family-specific model: thresholds come from the shared NO_THRESHOLD
        # model, while ID and name come from the loop variable and the family table.
        fSRModel = srpred.ReadSRModel('DNA/SRModels/NO_THRESHOLD.json')
        print('Working on: %s (NO_THRESHOLD)' % FID)
        o = [FID, fams.loc[FID, 'Family_Name'], fSRModel['Threshold.HSim'],  fSRModel['Threshold.Dis']]
    threshold_rows.append(o)
Thresh = pd.DataFrame(threshold_rows, columns = ['FID', 'Family_Name', 'HSim', 'Dis'])
Thresh.to_csv(FIGURE_DIR + '/ModelThresholds.csv', index = False)

Working on: F170_1.97d
Working on: F091_1.97d
Working on: F315_1.97d
Working on: F223_1.97d
Working on: F141_1.97d (NO_THRESHOLD)
Working on: F082_1.97d
Working on: F039_1.97d
Working on: F158_1.97d (NO_THRESHOLD)
Working on: F273_1.97d
Working on: F278_1.97d
Working on: F026_1.97d
Working on: F238_1.97d
Working on: F196_1.97d
Working on: F231_1.97d
Working on: F251_1.97d
Working on: F310_1.97d (NO_THRESHOLD)
Working on: F169_1.97d (NO_THRESHOLD)
Working on: F281_1.97d (NO_THRESHOLD)
Working on: F009_1.97d
Working on: F296_1.97d (NO_THRESHOLD)
Working on: F323_1.97d
Working on: F266_1.97d
Working on: F282_1.97d
Working on: F174_1.97d
Working on: F293_1.97d (NO_THRESHOLD)
Working on: F201_1.97d
Working on: F173_1.97d
Working on: F301_1.97d
Working on: F314_1.97d
Working on: F324_1.97d
Working on: F135_1.97d
