In [1]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef, f1_score

from similarityregression import PairwiseAlignment as pwsaln
from similarityregression import PredictSimilarity as srpred

# Test: Python scoring reproduces the R predictions

In [2]:
#AP2
# Sanity check for a single family: re-score the training pairs with the
# Python implementation and correlate the result with the predictions stored
# in Predictions_FinalModel.csv (kept in the 'R.<model>' column).
CurrentFamily = 'F009_1.97d'
# Every path is derived from CurrentFamily so that changing the family above
# switches the model AND its data together.
FamilyDir = 'DNA/ByFamily/' + CurrentFamily + '/'
fSRModel = srpred.ReadSRModel('DNA/SRModels/' + CurrentFamily + '.json')
ModelName = fSRModel['Model.Name']

# The feature matrix only exists for SimilarityRegression models; fail with a
# clear message instead of a NameError on X further down.
if fSRModel['Model.Class'] != 'SimilarityRegression':
    raise ValueError('Model for %s has class %r; this check needs a SimilarityRegression model'
                     % (CurrentFamily, fSRModel['Model.Class']))

#Read in original predictions
Y = pd.read_csv(FamilyDir + 'TrainingData/Y_Sims_PctID.csv.gz', index_col=[0,1])
X = pd.read_csv(FamilyDir + 'TrainingData/X_' + fSRModel['SR.Features'] + '.csv.gz', index_col=[0,1])
OriginalPreds = pd.read_csv(FamilyDir + 'Models/Predictions_FinalModel.csv')
RCol = 'R.' + ModelName
NPCol = 'NP.' + ModelName
# Positional assignment: assumes the predictions file lists rows in the same
# order as Y -- TODO confirm.
Y[RCol] = list(OriginalPreds[ModelName])

#Score w/ python
# Model parameters do not change between rows, so look them up once.
FeatureMeans = fSRModel['SR.FeatureScales.mean']
FeatureSDs = fSRModel['SR.FeatureScales.sd']
Intercept = fSRModel['SR.Intercept']
Weights = fSRModel['SR.Weights']
UseLogistic = fSRModel['SR.LogisticTransform'] == True
for i, x in X.iterrows():
    # Standardise the features, then apply the linear model.
    f = (x - FeatureMeans)/FeatureSDs
    score = Intercept + np.dot(Weights, f)
    if UseLogistic:
        score = srpred.logistic(score)
    # Written by index label, so X and Y need not share a row order.
    Y.loc[i, NPCol] = score

# Select the two prediction columns by name rather than by position.
cc = np.corrcoef(Y[NPCol], Y[RCol])
print('CorCoef (R & Python): %s' % cc[0,1])

CorCoef (R & Python): 1.0


# Evaluate SR Models

In [9]:
# Evaluate every family that has a built model on its held-out pairs.
# Results:
#   CMs            - FID -> 3x3 confusion matrix (labels: HSim, Amb, Dis)
#   Running_Truth  - true class of every evaluated pair, all families pooled
#   Running_Preds  - predicted class of every evaluated pair, same order
CMs = {}
Running_Truth = []
Running_Preds = []

OmitTrainingConstructs = False

# glob returns paths in arbitrary order; sort for a reproducible run order.
for loc_heldout_Y in sorted(glob.glob('DNA/ByFamily/*/TrainingData/Heldout.Y_Sims_PctID.csv.gz')):
    # The family ID is the directory two levels above the file:
    # .../<FID>/TrainingData/Heldout.Y_Sims_PctID.csv.gz
    FID = os.path.basename(os.path.dirname(os.path.dirname(loc_heldout_Y)))
    print(FID)
    #Check if model was built
    loc_model = 'DNA/SRModels/' + FID + '.json'
    if not os.path.isfile(loc_model):
        continue
    fSRModel = srpred.ReadSRModel(loc_model)
    heldout_Y = pd.read_csv(loc_heldout_Y, index_col=[0,1])

    if OmitTrainingConstructs:
        # First tab-separated field of each line is taken as a construct ID.
        # NOTE(review): the IDs are read from 'CVTestIndicies_i0.txt' but are
        # treated as *training* constructs -- confirm this is the right file.
        TrainingConstructs = set()
        with open(loc_heldout_Y.replace('Heldout.Y_Sims_PctID.csv.gz', 'CVTestIndicies_i0.txt')) as cvfile:
            for line in cvfile:
                TrainingConstructs.add(line.strip().split('\t')[0])
        #Get rid of comparisons w/ TrainingConstructs
        keeprows = [len(TrainingConstructs.intersection(pair)) == 0 for pair in heldout_Y.index]
        # .copy() so the column assignments below act on an independent frame
        # (avoids SettingWithCopyWarning on the filtered slice).
        heldout_Y = heldout_Y.loc[keeprows,].copy()

    # Nothing to evaluate for this family; skip before reading/scoring X.
    if heldout_Y.shape[0] == 0:
        continue

    if fSRModel['Model.Class'] == 'SimilarityRegression':
        heldout_X = pd.read_csv('DNA/ByFamily/' + FID + '/TrainingData/Heldout.X_' + fSRModel['SR.Features']  + '.csv.gz', index_col=[0,1])
        # Put the feature rows in the same order as heldout_Y so the scores
        # can be assigned positionally.
        heldout_X = heldout_X.loc[heldout_Y.index,]
        #Score w/ python
        # Model parameters do not change between rows, so look them up once.
        FeatureMeans = fSRModel['SR.FeatureScales.mean']
        FeatureSDs = fSRModel['SR.FeatureScales.sd']
        Intercept = fSRModel['SR.Intercept']
        Weights = fSRModel['SR.Weights']
        UseLogistic = fSRModel['SR.LogisticTransform'] == True
        Scores = []
        for i, x in heldout_X.iterrows():
            f = (x - FeatureMeans)/FeatureSDs
            score = Intercept + np.dot(Weights, f)
            if UseLogistic:
                score = srpred.logistic(score)
            Scores.append(score)
        heldout_Y['SRScores'] = Scores
    else:
        # Other model classes are scored directly from the column of
        # heldout_Y named after the model.
        heldout_Y['SRScores'] = heldout_Y[fSRModel['Model.Name']]

    #Evaluate Predictions
    # True class. Assignment order matters: a row matching both conditions
    # ends up as 'Dis' because that assignment runs last.
    heldout_Y['Class'] = 'Amb'
    heldout_Y.loc[heldout_Y['EClass'] == 1,'Class'] = 'HSim'
    heldout_Y.loc[heldout_Y['EScoreOverlap'] < 0.2,'Class'] = 'Dis'

    # Predicted class from the model's two score thresholds (same ordering
    # rule: 'Dis' wins if both conditions hold).
    heldout_Y['Pred'] = 'Amb'
    heldout_Y.loc[heldout_Y['SRScores'] > fSRModel['Threshold.HSim'] ,'Pred'] = 'HSim'
    heldout_Y.loc[heldout_Y['SRScores'] < fSRModel['Threshold.Dis'] ,'Pred'] = 'Dis'

    # Rows are true classes, columns are predicted classes.
    CMs[FID] = confusion_matrix(heldout_Y['Class'], heldout_Y['Pred'], labels=["HSim", "Amb", "Dis"])
    Running_Truth += list(heldout_Y['Class'])
    Running_Preds += list(heldout_Y['Pred'])

F002_1.97d
F009_1.97d
F026_1.97d
F039_1.97d
F082_1.97d
F088_1.97d
F091_1.97d
F135_1.97d
F140_1.97d
F141_1.97d
F158_1.97d
F169_1.97d
F170_1.97d
F173_1.97d
F174_1.97d
F196_1.97d
F201_1.97d
F223_1.97d
F231_1.97d
F238_1.97d
F251_1.97d
F266_1.97d
F273_1.97d
F278_1.97d
F281_1.97d
F282_1.97d
F291_1.97d
F293_1.97d
F296_1.97d
F301_1.97d
F310_1.97d
F314_1.97d
F315_1.97d
F317_1.97d
F323_1.97d
F324_1.97d


In [10]:
# Pool the per-family confusion matrices and report overall performance.
# HPreds is rebuilt from CMs on every run; an empty CMs is reported explicitly
# instead of raising NameError or silently reusing a stale HPreds.
if len(CMs) == 0:
    raise ValueError('CMs is empty: no family was evaluated, nothing to summarise')
HPreds = np.sum(list(CMs.values()), axis=0)

print(HPreds)
print('PERFORMANCE BY CLASS')
# Row/column k of HPreds corresponds to the k-th label below
# (rows: true class, columns: predicted class).
for k, label in enumerate(['HSim', 'Amb', 'Dissimilar']):
    n_correct = 1.0*HPreds[k,k]
    n_predicted = HPreds[:,k].sum()   # everything predicted as this class
    n_actual = HPreds[k,:].sum()      # everything truly in this class
    # A class that was never predicted / never present has no defined score.
    precision = n_correct/n_predicted if n_predicted > 0 else float('nan')
    recall = n_correct/n_actual if n_actual > 0 else float('nan')
    print('%-11s| Precision: %s Recall: %s' % (label, precision, recall))
print('COMMON METRICS')
print('Matthews Correlation Coefficient (MCC): %s' % matthews_corrcoef(Running_Truth, Running_Preds))
print('F1: %s' % f1_score(Running_Truth, Running_Preds, average = 'micro'))

[[  726  1512   122]
 [   79  5670   912]
 [   75 22307 22764]]
PERFORMANCE BY CLASS
HSim       | Precision: 0.825 Recall: 0.307627118644
Amb        | Precision: 0.192275085625 Recall: 0.851223540009
Dissimilar | Precision: 0.95655097067 Recall: 0.504230718115
COMMON METRICS
Matthews Correlation Coefficient (MCC): 0.272437500282
F1: 0.538335148707
