In [1]:
import os
import glob
import json
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, matthews_corrcoef,fbeta_score, f1_score

# --- Configuration -----------------------------------------------------------
# Directory prefix that every data path below is built from.
WorkingOn = 'DNA'

# Name of the model that the other models are compared against.
BaselineMethod = 'PctID_L'

# Models whose name contains 'Smooth' are discarded unless AllowSmooth is True
# or the family ID appears in AllowSmooth_Exemptions.
AllowSmooth = False
AllowSmooth_Exemptions = {'F091_1.97d'}

# Every per-family PRThresholdData.csv found under <WorkingOn>/ByFamily.
loc_ModelPerformances = glob.glob(WorkingOn + '/ByFamily/*/Models/PRThresholdData.csv')

# Family table, indexed by its first column. Row 1 of the file is skipped and
# the "Cutoff" column is renamed to "CisBP_Cutoff".
tf_families = pd.read_csv(
    '../CisBP/' + WorkingOn + '/DBFiles/tf_families.tab',
    sep='\t',
    skiprows=[1],
    index_col=0
).rename(index=str, columns={"Cutoff": "CisBP_Cutoff"})

# Annotate every family that has model output with the size of its training
# table: the number of rows (pairs) and the number of distinct IDs appearing in
# the MID_x / MID_y columns.
for loc_ModelPerformance in loc_ModelPerformances:
    # Paths look like <WorkingOn>/ByFamily/<FID>/Models/..., so component 2 is the family ID.
    FID = loc_ModelPerformance.split('/')[2]
    loc_YSim = loc_ModelPerformance.replace('Models/PRThresholdData.csv', 'TrainingData/Y_Sims_PctID.csv.gz')
    try:
        YSims = pd.read_csv(loc_YSim, compression='gzip')
        num_pairs = YSims.shape[0]
        num_experiments = len(set(YSims['MID_x']) | set(YSims['MID_y']))
        tf_families.loc[FID, 'NumberOfPairs'] = num_pairs
        tf_families.loc[FID, 'Experiments'] = num_experiments
    except Exception as err:
        # Still best-effort (a family without a readable training table is
        # skipped), but the skip is now reported, and KeyboardInterrupt /
        # SystemExit are no longer swallowed as they were by the bare `except:`.
        print('Skipping training-set counts for %s: %s' % (FID, err))
        continue

# Parse/Calculate SR performance metrics 
Metrics: precision, recall, MCC, and F1 (micro/macro)

In [2]:
MulticlassPerformances = []
ConfusionMats = {}

# Row/column order of every confusion matrix built below.
_CLASS_LABELS = ["HS", "Amb", "Dis"]


def _score_three_class(scored, thresh_class, thresh_tn):
    """Label `scored` with the two thresholds and compute the multiclass metrics.

    scored       -- DataFrame with 'Score' and 'True' columns; a 'Predicted'
                    column is added to it in place.
    thresh_class -- scores strictly above this are predicted 'HS'; None
                    disables the rule.
    thresh_tn    -- scores strictly below this are predicted 'Dis' (applied
                    after 'HS', so it wins on overlap); None disables the rule.

    Returns (confusion matrix, row-normalised confusion matrix, MCC,
    macro F1, micro F1).
    """
    scored['Predicted'] = 'Amb'
    # The thresholds must be checked explicitly: how a numeric column compares
    # with None depends on the Python/NumPy/pandas versions in use.
    if thresh_class is not None:
        scored.loc[scored['Score'] > thresh_class, 'Predicted'] = 'HS'
    if thresh_tn is not None:
        scored.loc[scored['Score'] < thresh_tn, 'Predicted'] = 'Dis'

    cm = confusion_matrix(scored['True'], scored['Predicted'], labels=_CLASS_LABELS)
    # Rows are true classes; a class with no true members yields a 0/0 = NaN row.
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    mcc = matthews_corrcoef(scored['True'], scored['Predicted'])
    f1_macro = fbeta_score(scored['True'], scored['Predicted'], average='macro', beta=1)
    f1_micro = fbeta_score(scored['True'], scored['Predicted'], average='micro', beta=1)
    return cm, cm_normalized, mcc, f1_macro, f1_micro


for loc_ModelPerformance in loc_ModelPerformances:
    # Paths look like <WorkingOn>/ByFamily/<FID>/Models/..., so component 2 is the family ID.
    FID = loc_ModelPerformance.split('/')[2]
    ConfusionMats[FID] = {}
    print(FID)

    #Preds/Labels (Training)
    Preds = pd.read_csv(WorkingOn + '/ByFamily/' + FID + '/Models/Predictions_TestSet.csv')
    Y = pd.read_csv(WorkingOn + '/ByFamily/' + FID + '/TrainingData/Y_Sims_PctID.csv.gz')
    Y['TN'] = (Y['EScoreOverlap'] >= 0.2).astype(int)

    #Preds/Labels (Final)
    FinalPreds = pd.read_csv(WorkingOn + '/ByFamily/' + FID + '/Models/Predictions_FinalModel.csv')

    #Assign True Labels: default 'Amb', 'HS' where the class column is 1, then
    #'Dis' where TN is 0 ('Dis' is applied last, so it wins if both hold).
    Preds['True'] = 'Amb'
    Preds.loc[Preds['EClass'] == 1, 'True'] = 'HS'
    Preds.loc[Preds['TN'] == 0, 'True'] = 'Dis'
    Y['True'] = 'Amb'
    Y.loc[Y['EClass'] == 1, 'True'] = 'HS'
    Y.loc[Y['TN'] == 0, 'True'] = 'Dis'
    FinalPreds['True'] = 'Amb'
    FinalPreds.loc[FinalPreds['Class'] == 1, 'True'] = 'HS'
    FinalPreds.loc[FinalPreds['TN'] == 0, 'True'] = 'Dis'

    #Positives
    Performances = pd.read_csv(loc_ModelPerformance)
    Performances = Performances[Performances['Model'] != 'PctID_S']
    if (not AllowSmooth) and (FID not in AllowSmooth_Exemptions):
        Performances = Performances.loc[['Smooth' not in x for x in Performances['Model']],]

    #Negatives
    NPerformances = pd.read_csv(loc_ModelPerformance.replace('PRThresholdData', 'NPVThresholdData'))

    for method in set(Performances['Model']):
        #Get Positive Info: first row (in file order) for this model with
        #test precision >= 0.75 and a defined threshold.
        Performances_HSim = Performances[(Performances['Model'] == method) &
                                         (Performances['Precision_TEST'] >= 0.75) &
                                         (Performances['Threshold'].notnull())]
        if Performances_HSim.shape[0] >= 1:
            Performances_HSim_Selected = Performances_HSim.iloc[0]
            thresh_class = Performances_HSim_Selected['Threshold']
        else:
            thresh_class = None

        #Get Negative Info: first row for this model with NPV target >= 0.95
        #whose threshold lies below the highly-similar threshold.
        thresh_tn = None
        if thresh_class is not None:
            # Without an HS threshold there is nothing to be "below", so no
            # dissimilar threshold is selected; this is spelled out rather than
            # left to the result of comparing a column with None.
            NPerformances_Dis = NPerformances[(NPerformances['Model'] == method) &
                                              (NPerformances['NPV_Target'] >= 0.95) &
                                              (NPerformances['Threshold'] < thresh_class)]
            if NPerformances_Dis.shape[0] >= 1:
                NPerformances_Dis_Selected = NPerformances_Dis.iloc[0]
                thresh_tn = NPerformances_Dis_Selected['Threshold']

        #print FID, method, thresh_class, thresh_tn

        if 'PctID_' in method:
            # These scores are read from the training table and the same
            # values are used for both the test and the final evaluation.
            cpreds = Y[[method, 'True']].copy()
            cpreds.columns = ['Score', 'True']
            fpreds = cpreds.copy()
        else:
            cpreds = Preds[[method, 'True']].copy()
            cpreds.columns = ['Score', 'True']
            fpreds = FinalPreds[[method, 'True']].copy()
            fpreds.columns = ['Score', 'True']

        #Assign Predicted Labels (LOCO-CV Micro-Performance)
        cm, cm_normalized, MCC, F1_macro, F1_micro = _score_three_class(cpreds, thresh_class, thresh_tn)

        #Assign Predicted Labels (Final Model Performance (all data))
        #Uses the same None guards as the test-set evaluation above.
        (final_cm, final_cm_normalized, final_MCC,
         final_F1_macro, final_F1_micro) = _score_three_class(fpreds, thresh_class, thresh_tn)

        #print FID, method, MCC, final_MCC, F1_macro, final_F1_macro, F1_micro, final_F1_micro
        performancestats = [FID, method, MCC, final_MCC, F1_macro, final_F1_macro, F1_micro, final_F1_micro]
        if thresh_class is not None:
            performancestats += [Performances_HSim_Selected['Recall_TEST'], Performances_HSim_Selected['Recall_FINAL']]
        else:
            performancestats += [np.nan, np.nan]

        if thresh_tn is not None:
            performancestats += [NPerformances_Dis_Selected['NegativeRecall_TEST'], NPerformances_Dis_Selected['NegativeRecall_FINAL']]
        else:
            performancestats += [np.nan, np.nan]

        performancestats += [thresh_class, thresh_tn]
        MulticlassPerformances.append(performancestats)
        ConfusionMats[FID][method] = {'Raw.Test': cm, 'Raw.Final':final_cm, 'Norm.Test':cm_normalized, 'Norm.Final':final_cm_normalized}

# Turn the collected rows into a table; the column order mirrors the order in
# which each row's values were appended.
performance_columns = [
    'FID', 'Model',
    'MCC_TEST', 'MCC_FINAL',
    'F1macro_TEST', 'F1macro_FINAL',
    'F1micro_TEST', 'F1micro_FINAL',
    'Recall_TEST', 'Recall_FINAL',
    'NegativeRecall_TEST', 'NegativeRecall_FINAL',
    'Thresh_HSim', 'Thresh_Dis',
]
MulticlassPerformances = pd.DataFrame(MulticlassPerformances, columns=performance_columns)
MulticlassPerformances = MulticlassPerformances.sort_values(['FID', 'MCC_TEST'], ascending=[True, False])
family_names = tf_families['Family_Name']
MulticlassPerformances['Family_Name'] = [family_names.get(fid) for fid in MulticlassPerformances['FID']]

# Model selection. Rows are already ordered by descending MCC_TEST within each
# family, so the rank is simply the row position inside the group, and the 'SR'
# model is the first non-baseline row that has a highly-similar threshold.
is_baseline = MulticlassPerformances['Model'] == BaselineMethod
MulticlassPerformances.loc[is_baseline, 'ModelSelection'] = 'BaselineMethod'
for FID, data in MulticlassPerformances.groupby('FID'):
    MulticlassPerformances.loc[data.index, 'Model.Rank'] = list(range(1, data.shape[0] + 1))
    not_baseline = data['Model'] != BaselineMethod
    has_hsim_threshold = data['Thresh_HSim'].notnull()
    data = data[not_baseline & has_hsim_threshold]
    if data.shape[0] > 0:
        MulticlassPerformances.loc[data.index[0], 'ModelSelection'] = 'SR'

MulticlassPerformances.to_csv('MulticlassPerformances_All.csv', index=False)

F009_1.97d
F026_1.97d
F028_1.97d


  'precision', 'predicted', average, warn_for)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


F039_1.97d
F082_1.97d
F091_1.97d
F135_1.97d
F169_1.97d
F170_1.97d
F173_1.97d
F174_1.97d


  'recall', 'true', average, warn_for)


F180_1.97d
F196_1.97d
F201_1.97d
F212_1.97d
F223_1.97d
F231_1.97d
F238_1.97d
F243_1.97d
F251_1.97d
F266_1.97d
F273_1.97d
F278_1.97d
F282_1.97d
F294_1.97d
F299_1.97d
F301_1.97d
F305_1.97d
F312_1.97d
F314_1.97d
F315_1.97d
F323_1.97d
F324_1.97d


# Parse Best SR Models -> Scoring Files
### Use %ID when SR is worse

In [3]:
# Create the output directory for the per-family scoring files if it is missing.
srmodels_dir = WorkingOn + '/SRModels'
if not os.path.isdir(srmodels_dir):
    os.mkdir(srmodels_dir)

# Manual overrides: family ID -> model name to use instead of the top-ranked model.
Overides_ModelSelection = {'F170_1.97d' : 'AvgB62.Logistic'}

In [6]:
# Write one <WorkingOn>/SRModels/<FID>.json per family describing the model to
# score with: thresholds, and for SR models the coefficients, feature scaling
# and the baseline thresholds.
for FID, family_rows in MulticlassPerformances.groupby('FID'):
    # Only models with a highly-similar threshold are candidates.
    data = family_rows[family_rows['Thresh_HSim'].notnull()]

    #Find the best model: the manual override if one is registered, otherwise
    #the first remaining row (the table is expected to be sorted by descending
    #MCC_TEST within each family when it is built).
    SelectedModel = None
    dSM = None
    if data.shape[0] > 0:
        if FID in Overides_ModelSelection:
            SelectedModel = Overides_ModelSelection[FID]
            override_rows = data[data['Model'] == SelectedModel]
            if override_rows.shape[0] == 0:
                # Previously surfaced as an opaque IndexError from .iloc[0].
                raise ValueError('Override model %r for family %s has no row with a Thresh_HSim value'
                                 % (SelectedModel, FID))
            dSM = override_rows.iloc[0]
        else:
            dSM = data.iloc[0]
            SelectedModel = dSM['Model']

    if SelectedModel is None:
        # No usable model for this family: nothing is written.
        continue

    tf_families.loc[FID, 'Model'] = SelectedModel

    #Container for the selected SR/Alignment model (keys are sorted on output)
    ScoringModel = {}
    ScoringModel['Family_ID'] = FID
    ScoringModel['Family_Name'] = dSM['Family_Name']
    ScoringModel['Model.Name'] = SelectedModel
    ScoringModel['Threshold.HSim'] = dSM['Thresh_HSim']
    ScoringModel['Threshold.Dis'] = dSM['Thresh_Dis']

    if SelectedModel == BaselineMethod:
        ScoringModel['Model.Class'] = 'SequenceIdentity'
    else:
        ScoringModel['Model.Class'] = 'SimilarityRegression'

        #Add the baseline method's thresholds into the scoring file. The row is
        #looked up in the unfiltered family rows: searching the Thresh_HSim-
        #filtered frame raised IndexError whenever the baseline itself had no
        #highly-similar threshold. Missing thresholds are written as NaN, as
        #already happens for Threshold.Dis.
        baseline_rows = family_rows[family_rows['ModelSelection'] == 'BaselineMethod']
        if baseline_rows.shape[0] > 0:
            BaselineModel = baseline_rows.iloc[0]
            ScoringModel['Baseline'] = {}
            ScoringModel['Baseline']['Name'] = BaselineMethod
            ScoringModel['Baseline']['Class'] = 'SequenceIdentity'
            ScoringModel['Baseline']['Threshold.HSim'] = BaselineModel['Thresh_HSim']
            ScoringModel['Baseline']['Threshold.Dis'] = BaselineModel['Thresh_Dis']
        else:
            print('No baseline row for %s; scoring file written without a Baseline entry' % FID)

        #Model names have the form <Features>.<Response>
        model_parts = SelectedModel.split('.')
        ScoringModel['SR.Features'] = model_parts[0].replace(' (Smooth3)', '_Smooth3') #For Heldout files
        ScoringModel['SR.LogisticTransform'] = (model_parts[1] == 'Logistic')

        #Read weights: first entry of the model's column is the intercept, the
        #rest are the weights. .iloc makes the positional access explicit
        #(plain [0] on a label-indexed Series is deprecated in recent pandas).
        SRCoefs = pd.read_csv(WorkingOn + '/ByFamily/' + FID + '/Models/ModelCoefficents.csv', index_col=0)
        SRCoefs = SRCoefs[SelectedModel]
        ScoringModel['SR.Intercept'] = SRCoefs.iloc[0]
        ScoringModel['SR.Weights'] = list(SRCoefs.iloc[1:])

        #Read feature scaling (mean/sd)
        SRXscales = pd.read_csv(WorkingOn + '/ByFamily/' + FID + '/Models/XScales.csv', index_col=[1,2])
        SRXscales = SRXscales.iloc[:,1:] #Drop weird index column
        SRXscales = SRXscales.loc[SelectedModel.split('.')[0],]
        ScoringModel['SR.FeatureScales.mean'] = list(SRXscales.loc['mean',])
        ScoringModel['SR.FeatureScales.sd'] = list(SRXscales.loc['sd',])

    #Output
    with open(WorkingOn + '/SRModels/' + FID + '.json', 'w') as loc_FamilySRModel:
        loc_FamilySRModel.write(json.dumps(ScoringModel, indent = 4, sort_keys= True))



# Parse Model Improvements (Log2FC, % change, absolute change) to Long Form for ggplot2

In [None]:
# Long-form table: for every family that has both an 'SR' and a
# 'BaselineMethod' row, one line per performance metric plus one placeholder
# 'Experiments' line.
SelectedModels_LF = []
metric_columns = [
    ('Recall @ 75% Precision', 'Recall_FINAL'),
    ('Specificity @ 95% NPV', 'NegativeRecall_FINAL'),
    ('Matthews correlation coefficient (MCC)', 'MCC_FINAL'),
]
for FID, data in MulticlassPerformances.groupby('FID'):
    print('Parsing:' + ' ' + FID)
    selections = list(data['ModelSelection'])
    if ('SR' not in selections) or ('BaselineMethod' not in selections):
        continue

    SelectedModel = MulticlassPerformances.loc[data[data['ModelSelection'] == 'SR'].index[0]]
    BaselineModel = MulticlassPerformances.loc[data[data['ModelSelection'] == 'BaselineMethod'].index[0]]

    # Shared prefix of every row; the model name is split on '.' into the
    # 'Features' and 'Response' columns.
    rline = [FID, SelectedModel['Family_Name'], SelectedModel['Model']] + SelectedModel['Model'].split('.')
    for metric_label, metric_column in metric_columns:
        SelectedModels_LF.append(rline + [metric_label, SelectedModel[metric_column], BaselineModel[metric_column]])
    # Placeholder row; its change columns are filled with the experiment count below.
    SelectedModels_LF.append(rline + ['Experiments', np.nan, np.nan])

SelectedModels_LF = pd.DataFrame(
    SelectedModels_LF,
    columns=['Family_ID', 'Family_Name', 'SRModel', 'Features', 'Response', 'PerformanceMetric', 'SR', 'PctID']
)
sr_values = SelectedModels_LF['SR']
pctid_values = SelectedModels_LF['PctID']
SelectedModels_LF['Log2FC'] = np.log2(sr_values/pctid_values)
SelectedModels_LF['PercentChange'] = 100*(sr_values - pctid_values)/pctid_values
SelectedModels_LF['AbsoluteChange'] = sr_values - pctid_values

# On each family's 'Experiments' row, all three change columns carry the
# family's experiment count instead of a performance change.
is_experiments_row = SelectedModels_LF['PerformanceMetric'] == 'Experiments'
for FID in set(SelectedModels_LF['Family_ID']):
    numexperiments = tf_families['Experiments'].get(FID)
    family_experiments_row = (SelectedModels_LF['Family_ID'] == FID) & is_experiments_row
    for change_column in ['Log2FC', 'PercentChange', 'AbsoluteChange']:
        SelectedModels_LF.loc[family_experiments_row, change_column] = numexperiments

SelectedModels_LF.to_csv('SelectedModelImprovements_LF.csv', index=False)

# Compare to DNAproDB

In [None]:
import os

# Directory with the DNAproDB contact-frequency tables (files named <DBDs>.csv).
# Defaults to the original location; set DNAPRODB_CONTACT_FREQS to point
# somewhere else without editing the notebook.
loc_ContactFreqs = os.environ.get('DNAPRODB_CONTACT_FREQS',
                                  '/Users/slambert/Data/DNAproDB/PFamContactFrequencies/')

# Family name -> (SR model name, that model's coefficient column).
# SelectedModels_LF has several rows per family with the same model, so each
# family's coefficient file is read once instead of once per row.
Selections = {}
for i, d in SelectedModels_LF.drop_duplicates('Family_ID').iterrows():
    Coefs = pd.read_csv(WorkingOn + '/ByFamily/'+ d['Family_ID'] + '/Models/ModelCoefficents.csv', index_col=0)
    CoefsOfInterest = Coefs[d['SRModel']]
    Selections[d['Family_Name']] = (d['SRModel'], CoefsOfInterest)

contact_tables = []
for fname, info in Selections.items():
    # First 'DBDs' entry among the families with this name (.iloc: positional).
    DBDname = tf_families.loc[tf_families['Family_Name']==fname , 'DBDs'].iloc[0]
    loc_cfreqs = os.path.join(loc_ContactFreqs, DBDname + '.csv')
    if not os.path.isfile(loc_cfreqs):
        continue
    print('%s %s' % (fname, DBDname))
    coefs = info[1]
    cfreqs = pd.read_csv(loc_cfreqs, index_col=0)
    # Prefix the index with 'p' so it can be matched against coefficient labels.
    cfreqs.index = ['p' + str(x) for x in cfreqs.index]
    cfreqs['Coef'] = np.nan
    # Row-wise assignment is kept on purpose: a coefficient label that is
    # absent from the contact table is added as a new row.
    for p, val in zip(coefs.index, coefs):
        cfreqs.loc[p, 'Coef'] = val
    cfreqs['Family_Name'] = fname
    contact_tables.append(cfreqs.reset_index())

if contact_tables:
    # Single concat instead of growing the frame inside the loop.
    allfreqs = pd.concat(contact_tables)
    allfreqs.to_csv('JointModelContactFreqs.csv', index=False)
else:
    # Previously this case failed with a NameError on `allfreqs`.
    print('No contact-frequency files found under ' + loc_ContactFreqs + '; JointModelContactFreqs.csv not written')

# Plot Confusion matrices

In [None]:
% matplotlib inline
import matplotlib.pyplot as plt
import itertools


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- 2-D array; rows are true classes, columns predicted classes.
    classes   -- tick labels, in the same order as the rows/columns of `cm`.
    normalize -- if True, each row is divided by its sum before it is
                 printed and plotted.
    title     -- plot title.
    cmap      -- matplotlib colormap used for the heat map.

    Draws on the current pyplot figure and returns nothing.
    """
    if normalize:
        # A row that sums to zero becomes NaN (0/0).
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half the largest value get white text. nanmax ignores NaN
    # rows; with cm.max() a single NaN made the threshold NaN, so every
    # annotation was drawn in black, even on dark cells.
    thresh = np.nanmax(cm) / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Row-normalised view of the 'Raw.Final' confusion matrix for model
# AvgB62.Regression on family F135_1.97d.
plot_confusion_matrix(
    ConfusionMats['F135_1.97d']['AvgB62.Regression']['Raw.Final'],
    classes=['Highly Similar', 'Ambiguous', 'Dissimilar'],
    normalize=True
)

In [None]:
def _draw_confusion_panel(cm, panel_title, classes, normalize, cmap, label_y_axis):
    """Draw one annotated confusion-matrix heat map on the current pyplot axes."""
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(panel_title)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells above half the largest value get white text. nanmax ignores NaN
    # rows; with cm.max() a single NaN made the threshold NaN, so every
    # annotation was drawn in black, even on dark cells.
    thresh = np.nanmax(cm) / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    if label_y_axis:
        plt.ylabel('True label')
    plt.xlabel('Predicted label')


def plot_two_confusion_matricies(cm1, cm2, cmnames, classes,
                                normalize=False,
                                main_title='Confusion matrix',
                                cmap=plt.cm.Greys):
    """
    Plot two confusion matrices side by side on the current pyplot figure.

    cm1, cm2   -- 2-D arrays; rows are true classes, columns predicted classes.
    cmnames    -- two panel titles, for `cm1` and `cm2` respectively.
    classes    -- tick labels, in the same order as the rows/columns.
    normalize  -- if True, each row of both matrices is divided by its sum
                  (a row that sums to zero becomes NaN).
    main_title -- title placed above both panels.
    cmap       -- matplotlib colormap used for both heat maps.

    Only the left panel carries the 'True label' axis label. Returns nothing.
    """
    if normalize:
        cm1 = cm1.astype('float') / cm1.sum(axis=1)[:, np.newaxis]
        cm2 = cm2.astype('float') / cm2.sum(axis=1)[:, np.newaxis]

    plt.subplot(1, 2, 1)
    _draw_confusion_panel(cm1, cmnames[0], classes, normalize, cmap, label_y_axis=True)

    plt.subplot(1, 2, 2)
    _draw_confusion_panel(cm2, cmnames[1], classes, normalize, cmap, label_y_axis=False)

    plt.suptitle(main_title)

# Baseline vs. SR confusion matrices for family F135_1.97d ('Raw.Final'
# matrices, row-normalised by the plotting function).
PerformanceSet = 'Raw.Final'
method_Baseline = 'PctID_L'
method_SR = 'AvgB62.Regression'
classnames = ['Highly Similar', 'Ambiguous', 'Dissimilar']

CM_CurrentFam = ConfusionMats['F135_1.97d']
Baseline = CM_CurrentFam[method_Baseline][PerformanceSet]
SR = CM_CurrentFam[method_SR][PerformanceSet]

plot_two_confusion_matricies(
    cm1=Baseline,
    cm2=SR,
    cmnames=[method_Baseline, 'SR' + '(' + method_SR + ')'],
    classes=classnames,
    normalize=True,
    main_title='C2H2 ZF (F135_1.97d)'
)

In [None]:
# Baseline vs. SR confusion matrices for family F278_1.97d ('Raw.Final'
# matrices, row-normalised by the plotting function).
PerformanceSet = 'Raw.Final'
method_Baseline = 'PctID_L'
method_SR = 'AvgB62.Regression'
classnames = ['Highly Similar', 'Ambiguous', 'Dissimilar']

CM_CurrentFam = ConfusionMats['F278_1.97d']
Baseline = CM_CurrentFam[method_Baseline][PerformanceSet]
SR = CM_CurrentFam[method_SR][PerformanceSet]

plot_two_confusion_matricies(
    cm1=Baseline,
    cm2=SR,
    cmnames=[method_Baseline, 'SR' + '(' + method_SR + ')'],
    classes=classnames,
    normalize=True,
    main_title='NHR (F278_1.97d)'
)

In [None]:
# Baseline vs. SR confusion matrices for family F324_1.97d ('Raw.Final'
# matrices, row-normalised by the plotting function).
PerformanceSet = 'Raw.Final'
method_Baseline = 'PctID_L'
method_SR = 'AvgB62.Logistic'
classnames = ['Highly Similar', 'Ambiguous', 'Dissimilar']

CM_CurrentFam = ConfusionMats['F324_1.97d']
Baseline = CM_CurrentFam[method_Baseline][PerformanceSet]
SR = CM_CurrentFam[method_SR][PerformanceSet]

plot_two_confusion_matricies(
    cm1=Baseline,
    cm2=SR,
    cmnames=[method_Baseline, 'SR' + '(' + method_SR + ')'],
    classes=classnames,
    normalize=True,
    main_title='Zn Cluster (F324_1.97d)'
)

In [7]:
tf_families

Unnamed: 0_level_0,Family_Name,DBDs,DBD_Count,CisBP_Cutoff,NumberOfPairs,Experiments,Model
Family_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F001_1.97d,"ABF1,B3","BAF1_ABF1,B3",2,0.700,,,
F002_1.97d,ABF1,BAF1_ABF1,1,0.700,,,
F003_1.97d,"ABF1,C2H2 ZF","BAF1_ABF1,zf-C2H2",2,0.700,,,
F004_1.97d,"ABF1,Forkhead","BAF1_ABF1,Forkhead",2,0.700,,,
F005_1.97d,"ABF1,LOB","BAF1_ABF1,DUF260",2,0.700,,,
F006_1.97d,"ABF1,Myb/SANT","BAF1_ABF1,Myb_DNA-binding",2,0.700,,,
F007_1.97d,AFT,AFT,1,0.700,,,
F008_1.97d,"AFT,FAR1","AFT,FAR1",2,0.700,,,
F009_1.97d,AP2,AP2,1,0.700,4851.0,99.0,PctID.Regression
F010_1.97d,"AP2,B3","AP2,B3",2,0.700,,,
