In [None]:
"""
This iterates over all .graph files in a directory, and, for each
graph file, iterates over all instances in the file. For each instance,
the script writes to file a row of text containing the following info:

the row begins with "[filename] [graph index]"
and is followed by all the flow values for that graph (separated by spaces).
"""

from gtruth_counter import get_flows_dict

froots = ['zebra', 'mouse', 'human', 'salmon']
datadirs = ['zebrafish', 'mouse', 'human', 'salmon']

for idx in [0,1,2,3]:
    froot = froots[idx]
    print("Working on {}".format(froot))
    truth_dir = '/home/kyle/data/rnaseq/' + datadirs[idx] + '/'
    outputfilename = './data/edge-flows-' + froot + '.txt'
    
    get_flows_dict(truth_dir, outputfilename)
    print("\t done")

In [1]:
# Collect path-weights from the master files, write to file

from pathweight_scrape import toboggan_output_weights_parser as get_pweights

froots = ['zebra', 'mouse', 'human', 'salmon']
datadirs = ['zebrafish', 'mouse', 'human', 'salmon']

for idx in [3]:
    froot = froots[idx]
    print("Working on {}".format(froot))
    toboggan_output_dir = './data/'
    inputfilename =  froot + '-master-file.txt'
    
    outputfilename = './data/path-weights-' + froot + '.txt'
    
    toboggan_path_weights = get_pweights(toboggan_output_dir + inputfilename)
    
    with open(outputfilename, 'w') as outputfile:
        for key, weightset in toboggan_path_weights.items():
            line = key
            for weight in weightset:
                line = line + ' ' + str(weight)
            outputfile.write(line + ' \n')
    print("\t done")

Working on salmon
	 done


In [1]:
# Load from file the pathweights and flowvalues for each solution,
# compile results

froots = ['zebra', 'mouse', 'human', 'salmon']
datadirs = ['zebrafish', 'mouse', 'human', 'salmon']

dataset_info = {}
dataset_totals = {}

for idx in [0,1,2,3]:
    froot = froots[idx]
    print("Working on {}".format(froot))
    edgefilename = './data/edge-flows-' + froot + '.txt'
    pathfilename = './data/path-weights-' + froot + '.txt'
    
    # build dictionary for edge flows
    edgeflows = {}
    with open(edgefilename, 'r') as inputf:
        for line in inputf:
            parts = line.strip().split()
            gkey = parts[0] + ' ' + parts[1]
            flowset = []
            for flowval in parts[2:]:
                flowset.append(int(flowval))
            edgeflows[gkey] = set(flowset)

    # build array of results
    results = []
    totals_flow_vals = 0
    totals_pweights_overlap = 0
    with open(pathfilename, 'r') as inputf:
        for line in inputf:
            parts = line.strip().split()
            gkey = parts[0] + ' ' + parts[1]
            
            flowset = edgeflows[gkey]
            num_flow_vals = len(flowset)
            
            num_pweights_overlap = 0
            for weightval in parts[2:]:
                if int(weightval) in flowset:
                    num_pweights_overlap += 1
            results.append( num_pweights_overlap/len(parts[2:]) )
            totals_flow_vals += len(parts[2:])
            totals_pweights_overlap += num_pweights_overlap

    dataset_totals[froot] = [totals_pweights_overlap, totals_flow_vals]
    dataset_info[froot] = results
    print("\t done")

Working on zebra
	 done
Working on mouse
	 done
Working on human
	 done
Working on salmon
	 done


In [2]:
# Print in latex-table format

for froot in froots:
    print("{:<10}   &  {:3.3f}\\% \\\\".format(froot,
                                    sum(dataset_info[froot])/len(dataset_info[froot]))
         )
print( "\midrule")
print( "total   & {:3.3f}\\% \\\\".format( sum( sum(dataset_info[froot]) for froot in froots) / sum( len(dataset_info[froot]) for froot in froots)) )

zebra        &  0.996\% \\
mouse        &  0.996\% \\
human        &  0.997\% \\
salmon       &  0.998\% \\
\midrule
total   & 0.997\% \\
