In [None]:
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import sklearn.metrics.pairwise as sklp

In [17]:
def nearest_samples(cohort, sample, N=-1):
    """
    Rank every cohort sample by its Spearman correlation with `sample` and
    return the N most similar ones.

    Both frames are first restricted to the row labels they share and sorted
    by index so their rows line up. Each column is then rank transformed, so
    the correlation distance computed on the ranks is the Spearman distance.
    The reported value is `1 - distance`, i.e. the correlation itself.

    Parameters
    ----------
    cohort : pandas.DataFrame
        Features in rows (index = feature labels), one column per cohort sample.
    sample : pandas.DataFrame
        Same orientation as `cohort`. The ranking uses the first column only;
        any further columns appear as extra columns of the result.
    N : int or None
        Number of rows to return. None or any negative value (including the
        default -1) returns every cohort sample. Previously the default was
        passed straight to a slice, which dropped the last row.

    Returns
    -------
    pandas.DataFrame
        Indexed by cohort column name, column 0 holding the similarity to the
        first column of `sample`, sorted from most to least similar.

    Raises
    ------
    ValueError
        If `cohort` and `sample` have no row labels in common.
    """
    # Reduce to only common features
    print("Computing intersection")
    intersection = cohort.index.intersection(sample.index)
    if len(intersection) == 0:
        raise ValueError("cohort and sample share no features")
    # Sorting both by index aligns the rows position by position.
    # NOTE(review): this assumes row labels are unique in both frames - confirm.
    cohort = cohort[cohort.index.isin(intersection)].sort_index(axis=0)
    sample = sample[sample.index.isin(intersection)].sort_index(axis=0)

    # Column wise rank transform to turn correlation into spearman.
    # After the transpose each row is one sample, so ranking runs along axis 1.
    print("Transforming Rank")
    a = np.apply_along_axis(scipy.stats.rankdata, 1, cohort.values.T)
    b = np.apply_along_axis(scipy.stats.rankdata, 1, sample.values.T)

    # Compute spearman distances
    print("Computing distances")
    distances = sklp.pairwise_distances(X=a, Y=b, metric="correlation", n_jobs=1)

    # Convert distance to similarity and order from most to least similar
    rank = 1 - pd.DataFrame(distances, cohort.columns.values)
    ranked = rank.sort_values(by=0, ascending=False)

    # A negative or missing N means "no limit"; slicing with -1 would
    # silently discard the final row.
    if N is None or N < 0:
        return ranked
    return ranked.iloc[:N]

In [18]:
%%time
# Read cohort
cohort = pd.read_csv("/data/references/compendium/v4/expression.tsv", sep="\t", index_col=0)
print("Cohort Shape:", cohort.shape)
cohort.head()

('Cohort Shape:', (27130, 11073))
CPU times: user 3min 51s, sys: 37.6 s, total: 4min 29s
Wall time: 4min 29s


In [19]:
%%time
# Concordance with prior run using the Tumormap docker
tests = [
    ("TH27_0682_S01",
     "/data/notebooks/e-t-k/protocol_batches/thops88/output/TH27_0682_S01/rsem.genes.tpm.hugo.log2plus1.dedupe.tab",
     "/data/notebooks/e-t-k/protocol_batches/thops88/output/TH27_0682_S01/tumormap_results.txt"),
    ("TH27_0682_S01",
     "/data/notebooks/e-t-k/protocol_batches/thops88/output/TH27_0682_S02/rsem.genes.tpm.hugo.log2plus1.dedupe.tab",
     "/data/notebooks/e-t-k/protocol_batches/thops88/output/TH27_0682_S02/tumormap_results.txt"),
    ("FAIL.TH03_0118_S03",
     "/data/notebooks/e-t-k/protocol_batches/thops88/output/FAIL.TH03_0118_S03/rsem.genes.tpm.hugo.log2plus1.dedupe.tab",
     "/data/notebooks/e-t-k/protocol_batches/thops88/output/FAIL.TH03_0118_S03/tumormap_results.txt"),
    ("FAIL.TH03_0145_S02",
     "/data/notebooks/e-t-k/protocol_batches/thops88/output/FAIL.TH03_0145_S02/rsem.genes.tpm.hugo.log2plus1.dedupe.tab",
     "/data/notebooks/e-t-k/protocol_batches/thops88/output/FAIL.TH03_0145_S02/tumormap_results.txt")
]

for sample_id, sample_path, original_result_path in tests:
    print("Computing closest for {}".format(sample_id))
    sample = pd.read_csv(sample_path, sep="\t", index_col=0)
    closest = nearest_samples(cohort, sample, 6)

    original_results = pd.read_csv(original_result_path, sep="\t", header=None)
    print("Original:")
    print original_results

    print("New:")
    new_results = pd.DataFrame({0: sample_id, 1: closest.index, 2: closest[0].values}, index=np.arange(len(closest)))
    print new_results

    print("Concordant: {}".format(np.isclose(original_results[2], new_results[2])))

Computing closest for TH27_0682_S01
Computing intersection
Transforming Rank
Computing distances
Original:
               0                    1         2
0  TH27_0682_S01  TARGET-20-PASFEW-09  0.914514
1  TH27_0682_S01  TARGET-20-PASTTW-09  0.906125
2  TH27_0682_S01  TARGET-20-PARCCH-03  0.905646
3  TH27_0682_S01  TARGET-20-PASVYA-04  0.900991
4  TH27_0682_S01  TARGET-20-PALGKX-04  0.891944
5  TH27_0682_S01  TARGET-20-PASXNR-04  0.891034
New:
               0                    1         2
0  TH27_0682_S01  TARGET-20-PASFEW-09  0.914514
1  TH27_0682_S01  TARGET-20-PASTTW-09  0.906125
2  TH27_0682_S01  TARGET-20-PARCCH-03  0.905646
3  TH27_0682_S01  TARGET-20-PASVYA-04  0.900991
4  TH27_0682_S01  TARGET-20-PALGKX-04  0.891944
5  TH27_0682_S01  TARGET-20-PASXNR-04  0.891034
Concordant: [ True  True  True  True  True  True]
Computing closest for TH27_0682_S01
Computing intersection
Transforming Rank
Computing distances
Original:
               0                1         2
0  TH27_0682_S0