In [None]:
import re
from pathlib import Path
import pandas as pd

from ratschlab_common.viz import create_grid, clean_axes_grid
import matplotlib.pyplot as plt

In [None]:
# Base directory for this analysis; the per-dataset CSV paths in path_lut are
# joined onto it.
# NOTE(review): absolute, cluster-specific path - the notebook only runs where
# this location is mounted.
wdir = Path('/cluster/work/grlab/projects/projects2019-supervario/')

In [None]:
# Dataset name -> CSV file holding that dataset's matrix (all below wdir).
path_lut = {
    '10xbreast': wdir / '10x_data_breastcancer/sliceB/processed_files/matrices_homoProp0.5_epsilon0.001_theta0.01/simMat_diff.csv',
    '10xbreast8cell': wdir / '10x_data_breastcancer/sliceB/processed_files/matrices_8cells/simMat_diff.csv',
    'varsim005x': wdir / 'hana_varsim_data/matrices_h0.15/simMat_diff.csv',
    'varsim01x': wdir / 'hana_varsim_data/matrices_cov0.1/simMat_diff.csv',
    'varsim02x': wdir / 'hana_varsim_data/matrices_cov0.2/simMat_diff.csv',
}

# Dataset name -> numpy array with the file's contents, sign-flipped on load.
# The CSVs are read without a header row.
# NOTE(review): the negation presumably converts the stored quantity into the
# orientation the later cells expect - confirm against the matrix producer.
data_lut = {}
for key, path in path_lut.items():
    frame = pd.read_csv(path, header=None)
    data_lut[key] = -frame.values

In [None]:
def split_hist(ax, data, **kwargs):
    """Draw three overlaid histograms of sub-blocks of ``data`` on ``ax``.

    The first ``data.shape[0] // 2`` indices form one group and the remaining
    indices the other. The same index mask is applied to rows and columns, so
    ``data`` has to be square. The histograms are labelled:

    * ``'healthy'``  - first-group rows x first-group columns
    * ``'tumor'``    - second-group rows x second-group columns
    * ``'mismatch'`` - first-group rows x second-group columns

    Any extra keyword arguments are forwarded unchanged to every ``ax.hist``
    call. Returns ``ax``.
    """
    n_samples = data.shape[0]
    first_group = np.arange(n_samples) < n_samples // 2
    second_group = ~first_group

    # (legend label, row mask, column mask), drawn in this order.
    blocks = (
        ('healthy', first_group, first_group),
        ('tumor', second_group, second_group),
        ('mismatch', first_group, second_group),
    )
    for name, row_mask, col_mask in blocks:
        ax.hist(data[row_mask][:, col_mask].ravel(), label=name, **kwargs)
    return ax

In [None]:
# Overview: one 50-bin histogram over all entries of each dataset's matrix.
n_datasets = len(data_lut)
fig, axes = create_grid(n_datasets, 2)

for ax, key in zip(axes.ravel(), data_lut):
    data = data_lut[key]
    ax.set_title(key)
    ax.hist(data.ravel(), bins=50)
clean_axes_grid(axes)

In [None]:
# Show every dataset's matrix as an image, one panel per dataset.
fig, axes = create_grid(len(data_lut), 2)
for ax, (key, data) in zip(axes.ravel(), data_lut.items()):
    # The array is only displayed here, never modified, so no copy is taken.
    ax.imshow(data)
    # Title each panel with its dataset name, as the histogram cells do;
    # without it the panels cannot be told apart.
    ax.set_title(key)
clean_axes_grid(axes)

In [None]:
# Histogram of each matrix after capping positive entries at 0, with the
# values divided by the standard deviation of the capped matrix.
fig, axes = create_grid(len(data_lut), 2)
for ax, (key, data) in zip(axes.ravel(), data_lut.items()):
    # clip returns a new array, so the matrix stored in data_lut is untouched.
    data = data.clip(max=0)
    scaled = data.ravel() / data.std()
    ax.hist(scaled, bins=50)
    ax.set_title(key)
clean_axes_grid(axes)

In [None]:
# Preprocessing keys of the form accepted by preproc().
# NOTE(review): not referenced by any other cell visible here.
preproc_list = [
    'cut',
    'zscore-exp_beta0.5',
    'shift',
]


In [None]:
def preproc(key, data):
    """Apply the preprocessing step named by ``key`` to ``data``.

    Supported keys:

    * ``'cut'``   - entries below zero are replaced by 0.
    * ``'shift'`` - the global minimum is subtracted from every entry.
    * ``'zscore-exp_beta<B>'`` - ``data`` is standardised with ``zscore`` and
      then mapped elementwise through ``exp(B * z)``; ``B`` is parsed as a
      float from the text following the prefix.

    The input array is never modified; every branch returns a new array.

    Raises:
        ValueError: if ``key`` is not one of the forms above, or if the beta
            suffix cannot be parsed as a float.
    """
    zscore_prefix = 'zscore-exp_beta'

    if key == 'cut':
        # Work on a copy so the caller's array is left intact, matching the
        # other branches, which all build new arrays.
        data = data.copy()
        data[data < 0] = 0

    elif key == 'shift':
        data = data - data.min()

    elif key.startswith(zscore_prefix):
        beta = float(key[len(zscore_prefix):])
        data = np.exp(beta * zscore(data))

    else:
        raise ValueError(f'unknown preprocessing key: {key!r}')

    return data

In [None]:
import numpy as np
def zscore(data):
    """Standardise ``data`` with its overall mean and standard deviation.

    Both statistics come from ``.mean()`` / ``.std()`` called without
    arguments, i.e. over all entries at once. The standard deviation is taken
    from the already-centered values. The input is not modified; a new array
    is returned.
    """
    centered = data - data.mean()
    return centered / centered.std()

In [None]:
# For each beta, draw one figure with a panel per dataset: the matrix is
# z-scored, mapped through exp(beta * z), and shown as split histograms.
for beta in (0.1, 0.25, 0.5):
    fig, axes = create_grid(len(data_lut), 2)
    first_column = axes[:, 0]
    for ax, (key, data) in zip(axes.ravel(), data_lut.items()):
        data = np.exp(beta * zscore(data.copy()))
        split_hist(ax, data, bins=50, range=(0, 3), alpha=0.5)
        ax.set_title(key)
        # Only panels in the first column of the grid get a legend.
        if ax in first_column:
            ax.legend()
    clean_axes_grid(axes)

In [None]:
# Cluster every dataset into two groups, once per preprocessing variant.
# Results are stored as labels['<dataset>-<preprocessing>'] = label array.
# NOTE(review): SpectralClustering is not imported in the cells visible here -
# presumably sklearn.cluster.SpectralClustering; make sure it is imported
# before this cell runs.
labels = dict()
for key, data in data_lut.items():
    for pp in ['zscore-exp_beta0.5', 'zscore-exp_beta0.1', 'zscore-exp_beta0.25']:
        print(key, pp)
        # Always preprocess the raw matrix. Rebinding `data` here would feed
        # the output of one variant into the next, so later variants would be
        # computed on already-transformed values.
        affinity = preproc(pp, data.copy())
        # affinity='precomputed': the matrix passed to fit() is used directly
        # as the affinity matrix.
        model = SpectralClustering(2, affinity='precomputed', n_init=100, random_state=15)
        model.fit(affinity)
        labels[f'{key}-{pp}'] = model.labels_

In [None]:
# For each clustering run, print the mean label of the first and of the
# second half of the samples (the same half split split_hist uses).
for key, label in labels.items():
    half = label.size // 2
    first_mean = label[:half].mean()
    second_mean = label[half:].mean()
    print(key, first_mean, second_mean)