Experiments for setting up prototypical networks

In [1]:
import sys
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import mixture

In [2]:
# Apply seaborn's default styling to all subsequent matplotlib figures.
# color_codes=True is documented by seaborn as remapping matplotlib's
# shorthand colour codes ("b", "g", "r", ...) to the seaborn palette.
sns.set(color_codes=True)

In [3]:
# Location that must be importable so that the `compute_engine` package
# resolves from inside this notebook.
COMPUTE_ENGINE_SRC_PATH = "../"

# Register the location once only; re-running the cell just reports that
# the entry is already present instead of appending a duplicate.
if COMPUTE_ENGINE_SRC_PATH in sys.path:
    print("Path='{0}' already in system path ".format(COMPUTE_ENGINE_SRC_PATH))
else:
    sys.path.append(COMPUTE_ENGINE_SRC_PATH)

In [4]:
from compute_engine.src.utils import load_data_file
from compute_engine.src.utils import make_data_array

### Load the data

In [5]:
# Directory holding the training files. Every path below is built from it,
# so the data location can be changed in one place. The trailing "/" is
# part of the constant: paths are formed by plain string concatenation.
TRAIN_DATA_DIR = "../data/train/"

# Each region type has a WGA file and a NO-WGA counterpart.
# NOTE(review): the single copy deletion files are the only ones taken
# from CHR_2 (all others are CHR_1) -- confirm this is intentional.

# TUF region data
wga_mu_tuf_file = TRAIN_DATA_DIR + "wga_windows_mean_0_TUF_I_CHR_1_MEAN_CUTOFF.txt"
nowga_mu_tuf_file = TRAIN_DATA_DIR + "no_wga_windows_mean_0_TUF_I_CHR_1_MEAN_CUTOFF.txt"

# single copy deletion data
wga_mu_single_copy_del_file = TRAIN_DATA_DIR + "wga_windows_mean_0_SINGLE_COPY_DELETION_CHR_2_MEAN_CUTOFF.txt"
nowga_mu_single_copy_del_file = TRAIN_DATA_DIR + "no_wga_windows_mean_0_SINGLE_COPY_DELETION_CHR_2_MEAN_CUTOFF.txt"

# duplication data
wga_mu_duplication_file = TRAIN_DATA_DIR + "wga_windows_mean_0_DUPLICATION_CHR_1_MEAN_CUTOFF.txt"
nowga_mu_duplication_file = TRAIN_DATA_DIR + "no_wga_windows_mean_0_DUPLICATION_CHR_1_MEAN_CUTOFF.txt"

# full copy deletion data
wga_mu_del_file = TRAIN_DATA_DIR + "wga_windows_mean_0_DELETE_CHR_1_MEAN_CUTOFF.txt"
nowga_mu_del_file = TRAIN_DATA_DIR + "no_wga_windows_mean_0_DELETE_CHR_1_MEAN_CUTOFF.txt"

In [6]:
def _load_wga_pair(wga_file, nowga_file, size_error_msg):
    """Load the WGA and NO-WGA data of one region type.

    Both files are read with ``load_data_file(..., type_convert="FLOAT")``,
    the WGA file first. The two results must have the same length.

    :param wga_file: path of the WGA data file
    :param nowga_file: path of the NO-WGA data file
    :param size_error_msg: assertion message used when the lengths differ
    :return: tuple ``(wga_values, nowga_values)``
    :raises AssertionError: if the two loaded sequences differ in length
    """
    wga_values = load_data_file(filename=wga_file, type_convert="FLOAT")
    nowga_values = load_data_file(filename=nowga_file, type_convert="FLOAT")
    assert len(wga_values) == len(nowga_values), size_error_msg
    return wga_values, nowga_values


# TUF regions
wga_mu_tuf, nowga_mu_tuf = _load_wga_pair(
    wga_mu_tuf_file, nowga_mu_tuf_file,
    "Invalid data size for TUF data")

# single copy deletion regions
wga_mu_single_copy_del, nowga_mu_single_copy_del = _load_wga_pair(
    wga_mu_single_copy_del_file, nowga_mu_single_copy_del_file,
    "Invalid data size for single copy deletion data")

# duplication regions
wga_mu_duplication, nowga_mu_duplication = _load_wga_pair(
    wga_mu_duplication_file, nowga_mu_duplication_file,
    "Invalid data size for duplication data")

# full deletion regions
wga_mu_del, nowga_mu_del = _load_wga_pair(
    wga_mu_del_file, nowga_mu_del_file,
    "Invalid data size for full deletion data")

### Mix the data

In [7]:

# NOTE(review): the TUF data loaded earlier (wga_mu_tuf / nowga_mu_tuf) is
# not part of either mix below -- confirm this is intentional.

# WGA sample: single copy deletion, duplication and full deletion data
# concatenated in that order into one flat list
wga_mu = [
    *wga_mu_single_copy_del,
    *wga_mu_duplication,
    *wga_mu_del,
]

# NO-WGA sample: the same region types, concatenated in the same order as
# the WGA sample
no_wga_mu = [
    *nowga_mu_single_copy_del,
    *nowga_mu_duplication,
    *nowga_mu_del,
]

### Cluster the reference data

In [8]:
# Dictionary that holds the states for the HMM.
# Starts empty; it is not populated in the cells visible in this part of
# the notebook.
states={}

In [9]:
# Assemble the clustering input from the mixed WGA / NO-WGA samples and
# turn it into a NumPy array in one step. GC content and the ratio
# feature are both switched off.
data = np.array(
    make_data_array(wga_mu=wga_mu,
                    no_wga_mu=no_wga_mu,
                    gc=None,
                    use_ratio=False,
                    use_gc=False)
)

# Sanity check: one row per mixed sample and exactly two columns
expected_shape = (len(wga_mu), 2)
assert data.shape == expected_shape

#### Options for GMM clustering

In [None]:
# --- model shape ---
# number of mixture components (passed as n_components)
NUM_CLUSTERS = 5
# covariance structure of each component (passed as covariance_type)
COVARIANCE_TYPE = "diag"

# --- fitting control ---
# convergence tolerance (passed as tol)
TOL = 1.0e-5
# upper bound on the number of iterations (passed as max_iter)
MAX_ITRS = 1000
# number of initialisations to try (passed as n_init)
N_INIT = 1

In [None]:
# Fixed seed for the GMM initialisation. Without it every Restart & Run All
# can produce a different fit and a different numbering of the components,
# which in turn changes the colour <-> component association built below.
GMM_RANDOM_STATE = 42

# the GMM cluster, configured from the option constants defined above
gmm = mixture.GaussianMixture(n_components=NUM_CLUSTERS,
                              covariance_type=COVARIANCE_TYPE,
                              tol=TOL, max_iter=MAX_ITRS, n_init=N_INIT,
                              random_state=GMM_RANDOM_STATE)

# fit the data
gmm.fit(data)
   

In [None]:
# Convergence flag recorded on the estimator by the fit
print("Converged: ", gmm.converged_)

# Bayesian information criterion of the fitted mixture, evaluated on the
# same data it was fitted on
bic_score = gmm.bic(data)
print("BIC: ", bic_score)

In [None]:
# Component index predicted by the fitted GMM for every row of `data`
labels = gmm.predict(data)

# Report how many labels were produced
num_labels = len(labels)
print("Len of labels: ", num_labels)

In [None]:
# Fixed palette: entry i is the colour used for samples labelled i
palette = np.array(['green', 'blue', 'red',
                    'yellow', 'pink', 'orange', 'purple', 'navy',
                    'brown'])

# add black color for outliers (if any): it is appended last, so it is the
# entry a label of -1 would select
palette = np.append(palette, ["#000000"])

# Per-sample colours. Kept under the name `colors` because the following
# cell pairs it with `labels`.
colors = palette[labels]

# NO-WGA on the x axis against WGA on the y axis, one point per sample
plt.scatter(no_wga_mu, wga_mu, color=colors)
plt.xlabel("NO-WGA ")
plt.ylabel("WGA")
# `xlim` / `ylim` are not defined in any preceding cell, so the former
# plt.xlim(xlim) / plt.ylim(ylim) calls raised NameError on a fresh kernel.
# The axis ranges are now left to matplotlib's autoscaling.
plt.show()


In [None]:
# Association between each plotted colour and the component it stands for:
#   colour -> [component index, number of samples drawn in that colour]
color_comp_assoc = {}

for component_idx, sample_color in zip(labels, colors):
    entry = color_comp_assoc.get(sample_color)

    if entry is None:
        # first sample seen with this colour
        color_comp_assoc[sample_color] = [component_idx, 1]
    else:
        # a colour must always map to one and the same component
        assert entry[0] == component_idx
        entry[1] += 1