In [40]:
%%time
import os
import numpy as np
os.chdir('../src/')
from util import clean_data as cd
from util import viz
import algos.basic as basic
import algos.similarity as sim
import pandas as pd
METADATA_PATH = '../data/Phenotypic_V1_0b_preprocessed1.csv'
PREFIX = '../data/rois_ez/'
POSTFIX = '_rois_ez.1D'

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 21.9 µs


In [41]:
%%time
md = cd.get_metadata(METADATA_PATH, ['FILE_ID', 'DX_GROUP'])

CPU times: user 28 ms, sys: 0 ns, total: 28 ms
Wall time: 25 ms


In [42]:
%%time
good_md = cd.filter_file_names(md, PREFIX, POSTFIX)
f_names = np.array(map(lambda x: x['FILE_ID'], good_md))
dx_groups = np.array(map(lambda x: int(x['DX_GROUP']), good_md))
p = np.random.permutation(len(dx_groups))
dx_groups = dx_groups[p]
f_names = f_names[p]

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 172 ms


In [43]:
%%time
data = cd.get_data(f_names)
data = cd.run_function(basic.transpose, data)
data = cd.run_function(basic.mean_center, data)

Processed 0000 of 0884 brains.
Processed 0100 of 0884 brains.
Processed 0200 of 0884 brains.
Processed 0300 of 0884 brains.
Processed 0400 of 0884 brains.
Processed 0500 of 0884 brains.
Processed 0600 of 0884 brains.
Processed 0700 of 0884 brains.
Processed 0800 of 0884 brains.
Processed 0000 of 0884 brains.
Processed 0100 of 0884 brains.
Processed 0200 of 0884 brains.
Processed 0300 of 0884 brains.
Processed 0400 of 0884 brains.
Processed 0500 of 0884 brains.
Processed 0600 of 0884 brains.
Processed 0700 of 0884 brains.
Processed 0800 of 0884 brains.
CPU times: user 11.1 s, sys: 256 ms, total: 11.4 s
Wall time: 17.1 s


In [None]:
%%time
n_train = 614
n_tune = 90
n_test = 180
D = {
    'train' : data[:n_train],
    'tune' : data[n_train:n_train + n_tune],
    'test' : data[:-n_test]
}
L = {
    'train' : dx_groups[:n_train],
    'tune' : dx_groups[n_train:n_train + n_tune],
    'train' : dx_groups[:-n_test]
}
print len(D['train'])
print len(D['tune'])
print len(D['test'])
train_a, train_c = cd.split_groups(D['train'], L['train'])
tune_a, tune_c = cd.split_groups(D['tune'], L['tune'])
data_a = cd.concat_group(data_a)
data_c = cd.concat_group(data_c)
tune_a = cd.concat_group(data_a)
tune_c = cd.concat_group(data_c)
print data_a.shape
print data_c.shape
print tune_a.shape
print tune_c.shape

614
90
704


In [7]:
params = {
    'k' : 15,
    'h' : 2,
    'd_cont' : 5,
    'd_words' : 5
}
model = cd.train(data_a, params)
 = cd.embed(data_a, model)
print embedding.shape

(15, 8850)


In [None]:
%%time
correl = cd.run_function(basic.correl, data)
viz.heatmap_channels(cd.run_function(basic.transpose, data),
         1,
         title='Activity over time',
         xtitle='Time step',
         ytitle='Region of Interest')
viz.heatmap_channels(correl,
         1,
         title='Correlation',
         xtitle='ROI',
         ytitle='ROI')

In [None]:
%%time
tsap = cd.run_function(sim.tsap, data)
n_clusts = cd.run_function(sim.n_clusts, tsap)
avg_clusts = cd.run_function(sim.avg_clust_size, tsap)
min_clusts = cd.run_function(sim.min_clust_size, tsap)
max_clusts = cd.run_function(sim.max_clust_size, tsap)
viz.line(n_clusts,
         1,
         title='Number of Clusters Over Time',
         xtitle='Time step',
         ytitle='Number of Clusters')
viz.line(avg_clusts,
         1,
         title='Average Cluster Size Over Time',
         xtitle='Time step',
         ytitle='Number of Clusters')
viz.line(max_clusts,
         1,
         title='Maximum Cluster Size Over Time',
         xtitle='Time step',
         ytitle='Number of Clusters')
viz.line(min_clusts,
         1,
         title='Minimum Cluster Size Over Time',
         xtitle='Time step',
         ytitle='Number of Clusters')

In [None]:
viz.heatmap_channels(tsap,
                     1,
                     title='Clusters over Time',
                     xtitle='Timestep',
                     ytitle='ROI',
                     ztitle='Cluster Label')

In [None]:
%%time
dist = cd.run_function(sim.dtw, data)

In [None]:
viz.heatmap_channels(dist,
                     1,
                     title='Raw Data Heatmap',
                     xtitle='Timestep',
                     ytitle='ROI',
                     ztitle='4D Intensity (mean centered)')

In [None]:
%%time
affinity = cd.run_function(basic.neg, dist)
labels = cd.run_function(sim.ap, affinity)
ordered = cd.run_function(basic.cluster_sort, zip(data, labels))
correl = cd.run_function(basic.correl, ordered)

In [None]:
viz.heatmap_channels(correl,
                     1,
                     title='Raw Data Heatmap',
                     xtitle='Timestep',
                     ytitle='ROI',
                     ztitle='4D Intensity (mean centered)')

In [None]:
%%time
sses = cd.run_function(basic.SSE, zip(data, labels))
ssses = [np.sum(s) for s in sses]

In [None]:
import matplotlib.pyplot as plt
plt.scatter(ssses, dx_groups)
plt.show()

In [None]:
%%time
silhouette = cd.run_function(basic.silhouette, zip(dist, labels))
plt.scatter(silhouette, dx_groups)
plt.show()