In [None]:
from mvtrajectories import *
import pandas as pd
import numpy as np
import math
import random
import matplotlib.pyplot as plt
from copy import deepcopy
%matplotlib inline

In [None]:
# Load the table that every later cell works from. Rows are grouped by the
# 'patientID' column; for each patient, columns 5 onward and column 1 are
# handed to Trajectory (presumably the abundance values and the sample
# times -- confirm against mvtrajectories).
mb = pd.read_csv('mb-relative-abundances.csv')

### k-means

#### Parameter Search

In [None]:
# Parameter search over the number of clusters k.
# For every k the clustering is run N + 1 times from different random
# initialisations. The table records, per k, the mean Rand index between each
# run and the run before it, and the mean Dunn-like index over all runs.
random.seed(30821)
ktable = pd.DataFrame(columns = ['k', 'Rand', 'Dunn'], dtype = np.float64)
ks = [4, 6, 8, 10]
N = 10
scalar = 0.001
nrm = zero_inflated_lp_norm
ids = list(mb['patientID'].unique())
patients = mb.groupby(['patientID']).size()
# only patients with at least 6 rows may seed an initial cluster
six = list(patients[patients >= 6].index)

def mean(x):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(x) / len(x)

itr = 1
for k in ks:
    rs = []  # Rand index of each run against the previous run (N values)
    ds = []  # Dunn-like index of every run (N + 1 values)
    prevassign = None

    # run 0 is the reference run; runs 1..N are each compared with the run before
    for run in range(N + 1):
        if run > 0:
            print(k, run - 1)

        # make a fresh list of Trajectory class objects for this run
        trajectories = []
        for pid in ids:
            subset = mb[mb['patientID'] == pid]
            trajectories.append(Trajectory(pid, subset.iloc[:,5:].values, subset.iloc[:,1].values, scalar))

        # initialize clusters randomly from the eligible patients
        init_ids = random.sample(six, k)
        clusters = [deepcopy(trajectories[ids.index(pid)]) for pid in init_ids]
        for i in range(k):
            clusters[i].cluster = i # name clusters

        # sentinel lists that differ, so the loop body runs at least once
        prev_assign = [0 for x in range(len(trajectories))]
        curr_assign = [1 for x in range(len(trajectories))]
        while curr_assign != prev_assign:
            prev_assign = curr_assign

            # assign every trajectory to its nearest cluster; the same norm
            # (nrm) is used for every cluster so the distances are comparable
            for t in trajectories:
                dist = None
                for center in clusters:
                    frsp = free_space(t.longitudinal,
                                      center.longitudinal,
                                      t.times,
                                      center.times,
                                      nrm)
                    curr = frechet_dist(frsp)
                    # strict '<' keeps the first cluster on ties
                    if dist is None or curr < dist:
                        dist = curr
                        traj = backtrack(frsp)
                        idx = center.cluster
                t.cluster = idx
                t.parameterization = traj
                t.dist = dist

            # recalculate each cluster trajectory from its members
            for center in clusters:
                members = [x for x in trajectories if x.cluster == center.cluster]
                center.longitudinal, center.times = mean_trajectory(members)

            curr_assign = [x.cluster for x in trajectories]

        ds.append(a_dunn_like_index(trajectories, clusters, nrm))
        currassign = [x.cluster for x in trajectories]
        if prevassign is not None:
            rs.append(rand_index(prevassign, currassign))
        prevassign = currassign

    ktable.loc[itr] = [k, mean(rs), mean(ds)]
    itr += 1

ktable.to_csv('k-parameter-search.csv', index = False)

In [None]:
# Parameter search over the scalar passed to Trajectory.
# For every scalar the clustering is run N + 1 times from different random
# initialisations. The table records, per scalar, the mean Rand index between
# each run and the run before it, and the mean Dunn-like index over all runs.
random.seed(30921)
stable = pd.DataFrame(columns = ['Scalar', 'Rand', 'Dunn'], dtype = np.float64)
ss = [0.01, 0.001, 0.0001]
N = 10
k = 6
nrm = lp_norm
ids = list(mb['patientID'].unique())
patients = mb.groupby(['patientID']).size()
# only patients with at least 6 rows may seed an initial cluster
six = list(patients[patients >= 6].index)

def mean(x):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(x) / len(x)

itr = 1
for s in ss:
    rs = []  # Rand index of each run against the previous run (N values)
    ds = []  # Dunn-like index of every run (N + 1 values)
    prevassign = None

    # run 0 is the reference run; runs 1..N are each compared with the run before
    for run in range(N + 1):
        if run > 0:
            print(s, run - 1)

        # make a fresh list of Trajectory class objects for this run
        trajectories = []
        for pid in ids:
            subset = mb[mb['patientID'] == pid]
            trajectories.append(Trajectory(pid, subset.iloc[:,5:].values, subset.iloc[:,1].values, s))

        # initialize clusters randomly from the eligible patients
        init_ids = random.sample(six, k)
        clusters = [deepcopy(trajectories[ids.index(pid)]) for pid in init_ids]
        for i in range(k):
            clusters[i].cluster = i # name clusters

        # sentinel lists that differ, so the loop body runs at least once
        prev_assign = [0 for x in range(len(trajectories))]
        curr_assign = [1 for x in range(len(trajectories))]
        while curr_assign != prev_assign:
            prev_assign = curr_assign

            # assign every trajectory to its nearest cluster.
            # Every distance must use the same norm (nrm): the repeated runs
            # previously measured cluster 0 with zero_inflated_lp_norm and the
            # other clusters with nrm, so the '<' comparison mixed two metrics.
            for t in trajectories:
                dist = None
                for center in clusters:
                    frsp = free_space(t.longitudinal,
                                      center.longitudinal,
                                      t.times,
                                      center.times,
                                      nrm)
                    curr = frechet_dist(frsp)
                    # strict '<' keeps the first cluster on ties
                    if dist is None or curr < dist:
                        dist = curr
                        traj = backtrack(frsp)
                        idx = center.cluster
                t.cluster = idx
                t.parameterization = traj
                t.dist = dist

            # recalculate each cluster trajectory from its members
            for center in clusters:
                members = [x for x in trajectories if x.cluster == center.cluster]
                center.longitudinal, center.times = mean_trajectory(members)

            curr_assign = [x.cluster for x in trajectories]

        ds.append(a_dunn_like_index(trajectories, clusters, nrm))
        currassign = [x.cluster for x in trajectories]
        if prevassign is not None:
            rs.append(rand_index(prevassign, currassign))
        prevassign = currassign

    stable.loc[itr] = [s, mean(rs), mean(ds)]
    itr += 1

stable.to_csv('scalar-parameter-search.csv', index = False)

#### Final Run

In [None]:
# parameters of the final run
scalar = 0.001
nrm = zero_inflated_lp_norm
k = 6

# build one Trajectory object per patient
ids = list(mb['patientID'].unique())
trajectories = []
for pid in ids:
    rows = mb[mb['patientID'] == pid]
    trajectories.append(Trajectory(pid, rows.iloc[:,5:].values, rows.iloc[:,1].values, scalar))

# patients with at least 6 rows are the candidates for the initial clusters
patients = mb.groupby(['patientID']).size()
six = list(patients[patients >= 6].index)

# draw k candidates at random and copy their trajectories as the initial
# clusters; each cluster is then named by its position in the list
random.seed(31021)
init_ids = random.sample(six, k)
clusters = [deepcopy(trajectories[ids.index(pid)]) for pid in init_ids]
for label, center in enumerate(clusters):
    center.cluster = label

In [None]:
# Alternate between (1) assigning each trajectory to the cluster with the
# smallest frechet_dist and (2) recomputing every cluster with mean_trajectory,
# until the assignment no longer changes from one pass to the next.
prev_assign = [0] * len(trajectories)   # sentinels that differ, so the
curr_assign = [1] * len(trajectories)   # loop body runs at least once
ctr = 0
while curr_assign != prev_assign:
    ctr += 1
    prev_assign = curr_assign

    # (1) assignment step
    for member in trajectories:
        best_dist = None
        for center in clusters:
            space = free_space(member.longitudinal,
                               center.longitudinal,
                               member.times,
                               center.times,
                               nrm)
            candidate = frechet_dist(space)
            # strict '<' means the earliest cluster wins a tie
            if best_dist is None or candidate < best_dist:
                best_dist = candidate
                best_param = backtrack(space)
                best_label = center.cluster
        member.cluster = best_label
        member.parameterization = best_param
        member.dist = best_dist

    # progress report: iteration number, then the size of every cluster
    print('iteration', ctr)
    for label in range(k):
        print(len([m for m in trajectories if m.cluster == label]))

    # (2) update step
    for center in clusters:
        members = [m for m in trajectories if m.cluster == center.cluster]
        center.longitudinal, center.times = mean_trajectory(members)

    curr_assign = [m.cluster for m in trajectories]

### Plotting

#### Exploring

In [None]:
# Show the names of the columns from index 5 onward, i.e. the same column
# slice that is passed to Trajectory when the trajectories are built.
print(list(mb.columns[5:]))
# multiline comes from mvtrajectories; presumably it draws the first cluster's
# trajectory with one line per column -- confirm there what the False flag does.
multiline(clusters[0], scalar, False)

#### Saving

In [None]:
# Save one figure per cluster under results/6-0.001/.
# Forward slashes are used on purpose: inside a normal string literal a
# backslash followed by 6 is an octal escape (the control character 0x06), so
# the original backslash-separated path did not name the intended directory.
# Forward slashes are accepted as path separators on Windows as well.
# Only the first figure is saved with xa = True and ya = True.
multiline_save('results/6-0.001/cluster1.png', 'Cluster 1', clusters[0], scalar, False, xa = True, ya = True)
for i in range(1, k):
    multiline_save('results/6-0.001/cluster' + str(i+1) + '.png', 'Cluster ' + str(i+1), clusters[i], scalar, False)

### Some cluster exploration

In [None]:
def mean(x):
    """Return the arithmetic mean of the numbers in ``x``.

    An empty input yields ``float('nan')`` instead of raising
    ZeroDivisionError, so summarising a cluster that ended up with no members
    does not abort the surrounding loop.
    """
    if len(x) == 0:
        return float('nan')
    return sum(x) / len(x)

# For every cluster, report the average of the stored `dist` values of the
# trajectories assigned to it (clusters are printed with 1-based numbers).
for label in range(len(clusters)):
    member_dists = [t.dist for t in trajectories if t.cluster == label]
    print('cluster', label + 1)
    print('mean distance:', mean(member_dists))
    print('\n')

In [None]:
# Print frechet_dist for every unordered pair of cluster trajectories,
# one tab-separated line per pair (clusters are numbered from 1).
n_clusters = len(clusters)
for i in range(n_clusters):
    first = clusters[i]
    for j in range(i + 1, n_clusters):
        second = clusters[j]
        pair_space = free_space(first.longitudinal,
                                second.longitudinal,
                                first.times,
                                second.times,
                                zero_inflated_lp_norm)
        print(i + 1, '\t', j + 1, '\t', frechet_dist(pair_space))

### Some output exploration

In [None]:
# Load the covariate table and replace missing 'agvhgrd' values with 0.
gvhd = pd.read_csv('GvHD_Covariates_571.csv')
gvhd = gvhd.assign(agvhgrd = gvhd['agvhgrd'].fillna(0))

In [None]:
# For every cluster, look up its members in gvhd by 'sub_ID' and print the
# mean and the full list of their 'agvhgrd' values.
for label in range(len(clusters)):
    members = [t.identity for t in trajectories if t.cluster == label]
    print('cluster', label + 1)
    grades = gvhd[gvhd['sub_ID'].isin(members)]['agvhgrd']
    print('mean grade:', grades.mean(skipna = True))
    print('grades:', list(grades))
    print('\n')

#### Saving Clusters

In [None]:
# Build `out`: the rows of mb for every clustered patient plus a 1-based
# 'cluster' column, ordered cluster by cluster.
#
# Each piece is created with .assign(), which returns a new frame, so no
# column is ever set on a filtered slice of mb (the original pattern raised
# SettingWithCopyWarning). Members are matched directly on their identity;
# this selects the same rows as looking the identities up in `ids` and
# indexing mb['patientID'].unique() with the positions. The pieces are
# concatenated once at the end instead of growing `out` inside the loop.
cluster_frames = []
for i in range(len(clusters)):
    members = [x.identity for x in trajectories if x.cluster == i]
    cluster_frames.append(mb[mb['patientID'].isin(members)].assign(cluster = i + 1))
out = pd.concat(cluster_frames)

In [None]:
# Write the clustered table to results/6-0.001/.
# Forward slashes are used on purpose: in a normal string literal a backslash
# followed by 6 is an octal escape and a backslash followed by f is a form
# feed, so the original backslash-separated path contained control characters
# instead of directory separators.
out.to_csv('results/6-0.001/final-mb-clusters-6-0.001.csv', index = False)