In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load both trajectories and assemble the combined, labelled input arrays
reduction_factor = 5  # stride for the "selected" subsets (clustering / y_hc-dependent frame identification)


def _load_trajectory(path, label):
    """Read one trajectory file and derive its features, labels and strided subset.

    The first column of the file (the frame number, per the original notebook
    comment) is discarded and the remaining columns are divided by 100.

    Parameters
    ----------
    path : str
        Text file readable by ``np.loadtxt``.
    label : float
        Class value stored once per row of the file.

    Returns
    -------
    tuple
        ``(features, labels, selected)``, where ``selected`` keeps every
        ``reduction_factor``-th row of ``features``.
    """
    features = np.loadtxt(path)[:, 1:] / 100
    labels = np.full(len(features), label, dtype=float)
    return features, labels, features[::reduction_factor]


# wildtype frames carry label 0, D132-H mutant frames carry label 1
wildtype_data, wildtype_truth, wildtype_selected = _load_trajectory("result0_wt.txt", 0)
mutant_1_data, mutant_1_truth, mutant_1_selected = _load_trajectory("result1_D132-H.txt", 1)

shape_report = (
    ('Wildtype Training Data Shape:', wildtype_data),
    ('Wildtype Truth Data Shape:   ', wildtype_truth),
    ('Wildtype Selected Data Shape:', wildtype_selected),
    ('D132-H   Training Data Shape:', mutant_1_data),
    ('D132-H   Truth Data Shape:   ', mutant_1_truth),
    ('D132-H   Selected Data Shape:', mutant_1_selected),
)
for caption, array in shape_report:
    print(caption, array.shape)

# stack wildtype rows first, mutant rows second; labels follow the same order
lcp_data   = np.concatenate((wildtype_data, mutant_1_data), axis=0)
truth_data = np.concatenate((wildtype_truth, mutant_1_truth))

print ("\nCombined input data:", lcp_data.shape)
print ("Combined truth_data:", truth_data.shape, "\n")

In [None]:
# Restore the saved autoencoder and build a sub-model that stops at 'dense_1'
# NOTE(review): 'dense_1' is presumably the 2-D latent (bottleneck) layer — confirm against the training notebook
autoencoder = tf.keras.models.load_model('saved_model_359')

encoder_input = autoencoder.get_layer('input_1').input
latent_output = autoencoder.get_layer('dense_1').output

# dr_model maps an input frame to the output of the 'dense_1' layer
dr_model = tf.keras.models.Model(inputs=encoder_input, outputs=latent_output)

In [None]:
# Project every frame onto the latent plane and plot it, coloured by trajectory.
#
# The original cell looped over a hard-coded 80000 frames and called
# dr_model.predict() once per frame. That is extremely slow (one Keras call
# per sample) and raises IndexError as soon as the dataset has fewer rows.
# A single batched predict() over the whole array yields the same coordinates
# and adapts to whatever number of frames was loaded.
if len(lcp_data) != len(truth_data):
    raise ValueError("lcp_data and truth_data must contain the same number of frames")

latent_coords = dr_model.predict(lcp_data)  # one row per frame; columns 0/1 are used as x/y

df = pd.DataFrame({
    'x': latent_coords[:, 0],
    'y': latent_coords[:, 1],
    # str() of the float label gives e.g. "trajectory-0.0"; the cell that
    # reloads this pickle maps exactly those strings back to 0/1
    'z': ["trajectory-" + str(label) for label in truth_data],
})

plt.figure(figsize=(8, 6))
ax = sns.scatterplot(x='x', y='y', hue='z', data=df, s=10)  # seaborn returns the Axes it drew on
ax.figure.savefig("combined_data_0.png", dpi=300)
plt.show()

# save the result so later cells can reload it without re-running the model
df.to_pickle ("combined_data_result")

In [None]:
# Reload the saved latent coordinates and plot them coloured by trajectory label
label_codes = {'trajectory-0.0': 0, 'trajectory-1.0': 1}  # string labels -> numeric colour values
result_df = pd.read_pickle ('combined_data_result').replace(label_codes)

ax = plt.gca()
ax.set_ylim(-1.5, 4)
ax.set_xlim(-1.6, 2.6)
ax.set_xlabel("Latent Plane x-coordinates")
ax.set_ylabel("Latent Plane y-coordinates")
# "bwr" colormap: label 0 is drawn blue, label 1 red
ax.scatter(
    result_df['x'],
    result_df['y'],
    c=result_df['z'],
    cmap="bwr",
    edgecolors="none",
    s=0.5,
    alpha=1,
)
ax.figure.savefig("combined_data_1.png", dpi=300)
plt.show()

In [None]:
# Plot the latent plane coloured by cluster assignment.
#
# result_df is read from "combined_data_result", which this notebook writes
# with only the x / y / z columns, so result_df['cluster'] raises KeyError on
# a fresh Restart & Run All. When the column is missing, fall back to the
# cluster-assignment table ("cluster_assigments_12"), whose 'x', 'y' and
# 'cluster' columns are the ones the following cells plot.
if 'cluster' in result_df.columns:
    clustered = result_df
else:
    clustered = pd.read_pickle("cluster_assigments_12")

plt.ylim (-1.5,4)
plt.xlim (-1.6, 2.6)
plt.xlabel("Latent Plane x-coordinates")
plt.ylabel("Latent Plane y-coordinates")
plt.scatter (clustered['x'], clustered['y'], s = 1, c = clustered['cluster'], cmap = 'rainbow_r')
plt.savefig("combined_data_2.png", dpi = 300)
plt.show()

In [None]:
# Load the per-frame cluster assignments and report the range of cluster ids
cluster_numbers = pd.read_pickle("cluster_assigments_12")
cluster_ids = cluster_numbers['cluster']
print ("The smallest cluster number is:\t", np.min(cluster_ids))
print ("The highest cluster number is:\t",  np.max(cluster_ids))

# Ask which cluster to highlight (manual, interactive inspection)
choice = int(input("Which cluster number would you like to visualize?"))
in_chosen_cluster = cluster_ids == choice
selected = cluster_numbers[in_chosen_cluster]

ax = plt.gca()
ax.set_ylim(-1.5, 4)
ax.set_xlim(-1.6, 2.6)
# all frames in light grey as background, the chosen cluster in red on top
ax.scatter(cluster_numbers['x'], cluster_numbers['y'], s=1, color="gainsboro")
ax.scatter(selected['x'], selected['y'], s=1, color="red")
plt.show()

In [None]:
# Cluster membership along each trajectory; ids are shifted by +1 for display
clusters_shifted = cluster_numbers['cluster'] + 1
trajectory_segments = (
    (0, 8000, 'navy'),      # this is the wildtype data
    (8000, 16000, 'red'),   # this is the D132-H data
)
for first, last, colour in trajectory_segments:
    plt.plot(clusters_shifted[first:last], ".", ms="10", color=colour, alpha=0.1)
plt.title ("Cluster Distributions Over Trajectories")
plt.savefig ("cluster_distributions_trajectories.png", dpi = 300)
plt.show()

In [None]:
# Count frames per cluster for each trajectory and draw them as paired bars
cluster_ids = cluster_numbers['cluster']
print ("Length of cluster-identity assigned array;", len(cluster_ids))

bin_edges = list(range(13))   # edges 0..12 -> twelve unit-wide bins
x_axis = np.arange(1, 13)     # bar positions, labelled 1..12

# np.histogram returns (counts, edges); rows 0-7999 are treated as wildtype,
# rows 8000-15999 as the mutant
wildtype_cluster_distribution = np.histogram(cluster_ids[0:8000], bins=bin_edges)
mutant_cluster_distribution = np.histogram(cluster_ids[8000:16000], bins=bin_edges)

plt.xticks(np.arange(1, 13, 1.0))
plt.xlabel("Cluster #")
plt.ylabel("Number of Frames")
bar_series = (
    (0.0, wildtype_cluster_distribution[0], 'navy'),
    (0.4, mutant_cluster_distribution[0], 'red'),   # shifted right so the bars sit side by side
)
for offset, counts, colour in bar_series:
    plt.bar(x_axis + offset, counts, width=0.4, color=colour)
plt.savefig("differential_cluster_distribution.png", dpi = 300)
plt.show()