## Simulations

In this notebook we wish to run a simulation demonstrating some of the basic claims we make regarding the random 
forest. 

The key claims we would like to demonstrate are thus:

- A dataset can have hierarchical behavior
    - an RF will identify such hierarchical structure
    - an RF will capture local changes in covariance, etc.
    
    - A PCA CANNOT capture some of the effects that we will identify as local in distinct PCs.

- When a dataset undergoes changes in population prevalence, we identify this as a shift in factor values

- When a dataset undergoes a change in population behavior we identify this as a shift in predictive power

To reflect a hierarchical structure with meaningful local behavior, we will need several features that have different means among different clusters, but importantly also interact with each other, especially in different ways within different clusters. 

Let's operate on 3 features total (the simulation below generates three features driven by three factors). 

## On The Basis of Component Vectors

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import multivariate_normal,norm,beta
from sklearn.datasets import make_blobs
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale


In [None]:
# First we will generate the macro-structure: one factor that applies globally to every sample,
# plus two local factors that are only active within disjoint subsets of the samples.
# All factor scores are drawn from a standard normal distribution.

def generate(noise_multiplier=1, n_samples=10000, random_state=None, plot=True):
    """Simulate samples driven by one global and two local latent factors.

    Three features are produced from three factors. Factor 1 loads on feature 0
    (weight 2) and is active in every sample. Factor 2 loads on feature 1 and is
    only active in the third quarter of the samples; factor 3 loads on feature 2
    and is only active in the last quarter. Within the block where a factor is
    active, its scores are standard-normal draws sorted in ascending order, so
    row order follows the factor score.

    Parameters
    ----------
    noise_multiplier : float, default 1
        Noise level. NOTE(review): it is applied twice, once as the scale of the
        noise covariance and again as a multiplier of the sampled noise (which
        also scales the noise mean of 1). This is preserved so existing results
        do not change; confirm whether the double scaling is intended.
    n_samples : int, default 10000
        Number of samples (rows) to simulate.
    random_state : None, int, or NumPy RandomState/Generator, default None
        None draws from NumPy's global random state (the historical behavior).
        An int seeds one private stream that is shared by all draws.
    plot : bool, default True
        Whether to show the factor matrix as a heatmap (the historical behavior).

    Returns
    -------
    tuple
        ``(combined_loadings, factors, coordinates)`` with shapes
        ``(n_samples, 3)``, ``(3, 3)`` and ``(n_samples, 3)``.
    """
    # Build ONE stream for every draw: handing the same integer seed to each
    # rvs() call separately would restart the stream and correlate the draws.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    global_noise = [1, 1, 1]

    # Rows are factors, columns are features.
    factors = np.vstack([
        [2, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ])

    # Sample boundaries of the two local blocks (5000 and 7500 for 10000 samples).
    local_2_start = n_samples // 2
    local_3_start = (3 * n_samples) // 4

    # Draw order (noise, then factors 1-3) matches the original so that results
    # under an externally seeded global state are unchanged.
    noise = multivariate_normal(
        global_noise, np.identity(3) * noise_multiplier
    ).rvs(n_samples, random_state=rng)
    # rvs() squeezes the result when a single sample is requested.
    noise = np.reshape(noise, (n_samples, 3))

    if plot:
        plt.figure()
        plt.imshow(factors, aspect='auto', interpolation='none')
        plt.show()

    loadings_1 = np.sort(norm().rvs(n_samples, random_state=rng))
    loadings_2 = np.sort(norm().rvs(local_3_start - local_2_start, random_state=rng))
    loadings_3 = np.sort(norm().rvs(n_samples - local_3_start, random_state=rng))

    combined_loadings = np.zeros((n_samples, 3))
    combined_loadings[:, 0] = loadings_1
    combined_loadings[local_2_start:local_3_start, 1] = loadings_2
    combined_loadings[local_3_start:, 2] = loadings_3

    coordinates = np.dot(combined_loadings, factors) + (noise * noise_multiplier)

    return (combined_loadings, factors, coordinates)

# Fix NumPy's global seed so the simulated dataset (and anything downstream that draws
# from the global random state) is reproducible under Restart Kernel -> Run All.
np.random.seed(0)

loadings,factors,coordinates = generate(noise_multiplier=.3)

from scipy.cluster.hierarchy import linkage,dendrogram

# Optional (disabled): reorder the samples by hierarchical clustering of the features.
# sample_agglomeration = dendrogram(linkage(coordinates, metric='cosine', method='average'), no_plot=True)['leaves']

# loadings = loadings[sample_agglomeration]
# coordinates = coordinates[sample_agglomeration]

In [None]:
# Let's take a look at the data we have generated, one heatmap per matrix:
# per-sample factor scores, the factor-by-feature matrix, and the feature values.
# We should have two broad clusters which are easily distinguished

for heading, matrix in (
    ("Example Loadings", loadings),
    ("Example Factor Values", factors),
    ("Example Feature Values", coordinates),
):
    plt.figure()
    plt.title(heading)
    plt.imshow(matrix, aspect='auto', interpolation='none')
    plt.colorbar()
    plt.show()


In [None]:
# Scatter two of the simulated feature columns against each other.
# Title and axis labels are included so the figure can be read on its own.
plt.figure()
plt.title("Simulated Feature Values")
plt.scatter(coordinates[:,1],coordinates[:,2],s=1)
plt.xlabel("coordinates[:, 1]")
plt.ylabel("coordinates[:, 2]")
plt.show()

In [None]:
from scipy.cluster.hierarchy import linkage,dendrogram

# sample_agglomeration = dendrogram(linkage(coordinates, metric='cosine', method='average'), no_plot=True)['leaves']

# Each entry: (title, matrix, x-axis label, extra imshow keyword arguments).
heatmaps = [
    ("Feature Values", coordinates, "Features", {'cmap': 'bwr'}),
    ("True Loadings", loadings, "Factors", {}),
]

for heading, matrix, column_label, extra in heatmaps:
    plt.figure()
    plt.title(heading)
    plt.imshow(matrix, aspect='auto', interpolation='none', **extra)
    plt.colorbar()
    plt.xlabel(column_label)
    plt.ylabel("Samples")
    plt.show()

# plt.figure()
# plt.imshow(coordinates[sample_agglomeration],aspect='auto',interpolation='none')
# plt.show()

In [None]:
# Now we will produce an embedding of the newly generated dataset for
# easier visualization.
# A fixed random_state keeps the embedding, and every later plot that is coloured
# on top of it, reproducible between runs.

t_coordinates = TSNE(random_state=0).fit_transform(coordinates)

plt.figure()
plt.title("TSNE Embedded Simulated Samples")
plt.scatter(*t_coordinates.T)
plt.xlabel("t-SNE 1")
plt.ylabel("t-SNE 2")
plt.show()



In [None]:
# First we can visualize the true factor values in order to understand
# which clusters are which: one embedding plot per factor, coloured by that
# factor's per-sample score.

for k in range(3):
    plt.figure()
    plt.title(f"True Factor {k + 1} Scores")
    plt.scatter(*t_coordinates.T, c=loadings[:, k], cmap='bwr')
    plt.colorbar()
    plt.show()


In [None]:
# Now we can perform a PCA analysis to see if we can recover the 
# global and local factors accurately. 

from sklearn.decomposition import PCA

# Fit a PCA with default settings on the simulated feature matrix; `model` is
# reused by the following cells.
model = PCA().fit(coordinates)

In [None]:
# First we observe that PCA DOES explain most of the variance present in the dataset;
# the curve below shows how that variance is spread across the components.
# NOTE(review): an earlier remark here said this takes 4 components, but `coordinates`
# has only 3 feature columns, so at most 3 PCs exist - confirm the intended statement.

fig, ax = plt.subplots()
ax.set_title("PC Explanatory Power (Ratio)")
ax.plot(model.explained_variance_ratio_)
ax.set_xlabel("PCs")
ax.set_ylabel("Variance Fraction Explained")
plt.show()

# Last expression of the cell: the raw ratios are shown as the cell output.
model.explained_variance_ratio_


In [None]:
# However, we would be interested to see if it's possible to recover local
# feature relationships in the PC loadings. After all, in an ideal case, we would recover
# our loadings exactly.

# One row per principal component, one column per feature.
print(model.components_)


In [None]:
# Here we see that the loadings of the PCs discovered do NOT contain a negative association 
# between 2 and the 6-7 pair. 

In [None]:
# Project every sample onto the fitted principal components (per-sample PC scores).
pct = model.transform(coordinates)
# Last expression of the cell: the shape of the score matrix is shown as output.
pct.shape

In [None]:
# We can observe directly that while the PCs recover the overall structure somewhat correctly,
# they make inappropriately global inferences about individual PCs.

# This occurs despite the fact that it is in principle perfectly possible to represent the
# data structure correctly using 3 PCs with non-standardly distributed scores.
# (As this was the generative process)

# One embedding plot per principal component, coloured by that component's score.
for k in range(3):
    plt.figure()
    plt.title(f"PC{k + 1} Scores")
    plt.scatter(*t_coordinates.T, c=pct[:, k], cmap='bwr')
    plt.colorbar()
    plt.show()


In [None]:
# Symmetric colour limits so that zero sits at the centre of the diverging colormap.
abmax = np.abs(pct).max()

# Heatmap of the first three PC score columns, one row per sample.
fig, ax = plt.subplots()
ax.set_title("Principal Component Loadings")
image = ax.imshow(
    pct[:, :3],
    interpolation='none',
    aspect='auto',
    cmap='bwr',
    vmin=-abmax,
    vmax=abmax,
)
ax.set_xlabel("PCs")
ax.set_ylabel("Samples")
fig.colorbar(image, ax=ax)
plt.show()

In [None]:
# Grid of true factor scores (x axis) against PC scores (y axis).
# Row i shows PC i and column j shows true factor j, so each panel title matches the
# data it plots (indices are zero-based).
plt.figure(figsize=(10,10))
for i in range(3):
    for j in range(3):
        plt.subplot(3,3,(i*3)+(j+1))
        plt.title(f"PC {i}, Factor {j}")
        plt.scatter(loadings[:,j],pct[:,i],s=1)
plt.tight_layout()
plt.show()

# plt.figure()
# plt.scatter(loadings[:,1],pct[:,1])
# plt.show()

# plt.figure()
# plt.scatter(true_factor_scores[:,1],pct[:,2])
# plt.show()

# plt.figure()
# plt.scatter(true_factor_scores[:,1],pct[:,3])
# plt.show()

# plt.figure()
# plt.scatter(true_factor_scores[:,2],pct[:,1])
# plt.show()

# plt.figure()
# plt.scatter(true_factor_scores[:,2],pct[:,2])
# plt.show()

# plt.figure()
# plt.scatter(true_factor_scores[:,2],pct[:,3])
# plt.show()



In [None]:
# Compare the squared error of a 3-component PCA reconstruction with the squared error
# of simply predicting each feature's mean.
model = PCA(n_components=3)
model.fit(coordinates)
pct = model.transform(coordinates)
recovered = model.inverse_transform(pct)

# Residuals around the per-feature mean (baseline) and around the reconstruction.
null_residuals = coordinates - coordinates.mean(axis=0)
recovered_residuals = coordinates - recovered

null_error = np.square(null_residuals).sum()
recovered_error = np.square(recovered_residuals).sum()

print(null_error)
print(recovered_error)

# Last expression of the cell: the fraction of baseline error left after reconstruction.
recovered_error / null_error

In [None]:
# Now we can examine whether a Random Forest can capture the structure that eluded a PCA:

In [None]:
import sys
sys.path.append('../src/')
import tree_reader as tr 
import lumberjack

In [None]:
# We will train a relatively shallow forest, since the dataset is not complex. 
# We have relatively few features and would like a reliable structure, so will bootstrap a 
# large number of features per node (80% bootstrap)
# NOTE(review): none of the arguments below is obviously "80%" (e.g. sfr=.5, ifs=8, ofs=8) -
# confirm which parameter that remark refers to and whether it is still accurate.

# The rest of the parameters aren't deeply important, and so are left without comment. 

# Fit the forest on the simulated feature matrix; `forest` is used by the cells below.
forest = lumberjack.fit(
    coordinates,
    trees=300,
    ifs=8,
    ofs=8,
    braids=1,
    ss=100,
    leaves=10,
    depth=2,
    norm='l1',
    sfr=.5,
#     reduce_input='true',
#     reduce_output='true',
    reduce_input='false',
    reduce_output='false',
)

In [None]:
# Attach the t-SNE embedding computed earlier to the forest object, then rebuild the
# split clusters and the tree over them.
# NOTE(review): these calls presumably have to run in this order
# (reset -> interpret -> spanning tree) - confirm against the lumberjack/tree_reader code.
forest.tsne_coordinates = t_coordinates
forest.reset_split_clusters()
forest.interpret_splits(mode='additive_mean',metric='cosine',depth=3,resolution=1,pca=False,k=200,relatives=True)
forest.maximum_spanning_tree(mode='samples')
# forest.most_likely_tree(depth=4)

In [None]:
# forest.html_tree_summary(n=5)

In [None]:
# The resulting tree is available at:

# https://bx.bio.jhu.edu/track-hubs/bc/sc_summary/simulation/tree_template.html

In [None]:
# Heatmap of every column of the forest factor matrix except the first (the [:,1:] slice
# skips column 0). The x ticks are relabelled 1 .. len(forest.split_clusters)-1 so the
# tick labels line up with the original column indices.
plt.figure()
plt.imshow(forest.factor_matrix()[:,1:],cmap='bwr',vmin=-1,vmax=1,aspect='auto',interpolation='none')
plt.xticks(np.arange(len(forest.split_clusters[:-1])),np.arange(1,len(forest.split_clusters)))
plt.colorbar()
plt.show()

# NOTE(review): hard-coded column order. It requires the factor matrix to have at least
# 7 columns and presumably reflects one particular fit - re-check after re-training.
reordered = [1,5,3,4,2,6]

# Same heatmap restricted to, and ordered by, the columns listed in `reordered`.
plt.figure()
plt.title("Forest Factor Loadings")
plt.imshow(forest.factor_matrix()[:,reordered],cmap='bwr',vmin=-1,vmax=1,aspect='auto',interpolation='none')
plt.colorbar()
plt.show()

In [None]:
# Grid of true factor scores (x axis) against forest factor values (y axis):
# one row per forest factor, one column per true factor.
factor_matrix = forest.factor_matrix()
f_d = factor_matrix.shape[1]
l_d = loadings.shape[1]

plt.figure(figsize=(10,20))
for i in range(f_d):
    for j in range(l_d):
        # Row-major, 1-based panel index. The row stride must be the number of grid
        # columns (l_d), not a literal 3, or the layout breaks when l_d changes.
        plt.subplot(f_d,l_d,(i*l_d)+(j+1))
        plt.title(f"Forest Factor {i}, True Factor {j}")
        plt.scatter(loadings[:,j],factor_matrix[:,i],s=1)
plt.tight_layout()
plt.show()


In [None]:
# Now we would like to range over a variety of noise levels

# noise_range = np.zeros((5,10000,10))
# true_range = np.zeros((5,10000,3))

# for i,noise in enumerate(range(-2,3)):
    
#     true_factor_scores = np.zeros((10000,3))
#     coordinates = np.zeros((10000,10))

#     true_factor_scores,coordinates = generate_global(
#         true_factor_scores,
#         coordinates,
#         noise_multiplier=2**noise,
#     )

#     true_factor_scores,coordinates = generate_local(
#         true_factor_scores,
#         coordinates,
#     )



#     plt.figure()
#     plt.title("Individual Feature Values")
#     plt.imshow(coordinates,aspect='auto',interpolation='none')
#     plt.colorbar()
#     plt.xlabel("Features")
#     plt.ylabel("Samples")
#     plt.show()
    
#     noise_range[i] = coordinates
#     true_range[i] = true_factor_scores


# plt.figure()
# plt.title("Individual Feature Values")
# plt.imshow(noise_range.reshape((50000,10)),aspect='auto',interpolation='none')
# plt.colorbar()
# plt.xlabel("Features")
# plt.ylabel("Samples")
# plt.show()

    
# t_coordinates = TSNE().fit_transform(noise_range.reshape((50000,10)))

# plt.figure()
# plt.title("TSNE Embedded Simulated Samples")
# plt.scatter(*t_coordinates.T)
# plt.show()

# for i in range(5):
#     plt.figure()
#     plt.title("TSNE Embedded Simulated Samples")
#     plt.scatter(*t_coordinates[i*10000:(i+1)*10000].T,c=np.arange(10000),cmap='rainbow')
#     plt.show()
    
    
# for i in range(5):
    
#     forest = lumberjack.fit(
#         noise_range[i],
#         trees=300,
#         ifs=8,
#         ofs=8,
#         braids=1,
#         ss=1000,
#         leaves=10,
#         depth=4,
#         norm='l1',
#         sfr=0,
#     #     reduce_input='true',
#     #     reduce_output='true',
#         reduce_input='false',
#         reduce_output='false',
#     )
    
#     forest.tsne_coordinates = t_coordinates[i*10000:(i+1)*10000]
#     forest.reset_split_clusters()
#     forest.interpret_splits(mode='additive_mean',metric='cosine',depth=4,pca=3,k=500,relatives=True)
#     forest.maximum_spanning_tree(mode='samples')
    
#     plt.figure()
#     plt.imshow(forest.factor_matrix(),aspect='auto',interpolation='none',cmap='bwr')
#     plt.show()



In [None]:
# Toy point set 1: ten points on a straight line, x in -5..4 and y in 0..9.
v1y = np.arange(10)
v1x = v1y - 5

# Toy point set 2: points on the unit circle, sampled every 0.1 radians over [0, 2*pi).
angles = np.arange(0, 2 * np.pi, .1)
v2x, v2y = np.cos(angles), np.sin(angles)

# v2x = np.arange(0,6)
# v2y = (np.arange(0,6) * - 1) + 5

In [None]:
# Overlay the two toy point sets (the line and the circle) in one figure.
plt.figure()
plt.plot(v1x,v1y)
plt.plot(v2x,v2y)
# plt.show must be called; a bare `plt.show` only references the function object.
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Concatenate both toy point sets into a single (n_points, 2) array of (x, y) pairs
# and express the points in the basis of their principal components.
all_x = np.hstack([v1x, v2x])
all_y = np.hstack([v1y, v2y])
points = np.array([all_x, all_y]).T

transformed = PCA().fit_transform(points)

In [None]:
# Scatter the toy points in principal-component coordinates.
plt.figure()
plt.title("Toy Points in PCA Coordinates")
plt.scatter(*transformed.T)
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.show()

# Only the last expression of a cell is displayed, so the shape goes at the end.
transformed.shape