## Local Effects

One of the more important aspects of random forest nodes, and by extension node clusters, is that they describe what we would call "Local Effects".

While a conventional linear regression might describe a linear relationship between the behavior of a feature and a target that is true across the entire dataset, a node in a random forest may just as easily be a child of another node, and thus only trained on a small part of the dataset. Therefore a relationship that it describes between a feature and a target may be true across the entire dataset, or it may only be true conditionally on the predictions made by the parents of the node.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc

import sys
# sys.path.append('/localscratch/bbrener1/rusty_forest_v3/src')
sys.path.append('../src')
import tree_reader as tr 
import lumberjack

import pickle 

# Directory holding the pickled datasets and the saved forest.
data_location = "../data/aging_brain/"

# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
# The context managers close each file handle as soon as its object has been loaded
# (the bare `pickle.load(open(...))` form left both handles open).
with open(data_location + "aging_brain_young.pickle", mode='rb') as young_file:
    young = pickle.load(young_file)
with open(data_location + "aging_brain_old.pickle", mode='rb') as old_file:
    old = pickle.load(old_file)

# Load the forest saved under the name 'full_clustering'.
forest = tr.Forest.load(data_location + 'full_clustering')
# Last expression of the cell, so the notebook displays it as the cell output.
forest.arguments

In [None]:
# forest.reset_split_clusters()

# forest.interpret_splits(
#     depth=8,
#     mode='additive_mean',
#     metric='cosine',
#     pca=100,
#     relatives=True,
#     k=50,
#     resolution=2,
# )

# print(len(forest.split_clusters))

In [None]:
# forest.maximum_spanning_tree(mode='samples')

In [None]:
# forest.html_tree_summary(n=10)

In [None]:
# forest.backup(data_location + "full_clustering")

In [None]:
# Goal: find local associations that differ so strongly from the global ones that a
# PCA-based analysis cannot recapture them. The first step is the PCA itself.

from sklearn.decomposition import PCA

young_matrix = young.X

# Fit 25 components, project the data onto them, then map the projection back
# into the original feature space.
model = PCA(n_components=25)
model.fit(young_matrix)
transformed = model.transform(young_matrix)
recovered = model.inverse_transform(transformed)

# "Null" residual: squared deviation of every entry from its column mean.
centered = young_matrix - np.mean(young_matrix, axis=0)
null_squared_residual = np.power(centered, 2)

# PCA residual: squared difference between the data and its reconstruction.
recovered_residual = young_matrix - recovered
recovered_squared_residual = np.power(recovered_residual, 2)

# Per-sample residual sums, and the PCA residual as a fraction of the null residual.
pca_recovered_per_sample = np.sum(recovered_squared_residual, axis=1)
null_per_sample = np.sum(null_squared_residual, axis=1)
pca_recovered_fraction_per_sample = pca_recovered_per_sample / null_per_sample

# Dataset-wide totals of both residuals.
null_total = np.sum(null_squared_residual)
recovered_total = np.sum(recovered_squared_residual)
print(null_total)
print(recovered_total)

print(f"Remaining variance:{recovered_total / null_total}")

In [None]:
# Draw one scatter plot per principal component on the forest's t-SNE coordinates,
# coloring every sample by its score on that component.
tsne_x, tsne_y = forest.tsne_coordinates.T

for component_number, component_scores in enumerate(transformed.T):
    # Symmetric color limits keep zero at the center of the diverging colormap.
    color_limit = np.abs(component_scores).max()
    plt.figure()
    plt.title(component_number)
    plt.scatter(
        tsne_x,
        tsne_y,
        c=component_scores,
        s=3,
        alpha=.4,
        cmap='bwr',
        vmin=-color_limit,
        vmax=color_limit,
    )
    plt.colorbar()
    plt.show()

In [None]:
# For every factor, list the feature pairs it reports as most locally correlated,
# together with each pair's (local, global) correlation values.

# Neither the feature names nor the global correlations depend on the factor,
# so fetch them once instead of once per loop iteration.
features = forest.output_features
global_correlations = forest.global_correlations()

banner = "====================================="

for factor in forest.split_clusters:
    print(banner)
    print(factor.name())
    print(banner)

    fi_pairs = factor.most_local_correlations()
    local_correlations = factor.local_correlations()

    f_names = [(features[i], features[j]) for (i, j) in fi_pairs]
    discrepancy = [(local_correlations[i, j], global_correlations[i, j]) for (i, j) in fi_pairs]

    print(f_names)
    print(discrepancy)

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram

# Collect the most-local feature pair(s) reported by every factor (n=1 per factor).
interesting_pairs = []

for factor in forest.split_clusters:
    interesting_pairs.extend(factor.most_local_correlations(n=1))

# Distinct features appearing in any pair, plus a feature -> position lookup so the
# nested loop below does not rescan the list for every table entry.
uniques = list({feature for pair in interesting_pairs for feature in pair})
unique_position = {feature: position for position, feature in enumerate(uniques)}

# Rows: interesting pairs. Columns: factors. Entry: the pair's local correlation in that factor.
factor_correlation_table = np.zeros((len(interesting_pairs), len(forest.split_clusters)))

for i, factor in enumerate(forest.split_clusters):
    # Assumes the returned matrix is indexed by position within `indices` — TODO confirm.
    local_correlations = factor.local_correlations(indices=uniques)
    for j, (f1, f2) in enumerate(interesting_pairs):
        factor_correlation_table[j, i] = local_correlations[unique_position[f1], unique_position[f2]]

plt.figure()
plt.imshow(factor_correlation_table, interpolation='none', aspect='auto', cmap='bwr', vmin=-1, vmax=1)
plt.colorbar()
plt.show()

# Leaf order of an average-linkage, cosine-distance clustering of the table.
# NOTE(review): linkage() clusters the ROWS of its input (the pairs), while the order is
# applied to the COLUMNS (factors) below. The two only line up when the table is square;
# if clustering factors was intended, pass `factor_correlation_table.T` here — confirm.
factor_agglomeration = dendrogram(
    linkage(factor_correlation_table, metric='cosine', method='average'),
    no_plot=True,
)['leaves']

# Same heatmap with the columns reordered by the clustering
# (equivalent to `.T[factor_agglomeration].T`; the original line had a stray `9` and did not parse).
plt.figure()
plt.imshow(factor_correlation_table[:, factor_agglomeration], interpolation='none', aspect='auto', cmap='bwr', vmin=-1, vmax=1)
plt.colorbar()
plt.show()

# Row number -> feature pair, for reading the heatmaps.
print(list(enumerate(interesting_pairs)))

In [None]:
# Show the feature names behind the two indices of interest. A bare expression is only
# displayed when it is the last statement of a cell, so print explicitly.
for feature_index in (717, 1639):
    print(feature_index, forest.output_features[feature_index])

# Local correlation matrix of the pair within two different factors.
print(forest.split_clusters[23].local_correlations(indices=[717,1639]))
print(forest.split_clusters[20].local_correlations(indices=[717,1639]))

# Author's note: cluster 23, Rarres2 (717), Meg3 (1639) — the printout above confirms the names.

In [None]:
from scipy.stats import linregress

# Global ("naive") linear fit between two features across all samples.
f1 = "Tmem119"
f2 = "Cd74"

f1_index = forest.truth_dictionary.feature_dictionary[f1]
f2_index = forest.truth_dictionary.feature_dictionary[f2]

f1_values = forest.output[:, f1_index]
f2_values = forest.output[:, f2_index]

# The third value returned by linregress is the correlation coefficient r, not R²;
# square it so the "R2" legend entry reports what it claims.
slope, intercept, r_fit, _, _ = linregress(f1_values, f2_values)
r_squared = r_fit ** 2

# x positions at which the fitted line is drawn.
fit_x = np.arange(7)

plt.figure()
plt.title(f"Linear Fit, {f1}, {f2}, Naive")
plt.scatter(f1_values, f2_values, s=3)
plt.plot(fit_x, intercept + (fit_x * slope), c='red', label=f"Slope:{np.around(slope,3)},R2:{np.around(r_squared,3)}")
plt.legend()
plt.xlabel(f"{f1}")
plt.ylabel(f"{f2}")
plt.show()

In [None]:
from scipy.stats import linregress

# Linear fit between two features, restricted to the samples selected by one factor.
factor = forest.split_clusters[34]

# Remnant of a loop over all factors, kept for reference:
# for factor in forest.split_clusters[1:]:

# Fetch the scores once; they feed both the mask and the histogram.
sister_scores = factor.sister_scores()

# Keep samples whose score magnitude exceeds .2. The absolute value must be taken
# BEFORE thresholding: the previous `np.abs(scores > .2)` applied abs to the boolean
# result, which is a no-op and silently dropped samples with scores below -.2.
factor_mask = np.abs(sister_scores) > .2

# if np.sum(factor_mask.astype(dtype=int)) < 2:
#     continue

plt.figure()
plt.hist(sister_scores, bins=50)
plt.show()

f1 = "Tmem119"
f2 = "Cd74"

f1_index = forest.truth_dictionary.feature_dictionary[f1]
f2_index = forest.truth_dictionary.feature_dictionary[f2]

f1_values = forest.output[:, f1_index][factor_mask]
f2_values = forest.output[:, f2_index][factor_mask]

# The third value returned by linregress is the correlation coefficient r, not R²;
# square it so the "R2" legend entry reports what it claims.
slope, intercept, r_fit, _, _ = linregress(f1_values, f2_values)
r_squared = r_fit ** 2

# x positions at which the fitted line is drawn.
fit_x = np.arange(7)

plt.figure()
plt.title(f"Linear Fit, {f1}, {f2}, Factor {factor.name()}, Filtered")
plt.scatter(f1_values, f2_values, s=3)
plt.plot(fit_x, intercept + (fit_x * slope), c='red', label=f"Slope:{np.around(slope,3)},R2:{np.around(r_squared,3)}")
plt.xlabel(f"{f1}")
plt.ylabel(f"{f2}")
plt.legend()
plt.show()



In [None]:
# Imported here as well so the cell runs without relying on an earlier cell.
from scipy.stats import linregress

# Global ("naive") linear fit between two features across all samples.
f1 = "Cdk1"
f2 = "Actg1"

f1_index = forest.truth_dictionary.feature_dictionary[f1]
f2_index = forest.truth_dictionary.feature_dictionary[f2]

f1_values = forest.output[:, f1_index]
f2_values = forest.output[:, f2_index]

# The third value returned by linregress is the correlation coefficient r, not R²;
# square it so the "R2" legend entry reports what it claims.
slope, intercept, r_fit, _, _ = linregress(f1_values, f2_values)
r_squared = r_fit ** 2

# x positions at which the fitted line is drawn.
fit_x = np.arange(7)

plt.figure()
plt.title(f"Linear Fit, {f1}, {f2}, Naive")
plt.scatter(f1_values, f2_values, s=3)
plt.plot(fit_x, intercept + (fit_x * slope), c='red', label=f"Slope:{np.around(slope,3)},R2:{np.around(r_squared,3)}")
plt.legend()
plt.xlabel(f"{f1}")
plt.ylabel(f"{f2}")
plt.show()

In [None]:

# Compare how strongly each fitted principal component weights two features.
feature_lookup = forest.truth_dictionary.feature_dictionary

f1, f2 = "Tmem119", "Cd74"
f1_index, f2_index = feature_lookup[f1], feature_lookup[f2]

# Weight of each feature in every component (one entry per component).
f1_weights = model.components_[:, f1_index]
f2_weights = model.components_[:, f2_index]

for component_number, (f1_weight, f2_weight) in enumerate(zip(f1_weights, f2_weights)):
    print(f"{component_number}: {f1}:{f1_weight},{f2}:{f2_weight}")

# Reference line through the origin with slope -.55, drawn between x = .2 and x = -.2.
reference_x = [.2, -.2]
reference_y = np.array([-.2, .2]) * .55

plt.figure()
plt.title(f"PC Weights for {f1} and {f2}")
plt.scatter(f1_weights, f2_weights)
plt.plot(reference_x, reference_y, color='red', label="Slope of -.55")
plt.legend()
plt.xlabel(f1)
plt.ylabel(f2)
plt.show()
    

In [None]:
# Run scanpy's rank_genes_groups on the `young` dataset.
# NOTE(review): `grouby` is not defined in any visible cell, so this call raises NameError
# unless it is defined elsewhere. It looks like a typo for the `groupby` argument, which
# presumably should name the grouping to compare — confirm the intended key and pass it here.
sc.tl.rank_genes_groups(young,grouby)