# Graph Theoretical Analysis

In [None]:
import random
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
import SigProc
import analysis_utils as au
from scipy import stats
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

In [None]:
# Path of the raw EPM trace CSV for mouse drd87 (read without a header row)
TRACE_RAW_DRD87 = "/Users/saveliyyusufov/Hen_Lab/Mice/drd87_experiments/Raw_EPM_drd87.csv"
data = pd.read_csv(TRACE_RAW_DRD87, header=None)

# The transient detector hands back three dataframes, unpacked in this order.
# NOTE(review): the meaning of the four numeric arguments is defined in SigProc - confirm there.
z_scored_dataframe, AUC_dataframe, cell_transients_dataframe = SigProc.detect_ca_transients_mossy(
    data, 2, 0.5, 0.2, 10
)

# Relabel the columns 1..N (presumably replacing "neuron_x" names with "x") so
# that the short labels fit inside the nodes of the graphs drawn later
neuron_count = len(cell_transients_dataframe.columns)
cell_transients_dataframe.columns = list(range(1, neuron_count + 1))

In [None]:
def create_graph(dataframe, correlation_coeff=0.3):
    """Build an undirected, weighted graph linking correlated columns.

    args:
        dataframe: every column label becomes a node of the graph.
        correlation_coeff: threshold forwarded to au.find_correlated_pairs.
            Defaults to 0.3, the value that used to be hard-coded here.

    returns:
        An nx.Graph with one node per column and one edge per entry of the
        mapping returned by au.find_correlated_pairs. Each edge stores that
        entry's value, rounded to 3 decimals, in its 'weight' attribute.
    """
    G = nx.Graph()
    G.add_nodes_from(dataframe.columns)
    corr_pairs = au.find_correlated_pairs(dataframe, correlation_coeff=correlation_coeff)

    # Each key is indexed as (first node, second node); presumably a pair of
    # column labels - confirm against analysis_utils.
    G.add_weighted_edges_from(
        (pair[0], pair[1], round(coeff, 3)) for pair, coeff in corr_pairs.items()
    )

    return G

In [None]:
def create_random_graph(dataframe, correlation_coeff=0.3):
    """Build a random null-model graph matched to the correlation graph.

    The graph has the same nodes as the correlation graph (one per column)
    and as many edges as there are distinct correlated pairs, but the edges
    join node pairs chosen uniformly at random.

    args:
        dataframe: every column label becomes a node of the graph.
        correlation_coeff: threshold forwarded to au.find_correlated_pairs.
            Defaults to 0.3, the value that used to be hard-coded here.

    returns:
        An unweighted nx.Graph. Edges are sampled without replacement from
        all unordered pairs of distinct nodes, so the graph contains no
        self-loops and no duplicate edges, and its edge count is exact.
    """
    nodes = list(dataframe.columns)
    G = nx.Graph()
    G.add_nodes_from(nodes)
    corr_pairs = au.find_correlated_pairs(dataframe, correlation_coeff=correlation_coeff)

    # Count distinct undirected pairs, so that a mapping listing both (a, b)
    # and (b, a) does not double the number of random edges.
    # NOTE(review): whether the mapping really holds both orderings is not
    # visible from here - confirm against analysis_utils.
    target_edges = len({frozenset(pair) for pair in corr_pairs if pair[0] != pair[1]})

    # Every possible undirected edge between two different nodes
    candidate_edges = [(u, v) for i, u in enumerate(nodes) for v in nodes[i + 1:]]
    target_edges = min(target_edges, len(candidate_edges))

    if target_edges:
        # Sampling indices without replacement guarantees distinct edges;
        # np.random is kept as the randomness source, as before.
        chosen = np.random.choice(len(candidate_edges), size=target_edges, replace=False)
        G.add_edges_from(candidate_edges[k] for k in chosen)

    return G

In [None]:
def plot_graph(G):
    """Draw G with a spring layout, colouring and labelling edges by weight.

    args:
        G: a networkx graph. The 'weight' edge attribute drives the layout,
           the edge colours (YlGnBu colour map) and the edge labels.

    returns:
        None; the figure is displayed with plt.show().
    """
    # positions for all nodes
    pos = nx.spring_layout(G, weight='weight')

    plt.figure(figsize=(35, 35))

    # nodes
    nx.draw_networkx_nodes(G, pos, node_size=1000, node_color='lightblue')

    # Edge -> weight mapping, used for both the colours and the edge labels
    edge_weights = nx.get_edge_attributes(G, 'weight')

    # A graph without weighted edges has nothing to colour or label; skipping
    # avoids failing on an empty mapping. Edges that carry no 'weight'
    # attribute are not drawn.
    if edge_weights:
        # The edge list and the colours come from the same mapping, so they
        # are guaranteed to line up one-to-one.
        nx.draw_networkx_edges(
            G,
            pos,
            edgelist=list(edge_weights.keys()),
            width=3.0,
            edge_color=list(edge_weights.values()),
            edge_cmap=plt.cm.YlGnBu,
        )
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_weights)

    # Node labels; edge labels are handled above and are not an argument of
    # draw_networkx_labels.
    nx.draw_networkx_labels(G, pos, font_size=15)

    plt.axis('off')
    plt.show()

In [None]:
def plot_random_graph(random_graph):
    """Draw random_graph with a spring layout and thin, uncoloured edges.

    args:
        random_graph: a networkx graph. Any 'weight' edge attributes present
            influence the layout and are printed as edge labels.

    returns:
        None; the figure is displayed with plt.show().
    """
    # positions for all nodes
    pos = nx.spring_layout(random_graph, weight='weight')

    plt.figure(figsize=(15, 15))

    # nodes
    nx.draw_networkx_nodes(random_graph, pos, node_size=700, node_color='lightblue')

    # edges
    nx.draw_networkx_edges(random_graph, pos, width=1.0)

    # Edge labels; the mapping is empty when no edge carries a 'weight'
    labels = nx.get_edge_attributes(random_graph, 'weight')
    nx.draw_networkx_edge_labels(random_graph, pos, edge_labels=labels)

    # Node labels; edge labels are handled above and are not an argument of
    # draw_networkx_labels.
    nx.draw_networkx_labels(random_graph, pos, font_size=15)

    plt.axis('off')
    plt.show()

In [None]:
from networkx.algorithms.approximation import clique


def compute_network_measures(graph):
    """Gather several network measures of a graph into one dictionary.

    args:
        graph: the networkx graph to summarise.

    returns:
        A dict with these keys:
            "assortativity": nx.degree_assortativity_coefficient(graph)
            "mean betweenness centrality": compute_mean_betweenness_centrality(graph)
            "max clique size": length of the clique returned by the
                approximation routine clique.max_clique(graph)
            "clustering coefficient": the result of nx.clustering(graph)
    """
    # NOTE(review): nx.clustering is called without a node argument, which
    # presumably yields a per-node mapping rather than one number - confirm
    # that this is what callers expect.
    # TODO: "mean clique size" and "mean path length" are not computed yet.
    return {
        "assortativity": nx.degree_assortativity_coefficient(graph),
        "mean betweenness centrality": compute_mean_betweenness_centrality(graph),
        "max clique size": len(clique.max_clique(graph)),
        "clustering coefficient": nx.clustering(graph),
    }

In [None]:
def compute_mean_betweenness_centrality(graph):
    """Return the mean of the betweenness centrality values of all nodes.

    args:
        graph: the networkx graph to analyse.

    returns:
        np.mean over the values of nx.betweenness_centrality(graph).
    """
    per_node_scores = list(nx.betweenness_centrality(graph).values())
    return np.mean(per_node_scores)

## Now, we plot an undirected graph of the network of cells that were imaged for a given mouse
- Edges are added between pairs of neurons that had a correlation coefficient $\ge 0.3$
    - Each edge is weighted by the correlation coefficient corresponding to the two neurons it connects
    - The correlation coefficient between each corresponding pair of neurons is drawn on the edge between them
    - The color of each edge is determined by the correlation between the pair of neurons: 
        - yellow - correlated
        - green - strongly correlated 
        - blue - very strongly correlated
- Isolated nodes are also plotted
- No clustering algorithm was applied to this network of nodes... the graph drawing algorithm that was utilized keeps corresponding nodes clustered together by accounting for the weights of the edges
- Note that the clusters of nodes strongly correspond to the clusters created by the Seaborn library's cluster map data visualization

In [None]:
# Build the graph from the full cell-transients dataframe, then draw it
total_graph = create_graph(cell_transients_dataframe)
plot_graph(total_graph)

## The degree of a node is the number of edges that link it to the rest of the network. According to Bullmore et al. 2009, node degree is the most fundamental network measure, and most other measures are ultimately linked to node degree. 
### So, we output a dictionary that contains each node (neuron) in the network and its corresponding degree
- For the sake of curiosity, we retrieve the name of the node that has the greatest degree in this particular network
    - As we can see, in the case of the dataset for drd87, the degree of node (neuron) $29$ is $11$, and this is the largest degree of any node for this particular network

In [None]:
# Node -> degree mapping, built once; the node with the largest degree is shown
total_graph_degrees = dict(total_graph.degree())
max(total_graph_degrees, key=total_graph_degrees.get)

## A simple bar plot that depicts the degree of each node (neuron) in the network
- Using this plot, we can see that node $29$ does indeed have the highest degree

In [None]:
# One bar per node, bar height = degree of that node
sns.set(rc={"figure.figsize": (18, 8)})
degree_by_node = dict(total_graph.degree())
sns.barplot(x=list(degree_by_node.keys()), y=list(degree_by_node.values()));

## According to Bullmore et al. 2009, in random networks, all connections are equally probable, resulting in a Gaussian degree distribution. In complex networks, the degree distributions are non-Gaussian, and often have a long tail towards high degrees. 
### So, we plot the distribution of the degrees of all the nodes (neurons) in our network

In [None]:
# Distribution of node degrees, with a normal distribution fitted on top
# NOTE(review): distplot is reported as deprecated in recent seaborn releases - confirm
# against the installed version before upgrading.
sns.set(rc={"figure.figsize": (14, 4)})
total_graph_degree_values = list(dict(total_graph.degree()).values())
sns.distplot(total_graph_degree_values, color='m', fit=stats.norm);

In [None]:
# Betweenness centrality of every node; the node with the highest score is shown
total_graph_centrality = nx.betweenness_centrality(total_graph)
max(total_graph_centrality.items(), key=lambda node_score: node_score[1])[0]

## Using the [Betweenness centrality](https://en.wikipedia.org/wiki/Betweenness_centrality) algorithm, we find that node (neuron) 33 has the highest centrality in this network
### Hubs are nodes with high degree, or high centrality. The centrality of a node measures how many of the shortest paths between all other node pairs in the network pass through it. A node with high centrality is thus crucial to efficient communication (Bullmore et al. 2009)
### Going off of this, we can assume that node (neuron) 33 is a hub in this particular network.

In [None]:
# One bar per node, bar height = betweenness centrality of that node
sns.set(rc={"figure.figsize": (18, 8)})
centrality_nodes = list(total_graph_centrality.keys())
centrality_scores = list(total_graph_centrality.values())
sns.barplot(x=centrality_nodes, y=centrality_scores);

## In accordance with Bullmore et al. 2009, our network measures must be compared with the (null) distribution of equivalent parameters estimated in random networks containing the same number of nodes and connections. 
> "Statistical testing of network parameters may best be conducted by permutation- or resampling-based methods of non-parametric inference given the lack of statistical theory concerning the distribution of most network metrics." (Bullmore et al. 2009)

## So, we begin by plotting an undirected graph of the $69$ nodes (neurons) from our dataset and we draw $184$ edges between $92$ (the amount of correlated neurons in our dataset) randomly selected pairs of nodes.

In [None]:
# Build the random comparison graph from the same dataframe, then draw it
random_graph = create_random_graph(cell_transients_dataframe)
plot_random_graph(random_graph)

## We quickly plot a degree distribution plot for this random network to see if it is Gaussian 
- recall that a random network should have Gaussian degree distribution

In [None]:
# Distribution of node degrees in the random graph, with a fitted normal curve
sns.set(rc={"figure.figsize": (14, 4)})
random_graph_degree_values = list(dict(random_graph.degree()).values())
sns.distplot(random_graph_degree_values, color='m', fit=stats.norm);

## We compute the betweenness centrality of this random network to see whether the centrality measure of the complex network is actually significant.

In [None]:
# Betweenness centrality of every node in the random graph; the top node is shown
random_graph_centrality = nx.betweenness_centrality(random_graph)
max(random_graph_centrality.items(), key=lambda node_score: node_score[1])[0]

In [None]:
# Column names applied to the header-less behavior CSV
# (taken from the MossyEPM MATLAB struct)
behavior_column_names = ['Trial_time', 'Recording_time', 'X_center', 'Y_center', 'Area', 'Areachange',
                         'Elongation', 'Distance_moved', 'Velocity', 'Arena_centerpoint',
                         'Open1_centerpoint', 'Open2_centerpoint',
                         'Closed1_centerpoint', 'Closed2_centerpoint',
                         'OpenArms_centerpoint', 'ClosedArms_centerpoint', 'Result_1']

activity_df = pd.read_csv('/Users/saveliyyusufov/Hen_Lab/Mice/drd87_experiments/activity_drd87.csv', header=None)
behavior_df = pd.read_csv('/Users/saveliyyusufov/Hen_Lab/Mice/drd87_experiments/behavior_drd87.csv', header=None)

# Keep only every ROW_MULTIPLE-th row (rows 0, 3, 6, ...) in order to downsample
# the behavior Dataframe from 30fps -> 10fps, and renumber the rows from 0
ROW_MULTIPLE = 3
behavior_df = behavior_df.iloc[::ROW_MULTIPLE].reset_index(drop=True)

# For the activity Dataframe, we change column names to corresponding neuron names
activity_df.columns = ['neuron' + str(i) for i in range(1, len(activity_df.columns) + 1)]

# Change column names to the behavior column names
behavior_df.columns = behavior_column_names

# Make the behavior Dataframe and the activity Dataframe have the same amount of rows
# by trimming the longer one. The explicit copies make both frames independent
# objects, so adding a column below does not write into a slice of another frame.
shared_row_count = min(len(behavior_df.index), len(activity_df.index))
behavior_df = behavior_df.iloc[:shared_row_count].copy()
activity_df = activity_df.iloc[:shared_row_count].copy()

# Define running frames
VELOCITY_CUTOFF = 4

# Adds column to the end of the behavior Dataframe: 1 where the velocity is strictly
# greater than VELOCITY_CUTOFF, 0 otherwise (a velocity equal to the cutoff gives 0)
behavior_df['Running_frames'] = (behavior_df['Velocity'] > VELOCITY_CUTOFF).astype('int64')

AUC_dataframe.columns = ['neuron' + str(i) for i in range(1, len(AUC_dataframe.columns) + 1)]

# Place the cell-transient columns and the behavior columns side by side, aligned on row index
result_dataframe = pd.concat([cell_transients_dataframe, behavior_df], axis=1)

## We plot the network of neurons for all of the time that the given mouse was in the open arms

In [None]:
# Using the result_dataframe, we get the indices of the rows whose
# OpenArms_centerpoint value is non-zero
in_open_arms = result_dataframe["OpenArms_centerpoint"] != 0
indices = result_dataframe.loc[in_open_arms].index

# Build and draw the graph from only those rows of the cell-transients dataframe
open_arms_transients = cell_transients_dataframe.iloc[indices]
open_arms_graph = create_graph(open_arms_transients)
plot_graph(open_arms_graph)

In [None]:
# Distribution of node degrees in the open-arms graph, with a fitted normal curve
sns.set(rc={"figure.figsize": (14, 4)})
open_arms_degree_values = list(dict(open_arms_graph.degree()).values())
sns.distplot(open_arms_degree_values, color='m', fit=stats.norm);

In [None]:
# Node -> degree mapping, built once; the node with the largest degree is shown
open_arms_degrees = dict(open_arms_graph.degree())
max(open_arms_degrees, key=open_arms_degrees.get)

In [None]:
# Betweenness centrality of every node in the open-arms graph; the top node is shown
open_arms_graph_centrality = nx.betweenness_centrality(open_arms_graph)
max(open_arms_graph_centrality.items(), key=lambda node_score: node_score[1])[0]