In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

from main import load_and_prepare_sessions
from analysis.performance_funcs import add_performance_container
from analysis.response_metrics import assign_responses
from processing.timepoint_analysis import aggregate_signals
from data.mouse import create_mice_dict
from data.data_loading import DataContainer
from tqdm.notebook import tqdm

from collections import defaultdict
import matplotlib.pyplot as plt

# Load the prepared baseline sessions (from the pickle cache) and drop
# sessions flagged as having a bad signal.
sessions = load_and_prepare_sessions(
    "../../Baseline",
    load_from_pickle=True,
    remove_bad_signal_sessions=True,
)

In [None]:
# Group the loaded sessions into a dict of mouse objects. Later cells iterate
# it with .values()/.items() and read `.sessions`, `.mouse_id`,
# `.metric_container` and `.response_metrics` from each value.
# NOTE(review): keys are presumably mouse ids -- confirm in data.mouse.
mice_dict = create_mice_dict(sessions)

In [None]:
# Give every mouse a performance container and point each of its sessions at
# that same container object.
# NOTE(review): assumes add_performance_container sets `mouse.metric_container`.
for mouse in mice_dict.values():
    add_performance_container(mouse)
    for session in mouse.sessions:
        session.metric_container = mouse.metric_container

# Compute the response metrics for all mice, then collect them per mouse id.
assign_responses(mice_dict.values())

mouse_responses = {}
for mouse in mice_dict.values():
    mouse_responses[mouse.mouse_id] = mouse.response_metrics

In [None]:
# Per-mouse performance metrics, keyed the same way as `mice_dict`.
performance_metrics = {
    mouse_id: mouse.metric_container.data
    for mouse_id, mouse in mice_dict.items()
}

In [None]:
import pandas as pd

In [None]:
# Empty frame whose rows are the keys of `mouse_responses` and whose columns
# are the keys of `performance_metrics`; both dicts are keyed per mouse in the
# cells above, so this is mouse x mouse rather than metric x metric.
# NOTE(review): no later cell visible here reads this frame -- confirm whether
# it is still needed or should be indexed by metric names instead.
correlation_matrix = pd.DataFrame(index=mouse_responses.keys(), columns=performance_metrics.keys())

In [None]:
# Debug dump: print the response-metric mapping of every mouse, one per line.
for d in mouse_responses.values():
    print(d)

In [None]:
import numpy as np
import scipy.stats

# Assuming mouse_responses and performance_metrics_2 are structured as described:
# mouse_responses = {mouse_id: {'response_metric_name1': value, ...}, ...}
# performance_metrics_2 = {mouse_id: {'performance_metric_name1': value, ...}, ...}

def calculate_correlations(mouse_responses, performance_metrics_2):
    """Pearson-correlate every response metric with every performance metric.

    For each (response metric, performance metric) pair, the values of all
    mice that have a finite value for *both* metrics are paired up and passed
    to ``scipy.stats.pearsonr``.

    Parameters
    ----------
    mouse_responses : dict
        ``{mouse_id: {response_metric_name: value, ...}, ...}``
    performance_metrics_2 : dict
        ``{mouse_id: {performance_metric_name: value, ...}, ...}``

    Returns
    -------
    dict
        ``{(response_metric, performance_metric): (corr, p_val)}``.
        A pair is only present when at least two mice contributed usable
        values; pairs with fewer samples are skipped because a correlation
        is undefined for them (``pearsonr`` raises ``ValueError``).
        Keys are emitted in first-seen order of the metric names, so the
        result order is reproducible between runs.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a
    # set of strings would give a different order on every interpreter run.
    response_metric_names = list(dict.fromkeys(
        name
        for metrics in mouse_responses.values()
        for name in metrics.keys()
    ))
    # Union over all mice instead of peeking at the first mouse only: this is
    # identical when the names are consistent, and does not raise
    # StopIteration when no performance data is available.
    performance_metric_names = list(dict.fromkeys(
        name
        for metrics in performance_metrics_2.values()
        for name in metrics.keys()
    ))

    correlation_results = {}

    for response_metric in response_metric_names:
        for performance_metric in performance_metric_names:
            clean_response_values = []
            clean_performance_values = []

            # Collect the paired values of every mouse that has both metrics.
            for mouse_id, response_metrics in mouse_responses.items():
                response_value = response_metrics.get(response_metric)
                performance_value = performance_metrics_2.get(mouse_id, {}).get(performance_metric)

                if response_value is None or performance_value is None:
                    continue
                # Drop the pair if either side is NaN or +/-inf.
                if not (np.isfinite(response_value) and np.isfinite(performance_value)):
                    continue

                clean_response_values.append(response_value)
                clean_performance_values.append(performance_value)

            # The check must look at the *cleaned* lists: pearsonr needs at
            # least two paired samples and raises ValueError otherwise.
            if len(clean_response_values) < 2:
                continue

            corr, p_val = scipy.stats.pearsonr(clean_response_values, clean_performance_values)
            correlation_results[(response_metric, performance_metric)] = (corr, p_val)

    return correlation_results

correlation_results = calculate_correlations(mouse_responses, performance_metrics)

# Report every metric pair with its coefficient and p-value.
# (To list significant pairs only, guard the print with `if p_val <= 0.05:`.)
for metric_pair, (corr_value, p_val) in correlation_results.items():
    print(
        f"Correlation between '{metric_pair[0]}' and '{metric_pair[1]}': {corr_value:.4f}, {p_val:.4f}"
    )


In [None]:
# One record per (response metric, performance metric) pair. Every value of
# `correlation_results` is a (corr, p_val) tuple, so the 'Correlation' column
# holds tuples rather than bare coefficients.
data_for_df = [
    {
        'Response Metric': response_metric,
        'Performance Metric': performance_metric,
        'Correlation': corr_value,
    }
    for (response_metric, performance_metric), corr_value in correlation_results.items()
]

df = pd.DataFrame(data_for_df)

# Wide table: response metrics as rows, performance metrics as columns.
# Pairs without a result stay NaN (they could be filled with
# `pivot_df.fillna(...)` if a placeholder value is preferred).
pivot_df = df.pivot(
    index='Response Metric',
    columns='Performance Metric',
    values='Correlation',
)

pivot_df

In [None]:
# Reshape the pivot table back to long format: one row per metric pair.
long_df = pivot_df.reset_index().melt(
    id_vars=['Response Metric'],
    var_name='Performance Metric',
    value_name='Correlation',
)

# Each 'Correlation' cell is a (corr, p_val) tuple. Metric pairs that have no
# entry in `correlation_results` come out of the pivot as NaN, which cannot be
# indexed, so those rows get NaN in both derived columns instead of raising.
long_df['Correlation_abs'] = long_df['Correlation'].apply(
    lambda cell: abs(cell[0]) if isinstance(cell, tuple) else float('nan')
)
long_df['P_value'] = long_df['Correlation'].apply(
    lambda cell: cell[1] if isinstance(cell, tuple) else float('nan')
)

# Most significant pairs first: sorted by p-value only, in ascending order
# (rows with a NaN p-value are placed last by sort_values).
sorted_df = long_df.sort_values(by=['P_value'], ascending=True)


In [None]:
# Display the metric pairs ordered by ascending p-value.
sorted_df

In [None]:
# Write the sorted table to CSV without the tuple column and the absolute
# correlation column; the frame's index is written as the first CSV column.
(
    sorted_df
    .drop(columns=['Correlation', 'Correlation_abs'])
    .to_csv('all_correlations_3.csv')
)