# Data Analysis Script for WILD scent marks without father

## Overview
This Python script analyzes cFos data from experiments on WILD mice reared without father **exposed to scent marks**, comparing experimental groups through various metrics, including cell count and energy levels. It uses statistical analyses to identify significant brain areas and creates visualizations to interpret the results.

## Configuration Variables
- `root_directory`: Main directory for data.
- `experiment`: Name of the experiment.
- `experimental_group`: Experimental group.
- `data_directory`: Data directory.
- `subjects`: List of subjects.
- `threshold`: Thresholds for data analysis.

## Analysis and Visualizations

1. **Loading and Preprocessing Data**
   - Loads volumes and data for each subject.
   - Removes areas with null cell counts or no volume.

2. **Dividing Subjects into Experimental Groups**
   - Divides subjects into groups: CONTROL, FATHER SCENT MARKS EXPOSURE, UNFAM SCENT MARKS EXPOSURE.

3. **Cell Count Analysis**
   - Calculates cell count per area and group.
   - Performs statistical tests to compare groups.
   - Computes and visualizes the cross-correlation matrix for each group.

4. **Energy Analysis**
   - Calculates and compares energy levels per area and group.
   - Performs statistical tests and visualizes cross-correlation matrices.

5. **Visualizations**
   - Creates bar plots and heatmaps to visualize significant differences between experimental groups.

## Additional Notes
- The code is designed to process data stored in CSV files and save them in `.npy` format for analysis.
- Also includes ANOVA tests to compare various metrics between experimental groups.



In [None]:
# !pip install odfpy
# !pip install seaborn
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import yaml
from yaml import Loader
import numpy as np
import analyze_cells_energy as ace
import utils
from scipy.stats import ttest_ind, mannwhitneyu
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Location of the data on disk: <root_directory><experiment>/<experimental_group>/
root_directory = '/home/stella/Documents/Torino/projects/'
experiment = 'SexualImprinting'
experimental_group = 'WILD_ScentMarks_Exposure_wof'  # CHANGE IF NECESSARY
data_directory = '{}{}/{}/'.format(root_directory, experiment, experimental_group)

# Every sub-directory of the data directory is treated as one subject.
subjects = [
    entry
    for entry in os.listdir(data_directory)
    if os.path.isdir(os.path.join(data_directory, entry))
]

# Threshold handed to load_subject when reading each subject's cells file.
threshold = 2500  # CHANGE IF NECESSARY

In [None]:
# load query file where we added volumes for each area
# The resulting table is passed as vol=volumes to the per-subject computation
# and is later indexed by its 'safe_name' and 'acronym' columns.
volumes = ace.clean_volumes_database()

In [1]:
def load_subject(subject, threshold, data_directory=None):
    """Load and reformat the cells table of one subject.

    Reads ``<data_directory>/<subject>/cells_<threshold>.csv`` and passes the
    resulting DataFrame through ``ace.reformat_df_mouse``.

    Parameters
    ----------
    subject : str
        Name of the subject's sub-directory.
    threshold : int or str
        Threshold identifying which cells file to read. It is converted with
        ``str`` so numeric values such as ``2500`` can be used directly.
    data_directory : str, optional
        Directory containing the subject folders. When omitted, the
        notebook-level ``data_directory`` variable is used.

    Returns
    -------
    Whatever ``ace.reformat_df_mouse`` returns for the loaded table.
    """
    if data_directory is None:
        # The parameter shadows the notebook-level variable of the same name,
        # so the global has to be fetched explicitly.
        data_directory = globals()['data_directory']
    # NOTE(review): the file name pattern 'cells_<threshold>.csv' was
    # reconstructed from a malformed string literal -- confirm against the
    # files on disk.
    csv_path = os.path.join(data_directory, subject,
                            'cells_' + str(threshold) + '.csv')
    df_mouse = pd.read_csv(csv_path)
    df_mouse = ace.reformat_df_mouse(df=df_mouse)
    return df_mouse

In [None]:
def find_significant_areas(dictionary, experimental_groups, value, test='mannwhitneyu', alpha=0.05):
    """List, for every pairwise group comparison, the areas with p < alpha.

    Parameters
    ----------
    dictionary : dict
        Per-subject results, forwarded to ``ace.calculate_value_across_groups``
        as ``dict_results_across_mice``.
    experimental_groups : dict
        Experimental groups; its keys are forwarded to
        ``ace.test_across_groups`` as ``groups``.
    value : str
        Name of the quantity to compare (forwarded unchanged).
    test : str
        Name of the statistical test (forwarded unchanged).
    alpha : float
        Significance threshold applied to each p-value column.

    Returns
    -------
    pandas.DataFrame
        One column per p-value column of the test table (with the 'pval_'
        prefix removed from its name). Each column holds the area names with
        p < alpha, ordered by ascending p-value; columns of different length
        are padded with NaN by the column-wise concatenation.
    """
    per_group_tables = ace.calculate_value_across_groups(
        experimental_groups=experimental_groups,
        dict_results_across_mice=dictionary,
        value=value)
    pvalue_table = ace.test_across_groups(
        per_group_tables,
        test=test,
        groups=list(experimental_groups.keys()))

    # every column except 'area' is treated as a p-value column
    pvalue_columns = pvalue_table.columns[pvalue_table.columns != 'area']

    significant = pd.DataFrame()
    for pvalue_column in pvalue_columns:
        ranked = pvalue_table.sort_values(by=pvalue_column)[['area', pvalue_column]]
        below_alpha = ranked.loc[ranked[pvalue_column] < alpha, 'area']
        significant = pd.concat(
            [significant, below_alpha.reset_index(drop=True)], axis=1)

    significant.columns = [name.replace('pval_', '') for name in pvalue_columns]
    return significant

# Statistical analysis

In [None]:
# Run the per-level cells/energy computation for every subject and cache the
# resulting {subject: result} dictionary on disk so that it can be reloaded
# without recomputing.
results_path = 'dict_results/newvolumes/dict_results_across_mice_WILD_scent_marks_wof.npy'

# load_subject(subject, threshold) reads from the notebook-level
# data_directory itself; passing data_directory as an extra keyword does not
# match its signature and raises TypeError.
dict_results_across_mice = {
    subject: ace.calculate_cells_energy_per_level(
        df_mouse=load_subject(subject=subject, threshold=threshold),
        vol=volumes,
        level=8)
    for subject in subjects
}

# np.save does not create missing directories.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
# The dict is written as a pickled 0-d object array; reload it with
# np.load(..., allow_pickle=True).item().
np.save(results_path, dict_results_across_mice)

In [None]:
# Reload the cached results. np.save stored the dict as a pickled 0-d object
# array, hence allow_pickle=True and .item() to get the dict back.
# Unpickling can execute arbitrary code: only load files produced by this notebook.
dict_results_across_mice = np.load('dict_results/newvolumes/dict_results_across_mice_WILD_scent_marks_wof.npy', 
                                   allow_pickle=True).item()

In [None]:
# Take the subject list from the loaded results (overrides the directory listing).
subjects = list(dict_results_across_mice.keys())

# Divide mice into experimental groups: FAM/UNFAM/CONTROL

In [None]:
# Split the subjects into experimental groups with the project helper.
# NOTE(review): later cells unpack three tables as control, fam, unfam -- confirm
# that the groups come back in that order.
experimental_groups = utils.divide_in_exp_groups(list_subjects=subjects)

# Calculate cell count across experimental groups

In [None]:
# One table per experimental group for the 'n_cells' value.
# Each table has an 'area' column, which the cells below use as index.
# NOTE(review): the unpacking assumes the order control, fam, unfam -- confirm.
df_control_cell_count, df_fam_cell_count, df_unfam_cell_count = \
ace.calculate_value_across_groups(experimental_groups=experimental_groups, 
                              dict_results_across_mice=dict_results_across_mice, 
                              value='n_cells')

In [None]:
# Display the control table.
df_control_cell_count

In [None]:
# FAM group: sum over all areas for each remaining column.
df_fam_cell_count.set_index('area').sum()

In [None]:
# UNFAM group: sum over all areas for each remaining column.
df_unfam_cell_count.set_index('area').sum()

In [None]:
# Arcuate hypothalamic nucleus: row of the control table.
df_control_cell_count.set_index('area').loc['Arcuate hypothalamic nucleus']

In [None]:
# Arcuate hypothalamic nucleus: row of the fam table.
df_fam_cell_count.set_index('area').loc['Arcuate hypothalamic nucleus']

In [None]:
# Arcuate hypothalamic nucleus: row of the unfam table.
df_unfam_cell_count.set_index('area').loc['Arcuate hypothalamic nucleus']

In [None]:
# Medial amygdalar nucleus: row of the control table.
df_control_cell_count.set_index('area').loc['Medial amygdalar nucleus']

In [None]:
# Medial amygdalar nucleus: row of the fam table.
df_fam_cell_count.set_index('area').loc['Medial amygdalar nucleus']

In [None]:
# Medial amygdalar nucleus: row of the unfam table.
df_unfam_cell_count.set_index('area').loc['Medial amygdalar nucleus']

In [None]:
# Basomedial amygdalar nucleus: row of the control table.
df_control_cell_count.set_index('area').loc['Basomedial amygdalar nucleus']

In [None]:
# Basomedial amygdalar nucleus: row of the fam table.
df_fam_cell_count.set_index('area').loc['Basomedial amygdalar nucleus']

In [None]:
# Basomedial amygdalar nucleus: row of the unfam table.
df_unfam_cell_count.set_index('area').loc['Basomedial amygdalar nucleus']

In [None]:
# Supramammillary nucleus: row of the control table.
df_control_cell_count.set_index('area').loc['Supramammillary nucleus']

In [None]:
# Supramammillary nucleus: row of the fam table.
df_fam_cell_count.set_index('area').loc['Supramammillary nucleus']

In [None]:
# Supramammillary nucleus: row of the unfam table.
df_unfam_cell_count.set_index('area').loc['Supramammillary nucleus']

## Calculate Mann-Whitney U tests across groups per area

In [None]:
# Compare the three groups per area with the Mann-Whitney U test
# (test='mannwhitneyu'). The result carries an 'area' column and the three
# pval_* columns used below.
df_ttest_ncells = ace.test_across_groups([df_control_cell_count,df_fam_cell_count,df_unfam_cell_count], 
                                         test='mannwhitneyu')

In [None]:
# drop the areas for which all three pairwise p-values are NaN
# NOTE(review): presumably these are the areas with zero cell count in every
# experimental group -- confirm against ace.test_across_groups
df_ttest_not_null_ncells = df_ttest_ncells.dropna(axis=0,
                                   how='all',
                                   subset=['pval_Control_vs_Fam',
                                          'pval_Control_vs_Unfam',
                                          'pval_Fam_vs_Unfam'])

In [None]:
# control vs fam: sort by ascending p-value and show the first 40 areas
df_ttest_not_null_ncells.sort_values(by='pval_Control_vs_Fam')[['area', 'pval_Control_vs_Fam']][0:40].set_index('area')

In [None]:
# control vs unfam: sort by ascending p-value and show the first 35 areas
df_ttest_not_null_ncells.sort_values(by='pval_Control_vs_Unfam')[['area', 'pval_Control_vs_Unfam']][0:35].set_index('area')

In [None]:
# fam vs unfam: sort by ascending p-value and show the first 20 areas
df_ttest_not_null_ncells.sort_values(by='pval_Fam_vs_Unfam')[['area', 'pval_Fam_vs_Unfam']][0:20].set_index('area')

## Calculate cross corr across groups per area on cell count

In [None]:
# Cross-correlation matrix of the control cell counts (project helper).
# NOTE: the next cell recomputes this matrix, so this cell is redundant.
corr_matrix_control_ncells = ace.cross_corr(df_control_cell_count)

In [None]:
# Cross-correlation matrices of the cell counts, one per group.
corr_matrix_control_ncells = ace.cross_corr(df_control_cell_count)
corr_matrix_fam_ncells = ace.cross_corr(df_fam_cell_count)
corr_matrix_unfam_ncells = ace.cross_corr(df_unfam_cell_count)

In [None]:
# NOTE: repeats the unfam computation of the previous cell.
corr_matrix_unfam_ncells = ace.cross_corr(df_unfam_cell_count)

## Plot cross corr matrix across groups per area on cell count

In [None]:
# Heatmap of the control cell-count correlation matrix.
# The colour scale is pinned to [-1, 1] with a diverging palette, so the
# three group heatmaps share the same scale.
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix_control_ncells,
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    vmin=-1.0, vmax=1.0,
    square=True, ax=ax)
ax.set_title('CONTROL')

In [None]:
# Heatmap of the fam cell-count correlation matrix (same colour scale).
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix_fam_ncells,
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    vmin=-1.0, vmax=1.0,
    square=True, ax=ax)
ax.set_title('FAM')

In [None]:
# Heatmap of the unfam cell-count correlation matrix (same colour scale).
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix_unfam_ncells,
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    vmin=-1.0, vmax=1.0,
    square=True, ax=ax)
ax.set_title('UNFAM')

# Calculate energy across experimental groups

In [None]:
# One table per experimental group for the 'energy' value.
# NOTE(review): the unpacking assumes the order control, fam, unfam -- confirm.
df_control_energy, df_fam_energy, df_unfam_energy = \
ace.calculate_value_across_groups(experimental_groups, dict_results_across_mice, value='energy')

## Mann-Whitney U test on energy across groups

In [None]:
# Compare the three groups per area on energy with the Mann-Whitney U test
# (test='mannwhitneyu').
df_ttest_energy = ace.test_across_groups([df_control_energy,df_fam_energy,df_unfam_energy],
                                        test='mannwhitneyu')

In [None]:
# drop the areas for which all three pairwise p-values are NaN
# NOTE(review): presumably areas without any signal in every experimental
# group -- confirm against ace.test_across_groups
df_ttest_not_null_energy = df_ttest_energy.dropna(axis=0,
                                   how='all',
                                   subset=['pval_Control_vs_Fam',
                                          'pval_Control_vs_Unfam',
                                          'pval_Fam_vs_Unfam'])

In [None]:
# control vs fam: sort by ascending p-value and show the first 20 areas
df_ttest_not_null_energy.sort_values(by='pval_Control_vs_Fam')[['area', 'pval_Control_vs_Fam']][0:20].set_index('area')

In [None]:
# control vs unfam: sort by ascending p-value and show the first 20 areas
df_ttest_not_null_energy.sort_values(by='pval_Control_vs_Unfam')[['area', 'pval_Control_vs_Unfam']][0:20].set_index('area')

In [None]:
# fam vs unfam: sort by ascending p-value and show the first 20 areas
df_ttest_not_null_energy.sort_values(by='pval_Fam_vs_Unfam')[['area', 'pval_Fam_vs_Unfam']][0:20].set_index('area')

In [None]:
# Cross-correlation matrix of the control energy table (project helper).
corr_matrix_control_energy = ace.cross_corr(df_control_energy)

In [None]:
# Cross-correlation matrix of the fam energy table.
corr_matrix_fam_energy = ace.cross_corr(df_fam_energy)

In [None]:
# Cross-correlation matrix of the unfam energy table.
corr_matrix_unfam_energy = ace.cross_corr(df_unfam_energy)

## Plot cross corr matrix across groups per area on energy

In [None]:
# Heatmap of the control energy correlation matrix.
# The colour scale is pinned to [-1, 1] with a diverging palette, so the
# three group heatmaps share the same scale.
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix_control_energy,
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    vmin=-1.0, vmax=1.0,
    square=True, ax=ax)
ax.set_title('CONTROL')

In [None]:
# Heatmap of the fam energy correlation matrix (same colour scale).
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix_fam_energy,
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    vmin=-1.0, vmax=1.0,
    square=True, ax=ax)
ax.set_title('FAM')

In [None]:
# Heatmap of the unfam energy correlation matrix (same colour scale).
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix_unfam_energy,
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    vmin=-1.0, vmax=1.0,
    square=True, ax=ax)
ax.set_title('UNFAM')

In [None]:
# Bar plot of the mean cell count per group, for every area whose p-value is
# below 0.05 in at least one of the three pairwise comparisons.
significant_areas = np.unique(np.hstack([
    df_ttest_not_null_ncells.loc[
        df_ttest_not_null_ncells[pval_column] < 0.05, 'area'].values
    for pval_column in ('pval_Control_vs_Fam',
                        'pval_Control_vs_Unfam',
                        'pval_Fam_vs_Unfam')]))

# Acronym of every significant area (first match in the volumes table).
acronym_by_area = {
    area: volumes[volumes['safe_name'] == area]['acronym'].values[0]
    for area in significant_areas}

# Order the areas by the position of their acronym in the volumes table.
# Area names and acronyms are reordered TOGETHER: the values below are looked
# up by area name, so reordering only the acronyms would attach each bar to
# the label of a different area.
atlas_acronyms = list(volumes['acronym'].values)
areas_ttest = np.array(
    sorted(significant_areas,
           key=lambda area: atlas_acronyms.index(acronym_by_area[area])),
    dtype=object)
areas_ttest_acronyms = [acronym_by_area[area] for area in areas_ttest]

# One long-format table per group: acronym, mean over the group's columns, label.
control = pd.DataFrame({
    'area': areas_ttest_acronyms,
    'number of cells': df_control_cell_count.set_index('area')
                                            .loc[list(areas_ttest)].mean(axis=1).values,
    'group': 'not exposed'})

fam = pd.DataFrame({
    'area': areas_ttest_acronyms,
    'number of cells': df_fam_cell_count.set_index('area')
                                        .loc[list(areas_ttest)].mean(axis=1).values,
    'group': 'exposed to father urine'})

unfam = pd.DataFrame({
    'area': areas_ttest_acronyms,
    'number of cells': df_unfam_cell_count.set_index('area')
                                          .loc[list(areas_ttest)].mean(axis=1).values,
    'group': 'exposed to unfam WILD urine'})

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# (The variable name is kept for compatibility with other cells.)
with_father = pd.concat([control, fam, unfam], ignore_index=True)

f, ax = plt.subplots(ncols=1, nrows=1, figsize=(15,3), sharey=True)
sns.barplot(
    data=with_father,
    x="area", y="number of cells", hue="group", palette="Accent", ax=ax)

In [None]:
# Bar plot of the mean energy per group, for every area whose p-value is
# below 0.05 in at least one of the three pairwise comparisons.
significant_areas = np.unique(np.hstack([
    df_ttest_not_null_energy.loc[
        df_ttest_not_null_energy[pval_column] < 0.05, 'area'].values
    for pval_column in ('pval_Control_vs_Fam',
                        'pval_Control_vs_Unfam',
                        'pval_Fam_vs_Unfam')]))

# Acronym of every significant area (first match in the volumes table).
acronym_by_area = {
    area: volumes[volumes['safe_name'] == area]['acronym'].values[0]
    for area in significant_areas}

# Order the areas by the position of their acronym in the volumes table.
# Area names and acronyms are reordered TOGETHER: the values below are looked
# up by area name, so reordering only the acronyms would attach each bar to
# the label of a different area.
atlas_acronyms = list(volumes['acronym'].values)
areas_ttest = np.array(
    sorted(significant_areas,
           key=lambda area: atlas_acronyms.index(acronym_by_area[area])),
    dtype=object)
areas_ttest_acronyms = [acronym_by_area[area] for area in areas_ttest]

# One long-format table per group: acronym, mean over the group's columns, label.
control = pd.DataFrame({
    'area': areas_ttest_acronyms,
    'energy': df_control_energy.set_index('area')
                               .loc[list(areas_ttest)].mean(axis=1).values,
    'group': 'not exposed'})

fam = pd.DataFrame({
    'area': areas_ttest_acronyms,
    'energy': df_fam_energy.set_index('area')
                           .loc[list(areas_ttest)].mean(axis=1).values,
    'group': 'exposed to father urine'})

unfam = pd.DataFrame({
    'area': areas_ttest_acronyms,
    'energy': df_unfam_energy.set_index('area')
                             .loc[list(areas_ttest)].mean(axis=1).values,
    'group': 'exposed to unfam WILD urine'})

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# (The variable name is kept for compatibility with other cells.)
with_father = pd.concat([control, fam, unfam], ignore_index=True)

f, ax = plt.subplots(ncols=1, nrows=1, figsize=(15,3), sharey=True)
sns.barplot(
    data=with_father,
    x="area", y="energy", hue="group", palette="Accent", ax=ax)