In [None]:
from scipy.stats import chi2_contingency
import pandas as pd

def normalize_df(df, column):
    """Return the share of rows of ``df`` taken by each distinct value of ``column``.

    The occurrences of every distinct value in ``df[column]`` are tallied with
    ``value_counts`` and divided by ``len(df)`` (the total number of rows).
    Note that ``value_counts`` skips missing values by default while ``len(df)``
    counts every row, so the shares need not add up to 1 if the column has NaNs.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one observation per row.
    column : str
        Label of the column whose values are tallied.

    Returns
    -------
    pandas.Series
        Indexed by distinct value; each entry is count / number of rows.
    """
    n_rows = len(df)
    return df[column].value_counts().div(n_rows)

def chi_square_test_syllables_normalized_per_df(pre_dfs, post_dfs, column='syllable_name', normalize=True):
    """Chi-square test of syllable usage between two groups of DataFrames.

    For every DataFrame the occurrences of each value in ``column`` are tallied.
    With ``normalize=True`` (the default, and the historical behaviour) each
    tally is divided by the number of rows of its DataFrame before the tallies
    are summed within a group; with ``normalize=False`` the raw counts are
    summed instead. The two group totals form a 2 x n_syllables table that is
    passed to ``scipy.stats.chi2_contingency``.

    Statistical caveat: Pearson's chi-square test is defined for observed
    frequencies (counts). With ``normalize=True`` the table holds summed
    proportions, whose scale depends on the number of DataFrames rather than on
    the number of observations, so the statistic and p-value cannot be read as
    a regular chi-square test result. Use ``normalize=False`` when a test on
    pooled counts is wanted.

    Parameters
    ----------
    pre_dfs, post_dfs : list of pandas.DataFrame
        DataFrames of the two groups; each must contain ``column``.
    column : str
        Column holding the syllable labels.
    normalize : bool, default True
        Divide each DataFrame's counts by its row count before summing.

    Returns
    -------
    chi2, p, dof, expected : as returned by ``chi2_contingency``
    combined_counts : pandas.DataFrame
        Indexed by syllable with columns ``'pre'`` and ``'post'``; syllables
        missing from one group are filled with 0.

    Raises
    ------
    ValueError
        If either group is empty.
    """
    if len(pre_dfs) == 0 or len(post_dfs) == 0:
        raise ValueError("pre_dfs and post_dfs must each contain at least one DataFrame.")

    def _group_totals(dfs):
        # One tally per DataFrame, aligned on syllable label and summed across
        # the group (sum skips the NaNs produced by the alignment).
        per_df = []
        for df in dfs:
            counts = df[column].value_counts()
            per_df.append(counts / len(df) if normalize else counts)
        return pd.concat(per_df, axis=1).sum(axis=1)

    # Building the frame from two Series aligns them on the union of syllables;
    # a syllable absent from one group shows up as NaN and is set to 0.
    combined_counts = pd.DataFrame({
        'pre': _group_totals(pre_dfs),
        'post': _group_totals(post_dfs),
    }).fillna(0)

    # Transposed so that rows are the groups and columns are the syllables.
    chi2, p, dof, expected = chi2_contingency(combined_counts.T)

    return chi2, p, dof, expected, combined_counts

# Example usage:
# pre_dfs / post_dfs are not defined in this cell; they must already exist in the kernel.
chi2, p, dof, expected, combined_counts = chi_square_test_syllables_normalized_per_df(pre_dfs, post_dfs)

print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")

# Create the DataFrame for expected frequencies
# (chi2_contingency was given combined_counts.T, so `expected` has the rows pre/post and
# one column per syllable; the column labels therefore come from combined_counts.index,
# not from combined_counts.columns, which is just ['pre', 'post'].)
#expected_df = pd.DataFrame(expected, index=['pre', 'post'], columns=combined_counts.index)

# Display the expected frequencies
#print(f"Expected Frequencies:\n{expected_df}")


Chi-Square Statistic: 6.27970389176415
P-value: 0.9999878331262626
Degrees of Freedom: 27


In [None]:
# Normalized syllable shares, one Series per DataFrame:
# lst1 follows the order of pre_dfs, lst2 the order of post_dfs.
# This cell has to be executed before any cell that reads lst1 / lst2.
lst1 = [normalize_df(df, 'syllable_name') for df in pre_dfs]
lst2 = [normalize_df(df, 'syllable_name') for df in post_dfs]

In [None]:
# Print the normalized shares of (up to) the first three pre/post DataFrames,
# each pre Series followed by the post Series at the same list position.
# Slicing instead of indexing with range(3) avoids an IndexError when fewer
# than three DataFrames are available.
for pre_normalized, post_normalized in zip(lst1[:3], lst2[:3]):
    print(pre_normalized)
    print('#'*35)
    print(post_normalized)
    print('#'*35)
    print('#'*35)

NameError: name 'lst1' is not defined

### Permutation test

In [None]:
import numpy as np

def permutation_test_syllables(pre_counts, post_counts, n_permutations=1000, random_state=None):
    """Two-sample permutation test on the difference of means.

    The observed statistic is ``mean(post_counts) - mean(pre_counts)``. The
    pooled values are shuffled ``n_permutations`` times; after each shuffle the
    first ``len(pre_counts)`` values play the role of "pre" and the rest of
    "post". The two-sided p-value is the fraction of shuffles whose absolute
    mean difference is at least as large as the observed one.

    Parameters
    ----------
    pre_counts, post_counts : array-like of numbers
        Values of the two groups (lists, NumPy arrays and pandas Series work).
        The inputs themselves are not modified.
    n_permutations : int, default 1000
        Number of random shuffles.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        global NumPy random stream is used, exactly as before this parameter
        existed, so ``np.random.seed(...)`` set by the caller still applies.

    Returns
    -------
    actual_diff : float
        Observed difference of means (post minus pre).
    p_value : float
        Two-sided Monte Carlo p-value.

    Raises
    ------
    ValueError
        If either input is empty or ``n_permutations`` is smaller than 1.
    """
    pre_counts = np.asarray(pre_counts)
    post_counts = np.asarray(post_counts)
    if pre_counts.size == 0 or post_counts.size == 0:
        raise ValueError("pre_counts and post_counts must both be non-empty.")
    if n_permutations < 1:
        raise ValueError("n_permutations must be a positive integer.")

    # The np.random module and a RandomState instance both expose shuffle().
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Calculate the actual difference in means
    actual_diff = post_counts.mean() - pre_counts.mean()

    # np.concatenate returns a new array, so shuffling it in place leaves the
    # caller's data untouched.
    combined = np.concatenate([pre_counts, post_counts])
    n_pre = len(pre_counts)
    perm_diffs = np.empty(n_permutations)

    for i in range(n_permutations):
        rng.shuffle(combined)
        perm_diffs[i] = combined[n_pre:].mean() - combined[:n_pre].mean()

    # Calculate p-value
    p_value = np.sum(np.abs(perm_diffs) >= np.abs(actual_diff)) / n_permutations

    return actual_diff, p_value

# Example usage:
# comparison_df is not created in this cell; it must already exist in the kernel and
# provide the columns 'pre_count' and 'post_count'.
# Only the first 10 rows are used -- NOTE(review): presumably the ten most frequent
# syllables; confirm how comparison_df is sorted.
pre_counts = comparison_df['pre_count'].values[:10]
post_counts = comparison_df['post_count'].values[:10]

# No seed is set here, so the p-value can differ between runs.
actual_diff, p_value = permutation_test_syllables(pre_counts, post_counts)

print(f"Actual Difference: {actual_diff}")
print(f"P-value: {p_value}")

Actual Difference: 20486.399999999998
P-value: 0.545


### Mann-Whitney U test

In [None]:
def compare_syllable_distributions(pre_dfs, post_dfs, column='syllable_name', verbose=True):
    """Per-syllable Mann-Whitney U test between two groups of DataFrames.

    Each DataFrame contributes one count per syllable (``value_counts`` of
    ``column``). For every syllable the counts of the pre group are compared
    with the counts of the post group using a two-sided Mann-Whitney U test.
    The label ``'faulty'`` is excluded. A syllable that never occurs in a
    DataFrame counts as 0 there, including syllables that occur in only one of
    the two groups.

    Notes
    -----
    * Raw counts are compared; they are not normalized by DataFrame length.
    * The p-values are not corrected for multiple comparisons.
    * With few DataFrames per group the attainable p-values are coarse: for
      5 vs 5 without ties the smallest exact two-sided p-value is
      2 / 252 (about 0.0079).

    Parameters
    ----------
    pre_dfs, post_dfs : list of pandas.DataFrame
        DataFrames of the two groups; each must contain ``column``.
    column : str
        Column holding the syllable labels.
    verbose : bool, default True
        Print both count tables (the historical behaviour).

    Returns
    -------
    pandas.DataFrame
        Indexed by syllable with a single column ``'p_value'``, sorted
        ascending by p-value.
    """
    # Imported here so the function also works when no earlier notebook cell
    # has brought mannwhitneyu into the kernel namespace.
    from scipy.stats import mannwhitneyu

    # Collect syllable counts for each mouse in both groups
    # (rows: syllables, one column per DataFrame).
    pre_counts = pd.concat([df[column].value_counts() for df in pre_dfs], axis=1).fillna(0)
    post_counts = pd.concat([df[column].value_counts() for df in post_dfs], axis=1).fillna(0)

    pre_counts = pre_counts.drop('faulty', errors='ignore')
    post_counts = post_counts.drop('faulty', errors='ignore')

    if verbose:
        # Print the counts for debugging
        print("Pre-Amphetamine Syllable Counts:")
        print(pre_counts)

        print("Post-Amphetamine Syllable Counts:")
        print(post_counts)

    # Align both tables on the union of syllables. Without this, a syllable
    # seen only in the pre group raises KeyError on the post lookup and a
    # syllable seen only in the post group is never tested.
    # sort=False keeps the pre-table order, followed by post-only syllables.
    all_syllables = pre_counts.index.union(post_counts.index, sort=False)
    pre_aligned = pre_counts.reindex(all_syllables, fill_value=0)
    post_aligned = post_counts.reindex(all_syllables, fill_value=0)

    # Perform Mann-Whitney U test for each syllable
    results = {}
    for syllable in all_syllables:
        _, p_value = mannwhitneyu(
            pre_aligned.loc[syllable].values,
            post_aligned.loc[syllable].values,
            alternative='two-sided',
        )
        results[syllable] = p_value

    # Convert results to a DataFrame for easy viewing, sorted by p-value
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=['p_value'])
    return results_df.sort_values(by='p_value')

# Example usage:
# pre_dfs / post_dfs must already exist in the kernel. The call itself prints both
# count tables; the bare `results_df` on the last line displays the p-value table.
results_df = compare_syllable_distributions(pre_dfs, post_dfs, column='syllable_name');
results_df

Pre-Amphetamine Syllable Counts:
                             count    count    count    count    count
syllable_name                                                         
sniff/sit                  31547.0  57626.0  70918.0  47315.0  56914.0
sniff/groom                22037.0  38787.0  39055.0  26326.0  38277.0
sniff                      10281.0  13097.0  32282.0  70446.0  30260.0
sit/pause                   9193.0  11099.0   7204.0   5554.0  18755.0
sniff right                 8406.0   8060.0   8561.0   8282.0  13542.0
slow walk                   6127.0   6268.0   7661.0   7108.0   5310.0
sit                         6121.0   9613.0   6288.0   9881.0   7567.0
walk                        4911.0   4392.0   7806.0   6294.0  10397.0
sniff left                  4637.0   3717.0   4675.0   4448.0   2899.0
pause-walk right            2881.0   2186.0   3067.0   3671.0   5035.0
orient left                 1908.0   2654.0   3688.0   4261.0   2419.0
groom/sniff/rear            1784.0   7100.0 

Unnamed: 0,p_value
sniff/groom,0.007937
run,0.007937
orient right,0.007937
sniff right,0.007937
sit,0.007937
left turn,0.007937
sniff left,0.007937
pause-turn right,0.007937
walk forward-stop (pause),0.007937
left turn,0.007937


In [None]:
from scipy.stats import wilcoxon
import pandas as pd

def wilcoxon_signed_rank_test(pre_dfs, post_dfs, column='syllable_name'):
    """Per-syllable Wilcoxon signed-rank test on paired pre/post DataFrames.

    ``pre_dfs[i]`` and ``post_dfs[i]`` are treated as a pair. Each DataFrame
    contributes one count per syllable (``value_counts`` of ``column``); for
    every syllable the paired pre and post counts are passed to
    ``scipy.stats.wilcoxon``. The label ``'faulty'`` is excluded. A syllable
    that never occurs in a DataFrame counts as 0 there, including syllables
    that occur in only one of the two conditions.

    Notes
    -----
    * Raw counts are compared; they are not normalized by DataFrame length.
    * The p-values are not corrected for multiple comparisons.
    * When the exact method applies, n pairs cannot give a two-sided p-value
      below 2 / 2**n (0.0625 for n = 5).

    Parameters
    ----------
    pre_dfs, post_dfs : list of pandas.DataFrame
        Paired DataFrames in matching order; each must contain ``column``.
    column : str
        Column holding the syllable labels.

    Returns
    -------
    pandas.DataFrame
        Indexed by syllable with a single column ``'p_value'``, sorted
        ascending. Syllables for which ``wilcoxon`` raises ``ValueError``
        get NaN.

    Raises
    ------
    ValueError
        If ``pre_dfs`` and ``post_dfs`` differ in length. Previously this
        surfaced only as a table full of missing p-values, because the
        per-syllable error was swallowed.
    """
    if len(pre_dfs) != len(post_dfs):
        raise ValueError("Pre and Post DataFrames must be paired and of equal length.")

    # Collect syllable counts for each mouse in both groups
    # (rows: syllables, one column per DataFrame, in list order).
    pre_counts = pd.concat([df[column].value_counts() for df in pre_dfs], axis=1).fillna(0)
    post_counts = pd.concat([df[column].value_counts() for df in post_dfs], axis=1).fillna(0)

    pre_counts = pre_counts.drop('faulty', errors='ignore')
    post_counts = post_counts.drop('faulty', errors='ignore')

    # Align both tables on the union of syllables. Without this, a syllable
    # seen only pre raises KeyError on the post lookup and a syllable seen
    # only post is never tested.
    all_syllables = pre_counts.index.union(post_counts.index, sort=False)
    pre_aligned = pre_counts.reindex(all_syllables, fill_value=0)
    post_aligned = post_counts.reindex(all_syllables, fill_value=0)

    # Perform Wilcoxon Signed-Rank Test for each syllable
    results = {}
    for syllable in all_syllables:
        try:
            _, p_value = wilcoxon(
                pre_aligned.loc[syllable].values,
                post_aligned.loc[syllable].values,
            )
        except ValueError:
            # Raised by wilcoxon for data it cannot test (e.g. every paired
            # difference is zero); record a missing value and carry on.
            p_value = float('nan')
        results[syllable] = p_value

    # Convert results to a DataFrame for easy viewing, sorted by p-value
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=['p_value'])
    return results_df.sort_values(by='p_value')

# Example usage:
# pre_dfs / post_dfs must already exist in the kernel; this rebinds results_df.
results_df = wilcoxon_signed_rank_test(pre_dfs, post_dfs, column='syllable_name')

# Display the test results
print(results_df.head(20))  # Show the 20 syllables with the smallest p-values

                           p_value
sniff/sit                   0.0625
run                         0.0625
orint left                  0.0625
orient right                0.0625
right turn-run              0.0625
left turn                   0.0625
pause-turn right            0.0625
walk forward-stop (pause)   0.0625
left turn                   0.0625
sniff-orient left           0.0625
right turn                  0.0625
left turn-run               0.0625
sniff left                  0.0625
sit                         0.0625
slow walk                   0.0625
sniff right                 0.0625
sniff                       0.0625
sniff/groom                 0.0625
walk                        0.1875
sit/pause                   0.3125


In [None]:
def paired_permutation_test(pre_dfs, post_dfs, column='syllable_name', n_permutations=10000, random_state=None):
    """
    Performs a paired (sign-flip) permutation test on multivariate syllable count data.

    For every mouse the per-syllable difference ``post - pre`` is computed. The test
    statistic is the sum over syllables of the squared *total* difference across mice.
    Under the null hypothesis the pre/post labels of each mouse are exchangeable, which
    is simulated by flipping the sign of each mouse's difference vector at random.

    The statistic has to aggregate across mice before squaring: a sum of element-wise
    squared differences is unchanged by any pre/post swap, so every permutation would
    equal the observed value and the p-value would always be 1.0.

    Parameters:
    - pre_dfs: List of DataFrames for pre-amphetamine condition (paired by mouse).
    - post_dfs: List of DataFrames for post-amphetamine condition (paired by mouse).
    - column: Name of the column containing syllable names.
    - n_permutations: Number of permutations to perform.
    - random_state: Seed for reproducibility. A private RandomState is seeded with it,
      so the global NumPy random state is left untouched. With None the global NumPy
      random stream is used.

    Returns:
    - observed_stat: Observed sum over syllables of the squared total paired difference.
    - p_value: Fraction of permutations whose statistic is >= the observed one.

    Raises:
    - AssertionError: if pre_dfs and post_dfs differ in length.
    - ValueError: if there are no pairs or n_permutations is smaller than 1.
    """
    # Ensure the number of pre and post DataFrames are equal (paired)
    assert len(pre_dfs) == len(post_dfs), "Pre and Post DataFrames must be paired and of equal length."
    n_mice = len(pre_dfs)
    if n_mice == 0:
        raise ValueError("At least one pre/post pair is required.")
    if n_permutations < 1:
        raise ValueError("n_permutations must be a positive integer.")

    # The np.random module and a RandomState instance both expose choice().
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Identify all unique syllables across all DataFrames, excluding 'faulty'.
    # Missing labels are dropped so that sorting does not mix NaN with strings.
    all_syllables = sorted({
        syllable
        for df in list(pre_dfs) + list(post_dfs)
        for syllable in df[column].dropna().unique()
        if syllable != 'faulty'
    })

    # Function to extract syllable counts for a single DataFrame.
    # float64 keeps the squared sums clear of fixed-width integer overflow.
    def get_counts(df):
        counts = df[column].value_counts().reindex(all_syllables, fill_value=0)
        return counts.to_numpy(dtype=np.float64)

    # Extract syllable counts for all mice
    pre_counts = np.array([get_counts(df) for df in pre_dfs])  # Shape: (n_mice, n_syllables)
    post_counts = np.array([get_counts(df) for df in post_dfs])  # Shape: (n_mice, n_syllables)

    # Observed statistic: squared total difference per syllable, summed over syllables
    observed_diff = post_counts - pre_counts
    observed_stat = np.sum(observed_diff.sum(axis=0) ** 2)

    # Swapping pre and post for a mouse negates that mouse's difference row, so every
    # permutation is a vector of +/-1 signs; all permutations are evaluated at once.
    signs = rng.choice([-1.0, 1.0], size=(n_permutations, n_mice))
    perm_totals = signs @ observed_diff  # Shape: (n_permutations, n_syllables)
    perm_stats = np.sum(perm_totals ** 2, axis=1)

    # Calculate p-value
    p_value = np.mean(perm_stats >= observed_stat)

    return observed_stat, p_value

In [None]:
# Example Usage:
# Assuming pre_dfs and post_dfs are equal-length lists with one DataFrame per mouse,
# in the same mouse order (the function asserts that the lengths match).
# NOTE(review): this comment used to say "4 DataFrames each", but the count table printed
# by the Mann-Whitney cell shows five columns -- confirm the actual number of mice.
observed_stat, p_value = paired_permutation_test(pre_dfs, post_dfs, column='syllable_name', n_permutations=10000, random_state=42)
print(f"Observed Statistic: {observed_stat}")
print(f"P-value: {p_value}")


Observed Statistic: 47193632910
P-value: 1.0
