In [13]:
import pandas as pd
from scipy.stats import kendalltau

# Load the dataset
df = pd.read_csv("kendall_tau_results")
df

Unnamed: 0.1,Unnamed: 0,site_id,species_a,phenophase_a,species_b,phenophase_b,number_of_observations,kendall_tau,p_value
0,0,8180,444,483,444,498,13,-0.526361,0.013950
1,1,8181,444,483,444,498,13,0.091521,0.667584
2,2,8182,3,371,3,467,14,0.333333,0.107792
3,3,8182,3,371,3,483,14,0.329567,0.109187
4,4,8182,3,371,3,500,14,0.205718,0.319392
...,...,...,...,...,...,...,...,...,...
1389,1389,17582,61,467,1212,467,10,0.022222,1.000000
1390,1390,17582,61,483,82,483,11,0.462963,0.050225
1391,1391,17582,61,483,1212,483,11,0.537037,0.023127
1392,1392,17582,82,467,1212,467,10,0.022222,1.000000


In [14]:
stat_sig_df = df[df["p_value"] <= 0.05]
stat_sig_df

Unnamed: 0.1,Unnamed: 0,site_id,species_a,phenophase_a,species_b,phenophase_b,number_of_observations,kendall_tau,p_value
0,0,8180,444,483,444,498,13,-0.526361,0.013950
14,14,8182,3,371,1172,467,14,0.411435,0.046435
21,21,8182,3,467,3,483,14,0.988700,0.000002
28,28,8182,3,467,102,371,14,0.514294,0.012709
34,34,8182,3,467,1172,500,14,0.443210,0.031051
...,...,...,...,...,...,...,...,...,...
1363,1363,12003,3,498,98,498,12,0.558156,0.012918
1365,1365,12003,12,467,98,371,11,0.537037,0.023127
1382,1382,17582,3,483,61,483,11,0.629630,0.007746
1388,1388,17582,61,467,82,467,10,0.555556,0.028609


In [15]:
pd.unique(stat_sig_df[["species_a", "species_b"]].values.ravel())


array([ 444,    3, 1172,  102, 1174,   93,   28,   12,   81,   82,  823,
       1192,   61,  100,   79,   91, 1199,   97, 1189,   68, 1187,   98,
         76,  970,  941,   60, 1181, 1184, 1212,   67, 1179,    7, 1019,
         74, 1159, 1177])

In [16]:
def find_observed(df):
    significant_data = df[df["p_value"] <= 0.05]
    
    species_counts = (
    pd.concat([significant_data['species_a'], significant_data['species_b']])
    .value_counts()
    .reset_index()
    )
    
    species_counts.columns = ['species', 'count']
    return species_counts

observed_count = find_observed(df)
observed_count

Unnamed: 0,species,count
0,3,99
1,1172,45
2,82,40
3,102,35
4,823,35
5,98,34
6,61,29
7,970,29
8,60,27
9,81,27


In [17]:

# Step 1: Filter significant correlations (p < 0.05)
significant_data = df[df["p_value"] < 0.05]

# Step 2: Count observed significant pairs (O_i)
observed_counts = pd.concat([significant_data["species_a"], significant_data["species_b"]]).value_counts()

# Step 3: Count total pairs (significant + non-significant)
total_counts = pd.concat([df["species_a"], df["species_b"]]).value_counts()

# Step 4: Calculate proportions (P_i)
proportions = total_counts / total_counts.sum()

# Step 5: Calculate expected counts (E_i)
T = len(significant_data)  # Total number of significant pairs
expected_counts = proportions * T

# Combine into a final DataFrame
results = pd.DataFrame({
    "species": total_counts.index,
    "total_pairs": total_counts.values,
    "observed_significant": observed_counts.reindex(total_counts.index, fill_value=0).values,
    "proportion": proportions.values,
    "expected_significant": expected_counts.values
})

results


Unnamed: 0,species,total_pairs,observed_significant,proportion,expected_significant
0,3,355,99,0.127331,42.528694
1,1172,201,45,0.072095,24.079627
2,82,182,40,0.06528,21.803443
3,81,154,27,0.055237,18.449067
4,93,149,20,0.053443,17.850072
5,102,129,35,0.04627,15.454089
6,823,129,35,0.04627,15.454089
7,61,124,29,0.044476,14.855093
8,98,110,34,0.039455,13.177905
9,60,105,27,0.037661,12.57891


In [18]:
from scipy.stats import chi2

# Assuming `results` DataFrame contains observed and expected counts
def calculate_chi_square(results):
    # Observed and Expected counts
    observed = results["observed_significant"]
    expected = results["expected_significant"]

    # Calculate chi-square statistic
    chi_square_stat = ((observed - expected) ** 2 / expected).sum()

    # Degrees of freedom
    deg_freedom = len(results) - 1

    # Calculate p-value
    p_value = chi2.sf(chi_square_stat, df)

    return chi_square_stat, p_value, deg_freedom

# Call the function
chi_square_stat, p_value, deg_freedom = calculate_chi_square(results)

print(f"Chi-Square Statistic: {chi_square_stat}")
print(f"Degrees of Freedom: {deg_freedom}")
print(f"P-value: {p_value}")


Chi-Square Statistic: 452.212528522421
Degrees of Freedom: 36
P-value: [[            nan 1.00000000e+000 3.83518309e-001 ... 2.01114345e-088
              nan 2.03603805e-103]
 [2.38018519e-100 1.00000000e+000 3.83518309e-001 ... 2.01114345e-088
  1.68316796e-102 6.40063443e-101]
 [6.35765720e-099 1.00000000e+000 1.08109952e-097 ... 1.21190083e-087
  1.24225676e-101 2.08029451e-102]
 ...
 [1.00000000e+000 1.00000000e+000 4.29157202e-061 ... 4.87010300e-090
  3.57403112e-101 3.46948370e-103]
 [1.00000000e+000 1.00000000e+000 1.40673586e-052 ... 7.04779295e-091
  3.32472330e-103 2.38018519e-100]
 [1.00000000e+000 1.00000000e+000 1.40673586e-052 ... 4.87010300e-090
  8.81758181e-102 6.82997304e-102]]


In [19]:
def calculate_chi_square_per_site(data):
    # Filter significant correlations
    significant_data = data[data["p_value"] < 0.05]

    # Group by site_id
    site_results = []
    for site_id, site_data in significant_data.groupby("site_id"):
        # Total counts (all pairs)
        total_counts = pd.concat([site_data["species_a"], site_data["species_b"]]).value_counts()

        # Observed significant counts
        observed_counts = pd.concat([
            site_data["species_a"], site_data["species_b"]
        ]).value_counts()

        # Proportions (P_i)
        proportions = total_counts / total_counts.sum()

        # Total significant pairs for this site
        T = len(site_data)

        # Expected counts (E_i)
        expected_counts = proportions * T

        # Calculate chi-square contributions
        chi_square_contributions = ((observed_counts - expected_counts) ** 2 / expected_counts).fillna(0)

        # Calculate overall chi-square statistic
        chi_square_stat = chi_square_contributions.sum()

        # Degrees of freedom
        df = len(proportions) - 1

        # P-value
        p_value = chi2.sf(chi_square_stat, df)

        # Save results for the site
        site_results.append({
            "site_id": site_id,
            "chi_square_stat": chi_square_stat,
            "p_value": p_value,
            "degrees_of_freedom": df
        })

    return pd.DataFrame(site_results)

# Call the function
site_results = calculate_chi_square_per_site(df)
site_results

Unnamed: 0,site_id,chi_square_stat,p_value,degrees_of_freedom
0,8180,1.0,,0
1,8182,41.0,9.382784e-08,5
2,8409,1.0,,0
3,8806,29.0,0.0001446869,7
4,8836,8.0,0.04601171,3
5,8899,6.0,0.1116102,3
6,8901,12.0,0.007383161,3
7,8902,14.0,0.007295056,4
8,8903,3.0,0.2231302,2
9,8904,7.0,0.03019738,2


In [38]:
import pandas as pd

def calculate_species_contribution(data, expected_counts):
    """
    Calculate chi-square contributions of each species within each site.

    Parameters:
    - data: DataFrame with raw data containing site_id, species_a, species_b, and p_value.
    - expected_counts: DataFrame with precomputed expected counts for each species.

    Returns:
    - DataFrame with site_id, species, observed significant, expected significant, and chi-square contributions.
    """
    # Step 1: Filter significant correlations
    significant_data = data[data["p_value"] < 0.05]

    # Step 2: Flatten species columns into a single column
    flattened_data = pd.melt(
        significant_data,
        id_vars=["site_id"],
        value_vars=["species_a", "species_b"],
        var_name="species_type",
        value_name="species"
    )

    # Step 3: Compute observed counts per species within each site
    observed_counts = (
        flattened_data.groupby(["site_id", "species"]).size().reset_index(name="observed_significant")
    )

    # Step 4: Merge with expected counts
    merged_data = observed_counts.merge(
        expected_counts[["species", "expected_significant"]],
        on="species",
        how="left"
    )

    # Step 5: Calculate chi-square contributions
    merged_data["chi_square_contribution"] = (
        (merged_data["observed_significant"] - merged_data["expected_significant"]) ** 2
        / merged_data["expected_significant"]
    )

    # Step 6: Drop rows with NaN contributions (e.g., missing expected counts)
    merged_data = merged_data.dropna(subset=["chi_square_contribution"])

    return merged_data

# Example Usage (replace these DataFrames with your actual data)
data = pd.DataFrame({
    "site_id": [8180, 8180, 8180, 8181, 8181, 8182],
    "species_a": [444, 444, 1172, 444, 1177, 3],
    "species_b": [1177, 3, 444, 1177, 3, 1177],
    "p_value": [0.01, 0.03, 0.2, 0.01, 0.04, 0.4]
})

expected_counts = pd.DataFrame({
    "species": [444, 1177, 3, 1172],
    "expected_significant": [7.5, 6, 4, 5]
})

# Calculate species contributions
species_contributions = calculate_species_contribution(data, expected_counts)
species_contributions


Unnamed: 0,site_id,species,observed_significant,expected_significant,chi_square_contribution
0,8180,3,1,4.0,2.25
1,8180,444,2,7.5,4.033333
2,8180,1177,1,6.0,4.166667
3,8181,3,1,4.0,2.25
4,8181,444,1,7.5,5.633333
5,8181,1177,2,6.0,2.666667
