In [None]:
import pandas as pd
from scipy.stats import kendalltau

# Read the Kendall tau results table; the cells below filter and aggregate `df`.
kendall_results_path = "kendall_tau_results"
df = pd.read_csv(kendall_results_path)
df

In [None]:
# Keep only rows that compare two different species (drop self-comparisons).
is_self_pair = df['species_a'].eq(df['species_b'])
filtered_df = df[~is_self_pair]
filtered_df

In [None]:
# Retain the rows whose p_value is at or below the 0.05 cut-off.
is_significant = filtered_df["p_value"].le(0.05)
significant_df = filtered_df.loc[is_significant]
significant_df

In [None]:
# Build order-independent species pairs from the two species columns only.
# A row-wise `apply(axis=1)` over the whole frame hands each row over as a
# single-dtype Series, so integer species ids turn into floats whenever the
# frame also holds float columns; zipping the two columns keeps the ids as-is
# and avoids the per-row Python overhead of apply.
ordered_pairs = (
    tuple(sorted(pair))
    for pair in zip(significant_df['species_a'], significant_df['species_b'])
)

# dict.fromkeys drops repeated pairs while keeping first-seen order.
unique_pairs = pd.Series(list(dict.fromkeys(ordered_pairs)), dtype=object)

# Count how many distinct pairs each species takes part in.
species_counts = pd.Series(
    [species for pair in unique_pairs for species in pair]
).value_counts()

# Two-column table: species id and its number of distinct significant pairs.
species_counts_df = species_counts.rename_axis('species').reset_index(name='observed')

# Display the resulting DataFrame
species_counts_df

In [None]:
# Select the significant rows in which species 97 is either member of the pair.
a_is_97 = significant_df['species_a'].eq(97)
b_is_97 = significant_df['species_b'].eq(97)
species_97_filtered = significant_df.loc[a_is_97 | b_is_97]

# Show the matching rows
species_97_filtered


In [None]:
# Read the "valid_pairs" table into `pairs` and show it.
valid_pairs_path = "valid_pairs"
pairs = pd.read_csv(valid_pairs_path)
pairs

In [None]:
# Drop self-comparisons to get the table the expected counts are built from.
# NOTE(review): this filters `df` (the Kendall tau results), not `pairs`
# loaded from "valid_pairs" — confirm that this is the intended source.
has_distinct_species = df['species_a'].ne(df['species_b'])
filtered_pairs = df.loc[has_distinct_species]
filtered_pairs

In [None]:
# Work on a private copy so the helper column below never lands on filtered_pairs.
valid_pairs_copy = filtered_pairs.copy()

# Tag every row with an order-independent pair key, then keep the first row
# seen for each key (other columns are ignored when deciding uniqueness).
valid_pairs_copy['species_pair'] = valid_pairs_copy.apply(
    lambda row: tuple(sorted((row['species_a'], row['species_b']))), axis=1
)
is_repeat_pair = valid_pairs_copy['species_pair'].duplicated()
unique_pairs = valid_pairs_copy.loc[~is_repeat_pair]

# Stack both pair members into one column and tally how often each species occurs.
pair_members = pd.concat([unique_pairs[column] for column in ('species_a', 'species_b')])
species_counts = pair_members.value_counts().reset_index()
species_counts.columns = ['species', 'raw_expected']

# The observed counts come from the table computed in an earlier cell.
observed_results = species_counts_df

# Left merge: keep every species that has an observed count and attach its
# raw expected count where one exists.
merged_results = pd.merge(observed_results, species_counts, how='left', on='species')

# Show the combined table
merged_results


In [None]:
# Select the significant rows in which species 1174 is either member of the pair.
a_is_1174 = significant_df['species_a'].eq(1174)
b_is_1174 = significant_df['species_b'].eq(1174)
species_1174_filtered = significant_df.loc[a_is_1174 | b_is_1174]

# Show the matching rows
species_1174_filtered

In [None]:
# Filter filtered_pairs (not restricted to significant rows) for rows where
# species 1174 appears in either species_a or species_b.
species_1174_filtered_raw = filtered_pairs[(filtered_pairs['species_a'] == 1174) | (filtered_pairs['species_b'] == 1174)]

# Write the rows out first: only the last expression of a cell is rendered,
# so the frame has to come after the to_csv call to actually be displayed.
species_1174_filtered_raw.to_csv("testing")

# Display the filtered DataFrame
species_1174_filtered_raw

In [None]:
# Show every row when frames are displayed from here on.
pd.set_option('display.max_rows', None)

# Totals used for scaling: the sum of raw expected counts and the sum of
# observed counts across all species.
total_raw_expected = merged_results['raw_expected'].sum()
total_observed = merged_results['observed'].sum()

# Each species' share of the raw expected total ...
merged_results['proportion'] = merged_results['raw_expected'].div(total_raw_expected)

# ... rescaled so the expected column sums to the observed total.
merged_results['expected'] = merged_results['proportion'].mul(total_observed)

merged_results


In [None]:
from scipy.stats import chisquare

observed_counts = merged_results['observed']
expected_counts = merged_results['expected']

# Per-species term of the statistic: (observed - expected)^2 / expected.
merged_results['chi_square_contribution'] = (
    (observed_counts - expected_counts).pow(2).div(expected_counts)
)

# scipy supplies the p-value; the statistic reported below is the sum of the
# per-species terms, and the degrees of freedom are the number of rows minus one.
chi_square_results = chisquare(f_obs=observed_counts, f_exp=expected_counts)
p_value = chi_square_results.pvalue
chi_square_stat = merged_results['chi_square_contribution'].sum()
degrees_of_freedom = merged_results.shape[0] - 1

# Summary of the test
print("Chi-Square Statistic:", chi_square_stat)
print("Degrees of Freedom:", degrees_of_freedom)
print("P-Value:", p_value)

# Show the table including the per-species contributions
merged_results

In [None]:
def calculate_species_counts_by_site(df):
    """Count, per site, how many distinct species pairs each species belongs to.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'site_id', 'species_a' and 'species_b'.

    Returns
    -------
    dict
        Maps each site_id to a DataFrame with columns ['species', 'observed'],
        where 'observed' is the number of distinct unordered pairs at that
        site that include the species.
    """
    site_results = {}

    for site_id, group in df.groupby('site_id'):
        # Build the unordered pairs from the two species columns only.
        # Row-wise apply over the full frame would pass each row as a
        # single-dtype Series and turn integer ids into floats whenever the
        # frame also contains float columns.
        # dict.fromkeys removes repeated pairs and keeps first-seen order.
        unique_pairs = dict.fromkeys(
            tuple(sorted(pair))
            for pair in zip(group['species_a'], group['species_b'])
        )

        # Flatten the pairs and tally the appearances of each species.
        species_counts = pd.Series(
            [species for pair in unique_pairs for species in pair]
        ).value_counts()

        site_results[site_id] = (
            species_counts.rename_axis('species').reset_index(name='observed')
        )

    return site_results

# Per-site species counts for the significant rows
site_species_counts = calculate_species_counts_by_site(significant_df)

# Print one table per site, each followed by a dashed divider
divider = "\n" + "-"*40 + "\n"
for site_id, counts_df in site_species_counts.items():
    print(f"Site ID: {site_id}")
    print(counts_df)
    print(divider)


In [None]:
# Significant rows at site 8182, and the subset of those involving species 1172.
at_site_8182 = significant_df['site_id'] == 8182
involves_1172 = (significant_df['species_a'] == 1172) | (significant_df['species_b'] == 1172)
species_1172_filtered = significant_df[at_site_8182 & involves_1172]
#species_1172_filtered
significant_df[at_site_8182]


In [None]:
def calculate_species_counts_by_site_as_dataframe(df):
    """Count, per site, how many distinct species pairs each species belongs to.

    NOTE: a later cell in this notebook defines a function with this same
    name; once that cell runs it replaces this definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'site_id', 'species_a' and 'species_b'.

    Returns
    -------
    pandas.DataFrame
        One row per (site, species) with columns
        ['species', 'observed', 'site_id']. An input without rows yields an
        empty frame with those columns instead of an error from pd.concat.
    """
    result_columns = ['species', 'observed', 'site_id']
    all_sites_data = []

    for site_id, group in df.groupby('site_id'):
        # Build the unordered pairs from the two species columns only.
        # Row-wise apply over the full frame would pass each row as a
        # single-dtype Series and turn integer ids into floats whenever the
        # frame also contains float columns.
        # dict.fromkeys removes repeated pairs and keeps first-seen order.
        unique_pairs = dict.fromkeys(
            tuple(sorted(pair))
            for pair in zip(group['species_a'], group['species_b'])
        )

        # Flatten the pairs and tally the appearances of each species.
        species_counts_df = (
            pd.Series([species for pair in unique_pairs for species in pair])
            .value_counts()
            .rename_axis('species')
            .reset_index(name='observed')
        )
        species_counts_df['site_id'] = site_id  # Add site_id as a column

        all_sites_data.append(species_counts_df)

    # pd.concat raises on an empty list, so handle the no-site case explicitly.
    if not all_sites_data:
        return pd.DataFrame(columns=result_columns)

    return pd.concat(all_sites_data, ignore_index=True)

# Build the combined per-site species counts from the significant rows.
site_species_counts_df = calculate_species_counts_by_site_as_dataframe(significant_df)

# Print the combined DataFrame as plain text
print(site_species_counts_df)


In [None]:
def calculate_species_counts_by_site_as_dataframe(df):
    """Count, per site, how many distinct species pairs each species belongs to.

    Site ids and species ids are cast to int in the result. The input frame
    is left untouched: the cast is applied to a derived frame rather than
    written back into `df`, which may be a slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'site_id', 'species_a' and 'species_b'.
        'site_id' must be castable to int.

    Returns
    -------
    pandas.DataFrame
        One row per (site, species) with columns
        ['species', 'observed', 'site_id']. An input without rows yields an
        empty frame with those columns instead of an error from pd.concat.
    """
    result_columns = ['species', 'observed', 'site_id']

    # Standardize site_id on a new frame instead of mutating the caller's.
    normalized = df.assign(site_id=df['site_id'].astype('int'))

    all_sites_data = []

    for site_id, group in normalized.groupby('site_id'):
        # Build the unordered pairs from the two species columns only;
        # dict.fromkeys removes repeated pairs and keeps first-seen order.
        unique_pairs = dict.fromkeys(
            tuple(sorted(pair))
            for pair in zip(group['species_a'], group['species_b'])
        )

        # Flatten the pairs and tally the appearances of each species.
        species_counts_df = (
            pd.Series([species for pair in unique_pairs for species in pair])
            .value_counts()
            .rename_axis('species')
            .reset_index(name='observed')
        )
        species_counts_df['site_id'] = site_id  # Add site_id as a column

        # Standardize species as integers
        species_counts_df['species'] = species_counts_df['species'].astype('int')

        all_sites_data.append(species_counts_df)

    # pd.concat raises on an empty list, so handle the no-site case explicitly.
    if not all_sites_data:
        return pd.DataFrame(columns=result_columns)

    return pd.concat(all_sites_data, ignore_index=True)
    
# Rebuild the combined per-site counts with the function defined in this cell.
site_species_counts_df = calculate_species_counts_by_site_as_dataframe(significant_df)

# Print the combined DataFrame as plain text
print(site_species_counts_df)

In [None]:
def calculate_raw_expected_by_site(filtered_pairs, site_species_counts_df):
    """Attach per-site raw expected pair counts to the per-site observed counts.

    For every site present in `filtered_pairs`, the distinct unordered species
    pairs are collected and each species' number of appearances is taken as
    its 'raw_expected' count. That count is left-merged onto the observed
    rows of the same site; species without a match get 0.

    Neither input frame is modified: dtype normalisation is done on derived
    frames rather than written back into the arguments.

    Parameters
    ----------
    filtered_pairs : pandas.DataFrame
        Must contain 'site_id', 'species_a' and 'species_b'.
    site_species_counts_df : pandas.DataFrame
        Must contain 'site_id' and 'species'; all of its columns are carried
        into the per-site results.

    Returns
    -------
    dict
        Maps each site_id found in `filtered_pairs` to the merged DataFrame
        for that site.

    Raises
    ------
    KeyError
        If `filtered_pairs` has no 'site_id' column.
    """
    # Ensure 'site_id' exists in filtered_pairs
    if 'site_id' not in filtered_pairs.columns:
        raise KeyError("'site_id' column is missing in filtered_pairs.")

    # Standardize site_id and species on new frames so the caller's data
    # (possibly slices of other frames) is never written to.
    pairs = filtered_pairs.assign(site_id=filtered_pairs['site_id'].astype('int'))
    observed = site_species_counts_df.assign(
        site_id=site_species_counts_df['site_id'].astype('int'),
        species=site_species_counts_df['species'].astype('int'),
    )

    site_results = {}

    for site_id, group in pairs.groupby('site_id'):
        # Step 1: distinct unordered species pairs, built from the two species
        # columns only; dict.fromkeys drops repeats and keeps first-seen order.
        unique_pairs = dict.fromkeys(
            tuple(sorted(pair))
            for pair in zip(group['species_a'], group['species_b'])
        )

        # Step 2: count how many distinct pairs each species appears in.
        species_counts = (
            pd.Series([species for pair in unique_pairs for species in pair])
            .value_counts()
            .rename_axis('species')
            .reset_index(name='raw_expected')
        )

        # Standardize species as integers so the merge keys share a dtype.
        species_counts['species'] = species_counts['species'].astype('int')

        # Step 3: observed rows for the current site
        observed_results = observed[observed['site_id'] == site_id]

        # Step 4: merge by species only (both sides are already site-specific)
        site_merged_results = observed_results.merge(
            species_counts,
            on=['species'],
            how='left'
        )

        # Assign the filled column back: a chained fillna(..., inplace=True)
        # on a selected column does not reliably update the parent frame.
        site_merged_results['raw_expected'] = site_merged_results['raw_expected'].fillna(0)

        site_results[site_id] = site_merged_results

    return site_results
# Print the dict of per-site merged tables (site_id -> DataFrame) as plain text.
print(calculate_raw_expected_by_site(filtered_pairs, site_species_counts_df))