In [None]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
from scipy.stats import kendalltau

# Read the raw observations from CSV (replace the path with your actual file)
data_file = "raw_data.csv"
df = pd.read_csv(data_file)

# Keep only the rows whose kingdom is Plantae
pkup_plants = df.loc[df["kingdom"].eq("Plantae")]

def filter_species_with_min_days(df, min_days=10):
    """
    Keeps only the rows belonging to (site_name, species) groups that have at least
    `min_days` distinct `first_yes_day` values.

    Parameters:
        df (pd.DataFrame): Input DataFrame with the columns
                           'site_name', 'species', 'first_yes_day'.
        min_days (int): Minimum number of distinct first_yes_day values a group needs
                        in order to be kept.

    Returns:
        pd.DataFrame: The rows of `df` that belong to a qualifying group.
    """
    def has_enough_days(group):
        # Count the distinct days observed for this site/species combination
        distinct_days = group["first_yes_day"].nunique()
        return distinct_days >= min_days

    by_site_and_species = df.groupby(["site_name", "species"])
    return by_site_and_species.filter(has_enough_days)

# Filter the data for species with at least 10 unique first_yes_days
filtered_data = filter_species_with_min_days(pkup_plants)

# Add a new column combining species with their site to avoid overlap.
# The result is bound to `filtered_pkup_plants`, the name the later cells read;
# previously that name was never assigned, so this cell raised a NameError.
# `.assign` builds a new frame, which leaves `filtered_data` untouched and makes
# the cell safe to re-run.
filtered_pkup_plants = filtered_data.assign(
    site_specific_species=filtered_data["species"] + " | " + filtered_data["site_name"]
)


In [None]:
def find_species_phenophase_pairs(data):
    """
    Finds every unordered pair of (species_id, phenophase_id) groups that share at least
    one `first_yes_year`, and collects their `first_yes_day` values for the shared years.

    For each species-phenophase-year combination only the smallest `first_yes_day` is
    used. Rows with a missing value in any of the four required columns are ignored.
    A group is never paired with itself, and each pair is reported once (A-B, not B-A),
    with the group whose (species_id, phenophase_id) key sorts first reported as "a".

    Parameters:
        data (DataFrame): Dataset containing the columns species_id, phenophase_id,
                          first_yes_day, and first_yes_year.

    Returns:
        DataFrame: One row per pair with the columns species_a, phenophase_a, species_b,
                   phenophase_b, matching_years, first_yes_days_a, first_yes_days_b.
                   The three list columns are aligned element by element and ordered by
                   ascending year. The columns are present even when no pair is found.
    """
    result_columns = [
        'species_a', 'phenophase_a', 'species_b', 'phenophase_b',
        'matching_years', 'first_yes_days_a', 'first_yes_days_b',
    ]

    # Keep only the required columns and drop incomplete rows
    observations = data[['species_id', 'phenophase_id', 'first_yes_day', 'first_yes_year']].dropna()

    # Sorting by day before dropping duplicates keeps the earliest day for each
    # species-phenophase-year combination.
    observations = observations.sort_values('first_yes_day').drop_duplicates(
        subset=['species_id', 'phenophase_id', 'first_yes_year']
    )

    # Build one year -> day lookup per group up front, so comparing two groups is a
    # key intersection instead of repeated DataFrame filtering.
    day_by_year_per_group = [
        (key, dict(zip(group['first_yes_year'].tolist(), group['first_yes_day'].tolist())))
        for key, group in observations.groupby(['species_id', 'phenophase_id'])
    ]

    pairs = []

    # groupby yields its keys in sorted order, so pairing each group only with the
    # groups that follow it visits every unordered pair exactly once and never pairs
    # a group with itself.
    for position, ((species_a, phenophase_a), days_a) in enumerate(day_by_year_per_group):
        for (species_b, phenophase_b), days_b in day_by_year_per_group[position + 1:]:
            # Sorted so the output does not depend on set iteration order
            common_years = sorted(days_a.keys() & days_b.keys())
            if not common_years:
                continue

            pairs.append({
                'species_a': species_a,
                'phenophase_a': phenophase_a,
                'species_b': species_b,
                'phenophase_b': phenophase_b,
                'matching_years': common_years,
                'first_yes_days_a': [days_a[year] for year in common_years],
                'first_yes_days_b': [days_b[year] for year in common_years],
            })

    # Passing the column list keeps the schema stable when `pairs` is empty
    return pd.DataFrame(pairs, columns=result_columns)




# Build the table of species/phenophase pairs that share observation years
pairs_list_df = find_species_phenophase_pairs(filtered_pkup_plants)

# Show full cell contents and every row. The full option names are spelled out
# because pandas resolves abbreviations such as 'display.max_row' only by pattern
# matching, which can break if a similarly named option is ever added.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

pairs_list_df

In [None]:
def order_lists_within_dataframe(df):
    """
    Sorts the lists in the `matching_years`, `first_yes_days_a`, and `first_yes_days_b`
    columns within each row, keeping the three lists aligned with each other and ordered
    by `matching_years`.

    The input DataFrame is not modified; a copy with the reordered lists is returned.
    Rows whose lists are empty are kept with empty lists. All other columns are passed
    through unchanged.

    Parameters:
        df (DataFrame): A DataFrame containing `matching_years`, `first_yes_days_a`, and
                        `first_yes_days_b` columns, each holding one list per row.

    Returns:
        DataFrame: A copy of `df` with sorted `matching_years` and correspondingly
                   reordered `first_yes_days_a` / `first_yes_days_b`.
    """
    list_columns = ['matching_years', 'first_yes_days_a', 'first_yes_days_b']

    # Work on a copy so the caller's frame is left untouched
    sorted_df = df.copy()
    if sorted_df.empty:
        # Nothing to reorder; this also covers a frame that has no columns at all
        return sorted_df

    reordered = {column: [] for column in list_columns}
    for years, days_a, days_b in zip(*(df[column] for column in list_columns)):
        # Sorting the zipped triples orders them by year while keeping each year's
        # two day values attached to it.
        triples = sorted(zip(years, days_a, days_b))

        # zip(*triples) yields nothing for a row with empty lists, so fall back to
        # three empty lists instead of failing to unpack.
        unzipped = [list(values) for values in zip(*triples)] or [[], [], []]

        for column, values in zip(list_columns, unzipped):
            reordered[column].append(values)

    # Building new columns avoids mutating the row objects handed out by
    # DataFrame.apply, which pandas does not support.
    for column in list_columns:
        sorted_df[column] = pd.Series(reordered[column], index=sorted_df.index, dtype=object)

    return sorted_df
    
# Sort each pair's year/day lists by year, keeping the three lists aligned
ordered_pairs = order_lists_within_dataframe(pairs_list_df)

In [None]:
# Pick out one pair for inspection:
# species 93 / phenophase 483 against species 1172 / phenophase 501
specific_pair = ordered_pairs.loc[
    ordered_pairs['species_a'].eq(93)
    & ordered_pairs['phenophase_a'].eq(483)
    & ordered_pairs['species_b'].eq(1172)
    & ordered_pairs['phenophase_b'].eq(501)
]

# Show the selected pair
specific_pair

In [None]:
from scipy.stats import kendalltau

def calculate_kendall_tau(df):
    """
    Calculates the Kendall tau correlation coefficient and p-value for each row in the
    DataFrame, based on the `first_yes_days_a` and `first_yes_days_b` lists.

    The correlation is only computed when both lists have the same length and hold at
    least two observations; otherwise `kendall_tau` and `p_value` are NaN for that row.
    NaN (rather than None) is used so both columns stay numeric.

    Parameters:
        df (DataFrame): A DataFrame containing the columns species_a, phenophase_a,
                        species_b, phenophase_b, first_yes_days_a and first_yes_days_b.

    Returns:
        DataFrame: One row per input row, with the columns:
                   - species_a
                   - phenophase_a
                   - species_b
                   - phenophase_b
                   - number_of_observations (length of `first_yes_days_a`)
                   - kendall_tau
                   - p_value
                   The columns are present even when `df` is empty.
    """
    result_columns = [
        'species_a', 'phenophase_a', 'species_b', 'phenophase_b',
        'number_of_observations', 'kendall_tau', 'p_value',
    ]

    # An empty input (possibly without any columns) still yields the full schema
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    source_columns = [
        'species_a', 'phenophase_a', 'species_b', 'phenophase_b',
        'first_yes_days_a', 'first_yes_days_b',
    ]

    results = []

    # Iterate the needed columns in parallel instead of building a Series per row
    for species_a, phenophase_a, species_b, phenophase_b, days_a, days_b in zip(
        *(df[column] for column in source_columns)
    ):
        # A rank correlation needs at least two paired observations
        if len(days_a) >= 2 and len(days_a) == len(days_b):
            tau, p_value = kendalltau(days_a, days_b)
        else:
            tau, p_value = float('nan'), float('nan')

        results.append({
            'species_a': species_a,
            'phenophase_a': phenophase_a,
            'species_b': species_b,
            'phenophase_b': phenophase_b,
            'number_of_observations': len(days_a),  # Number of years
            'kendall_tau': tau,
            'p_value': p_value,
        })

    # Convert the results to a DataFrame with a fixed column order
    return pd.DataFrame(results, columns=result_columns)

# Compute Kendall's tau and its p-value for every pair, then display the table
results = calculate_kendall_tau(ordered_pairs)
results

In [None]:
# Look up the correlation result for one pair:
# species 93 / phenophase 483 against species 1172 / phenophase 501
specific_pair = results.loc[
    results['species_a'].eq(93)
    & results['phenophase_a'].eq(483)
    & results['species_b'].eq(1172)
    & results['phenophase_b'].eq(501)
]

# Show the selected pair
specific_pair