In [15]:
import pandas as pd
from scipy.stats import kendalltau

df = pd.read_csv("raw_data.csv")
pkup_df = df[df['site_name'] == 'GRSM-PKnob Upper Plot']
pkup_plants = pkup_df[pkup_df["kingdom"] == "Plantae"]

def filter_species_with_min_days(df, min_days=10):
    """
    Filters species (and phenophases) that have at least `min_days` unique first_yes_days 
    corresponding to the first observed day per year.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing columns:
                           'species_id', 'phenophase_id', 'first_yes_year', 'first_yes_day'.
        min_days (int): Minimum number of unique first_yes_days required to keep a species.

    Returns:
        pd.DataFrame: Filtered DataFrame containing only species/phenophases with at least `min_days` valid entries.
    """
    # Ensure we're only working with the first observed day per year
    first_observed_df = df.sort_values(['species_id', 'phenophase_id', 'first_yes_year', 'first_yes_day']).drop_duplicates(
        subset=['species_id', 'phenophase_id', 'first_yes_year'], keep='first'
    )

    # Count the number of unique first_yes_days for each species/phenophase
    valid_counts = first_observed_df.groupby(['species_id', 'phenophase_id'])['first_yes_day'].nunique()

    # Filter for species/phenophases with at least min_days unique first_yes_days
    valid_species = valid_counts[valid_counts >= min_days].index

    # Return the filtered DataFrame
    filtered_df = first_observed_df[
        first_observed_df.set_index(['species_id', 'phenophase_id']).index.isin(valid_species)
    ]
    
    return filtered_df.reset_index(drop=True)

# Usage
filtered_pkup_plants = filter_species_with_min_days(pkup_plants, min_days=10)
filtered_pkup_plants


Unnamed: 0.1,Unnamed: 0,site_id,site_name,latitude,longitude,elevation_in_meters,state,species_id,genus,species,...,tminf,prcp_winter,prcp_spring,prcp_summer,prcp_fall,prcp,acc_prcp,daylength,Lat Grouping,Elev(m)
0,556,8182,GRSM-PKnob Upper Plot,35.586998,-83.074631,1505,NC,3,Acer,rubrum,...,35.60,345.00,489.00,417.00,325.00,0.00,446.00,44928,high elevation,1505.0
1,557,8182,GRSM-PKnob Upper Plot,35.586998,-83.074631,1505,NC,3,Acer,rubrum,...,50.00,522.00,534.00,438.00,381.00,0.00,447.00,44928,high elevation,1505.0
2,558,8182,GRSM-PKnob Upper Plot,35.586998,-83.074631,1505,NC,3,Acer,rubrum,...,41.00,714.00,586.00,670.00,322.00,0.00,799.00,45619,high elevation,1505.0
3,559,8182,GRSM-PKnob Upper Plot,35.586998,-83.074631,1505,NC,3,Acer,rubrum,...,23.00,532.00,454.00,529.00,369.00,0.00,342.00,43200,high elevation,1505.0
4,560,8182,GRSM-PKnob Upper Plot,35.586998,-83.074631,1505,NC,3,Acer,rubrum,...,41.00,298.00,374.00,501.00,354.00,0.00,402.00,46310,high elevation,1505.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479,234,8182,GRSM-PKnob Upper Plot,35.586998,-83.074631,1505,NC,1174,Amelanchier,laevis,...,53.60,489.00,532.00,486.00,528.00,0.00,824.00,50803,high elevation,1505.0
480,235,8182,GRSM-PKnob Upper Plot,35.586998,-83.074631,1505,NC,1174,Amelanchier,laevis,...,53.60,719.00,502.00,633.00,496.00,11.00,1167.00,51840,high elevation,1505.0
481,236,8182,GRSM-PKnob Upper Plot,35.586998,-83.074631,1505,NC,1174,Amelanchier,laevis,...,44.65,618.40,556.13,564.94,477.51,0.00,1024.46,51344,high elevation,1505.0
482,237,8182,GRSM-PKnob Upper Plot,35.586998,-83.074631,1505,NC,1174,Amelanchier,laevis,...,40.80,506.59,398.52,463.60,255.50,25.95,776.38,50968,high elevation,1505.0


In [37]:
def find_species_phenophase_pairs(data):
    """
    Finds all species pairs with all possible phenophase pairs based on the first_yes_day.
    Includes lists of all `first_yes_day` values for matching years in a flattened DataFrame.

    Parameters:
        data (DataFrame): The filtered dataset containing columns for species_id, phenophase_id,
                          first_yes_day, and first_yes_year.

    Returns:
        DataFrame: Contains columns for species pairs, phenophase pairs, matching years,
                   and lists of first_yes_days for both species.
    """
    # Filter required columns
    filtered_data = data[['species_id', 'phenophase_id', 'first_yes_day', 'first_yes_year']].dropna()

    # Ensure only the first `first_yes_day` for each species-phenophase-year combination is considered
    filtered_data = filtered_data.sort_values('first_yes_day').drop_duplicates(
        subset=['species_id', 'phenophase_id', 'first_yes_year']
    )

    # Initialize a list to store the results
    pairs = []

    # Group the data by species and phenophase
    grouped_data = filtered_data.groupby(['species_id', 'phenophase_id'])
    for (species_a, phenophase_a), group_a in grouped_data:
        for (species_b, phenophase_b), group_b in grouped_data:
            # Avoid duplicate pairs (A-B and B-A are the same) and self-pairs unless explicitly allowed
            if (species_a, phenophase_a) < (species_b, phenophase_b):
                # Find common years between the two groups
                common_years = set(group_a['first_yes_year']).intersection(group_b['first_yes_year'])

                # If there are common years, collect all years and corresponding first_yes_days
                if common_years:
                    first_yes_days_a = [
                        group_a[group_a['first_yes_year'] == year]['first_yes_day'].values[0]
                        for year in common_years
                    ]
                    first_yes_days_b = [
                        group_b[group_b['first_yes_year'] == year]['first_yes_day'].values[0]
                        for year in common_years
                    ]

                    pairs.append({
                        'species_a': species_a,
                        'phenophase_a': phenophase_a,
                        'species_b': species_b,
                        'phenophase_b': phenophase_b,
                        'matching_years': list(common_years),
                        'first_yes_days_a': first_yes_days_a,
                        'first_yes_days_b': first_yes_days_b,
                    })

    # Return a DataFrame of the pairs
    return pd.DataFrame(pairs)




pairs_list_df = find_species_phenophase_pairs(filtered_pkup_plants)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_row', None)

pairs_list_df

Unnamed: 0,species_a,phenophase_a,species_b,phenophase_b,matching_years,first_yes_days_a,first_yes_days_b
0,3,371,3,467,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 3, 21, 20, 4, 3, 9, 21, 13]","[27, 3, 14, 1, 7, 5, 3, 25, 17, 4, 27, 10, 6, 27]"
1,3,371,3,471,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 3, 21, 20, 3, 9, 21, 13]","[12, 6, 25, 15, 26, 6, 13, 25, 24, 5, 14, 22, 20]"
2,3,371,3,483,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 3, 21, 20, 4, 3, 9, 21, 13]","[27, 3, 14, 1, 7, 5, 3, 18, 17, 4, 27, 10, 6, 23]"
3,3,371,3,500,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 3, 21, 20, 4, 3, 9, 21, 13]","[25, 29, 11, 13, 24, 29, 30, 23, 17, 6, 16, 9, 16, 13]"
4,3,371,3,501,"[2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 21, 20, 4, 3, 9, 21, 13]","[25, 5, 17, 13, 24, 6, 23, 8, 6, 16, 24, 16, 21]"
5,3,371,28,371,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 3, 21, 20, 4, 3, 9, 21, 13]","[11, 19, 18, 7, 10, 10, 17, 4, 3, 4, 3, 24, 16, 13]"
6,3,371,28,467,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 3, 21, 20, 4, 3, 9, 21, 13]","[13, 19, 2, 13, 17, 19, 17, 9, 8, 11, 10, 24, 23, 16]"
7,3,371,28,471,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 3, 21, 20, 3, 9, 21, 13]","[12, 23, 20, 15, 11, 25, 13, 19, 24, 30, 14, 15, 17]"
8,3,371,28,483,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 3, 21, 20, 4, 3, 9, 21, 13]","[13, 19, 2, 13, 17, 14, 17, 9, 8, 11, 3, 24, 23, 16]"
9,3,371,28,498,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 3, 21, 20, 3, 9, 21, 13]","[12, 23, 20, 12, 1, 27, 18, 21, 3, 23, 14, 24, 17]"


In [39]:
def order_lists_within_dataframe(df):
    """
    Sorts the lists in `matching_years`, `first_yes_days_a`, and `first_yes_days_b` columns
    within each row of the DataFrame, aligning them by the sorted order of `matching_years`.

    Parameters:
        df (DataFrame): A DataFrame containing `matching_years`, `first_yes_days_a`, and `first_yes_days_b` columns.

    Returns:
        DataFrame: A DataFrame with sorted lists in `matching_years` and aligned corresponding columns.
    """
    def sort_and_align(row):
        # Combine matching_years with first_yes_days_a and first_yes_days_b
        combined = sorted(zip(row['matching_years'], row['first_yes_days_a'], row['first_yes_days_b']))
        
        # Unzip the sorted tuples back into separate lists
        sorted_years, sorted_days_a, sorted_days_b = zip(*combined)
        
        # Update the row's lists with the sorted values
        row['matching_years'] = list(sorted_years)
        row['first_yes_days_a'] = list(sorted_days_a)
        row['first_yes_days_b'] = list(sorted_days_b)
        
        return row

    # Apply the sorting function to every row in the DataFrame
    sorted_df = df.apply(sort_and_align, axis=1)

    return sorted_df
    
ordered_pairs = order_lists_within_dataframe(pairs_list_df)

In [41]:
# Filter the DataFrame for the specific pair
specific_pair = ordered_pairs[
    (ordered_pairs['species_a'] == 93) & (ordered_pairs['phenophase_a'] == 483) &
    (ordered_pairs['species_b'] == 1172) & (ordered_pairs['phenophase_b'] == 501)
]

# Display the specific pair
specific_pair

Unnamed: 0,species_a,phenophase_a,species_b,phenophase_b,matching_years,first_yes_days_a,first_yes_days_b
367,93,483,1172,501,"[2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]","[2, 5, 21, 9, 11, 6, 14, 14, 1, 27, 21, 11, 9, 8]","[12, 28, 21, 9, 30, 6, 16, 17, 7, 1, 26, 9, 16, 21]"


In [45]:
from scipy.stats import kendalltau

def calculate_kendall_tau(df):
    """
    Calculates the Kendall tau correlation coefficient and p-value for each row in the DataFrame,
    based on the `first_yes_days_a` and `first_yes_days_b` lists.
    Returns a DataFrame with species, phenophases, number of observations, Kendall tau, and p-value.

    Parameters:
        df (DataFrame): A DataFrame containing `first_yes_days_a` and `first_yes_days_b` columns.

    Returns:
        DataFrame: A DataFrame with columns:
                   - species_a
                   - phenophase_a
                   - species_b
                   - phenophase_b
                   - number_of_observations
                   - kendall_tau
                   - p_value
    """
    results = []

    for _, row in df.iterrows():
        # Get the first_yes_days lists for both species
        days_a = row['first_yes_days_a']
        days_b = row['first_yes_days_b']
        
        # Ensure lists are not empty and have the same length
        if len(days_a) > 0 and len(days_a) == len(days_b):
            # Calculate Kendall tau and p-value
            tau, p_value = kendalltau(days_a, days_b)
        else:
            tau, p_value = None, None
        
        # Append the results for this row
        results.append({
            'species_a': row['species_a'],
            'phenophase_a': row['phenophase_a'],
            'species_b': row['species_b'],
            'phenophase_b': row['phenophase_b'],
            'number_of_observations': len(days_a),  # Number of years
            'kendall_tau': tau,
            'p_value': p_value
        })

    # Convert the results to a DataFrame
    return pd.DataFrame(results)

results = calculate_kendall_tau(ordered_pairs)
results

Unnamed: 0,species_a,phenophase_a,species_b,phenophase_b,number_of_observations,kendall_tau,p_value
0,3,371,3,467,14,0.333333,0.107792
1,3,371,3,471,13,0.145699,0.498195
2,3,371,3,483,14,0.329567,0.109187
3,3,371,3,500,14,0.205718,0.319392
4,3,371,3,501,13,0.12,0.578762
5,3,371,28,371,14,0.205718,0.319392
6,3,371,28,467,14,0.0,1.0
7,3,371,28,471,13,-0.026318,0.902165
8,3,371,28,483,14,0.079551,0.699012
9,3,371,28,498,13,0.092717,0.666447


In [47]:
# Filter the DataFrame for the specific pair
specific_pair = results[
    (results['species_a'] == 93) & (results['phenophase_a'] == 483) &
    (results['species_b'] == 1172) & (results['phenophase_b'] == 501)
]

# Display the specific pair
specific_pair

Unnamed: 0,species_a,phenophase_a,species_b,phenophase_b,number_of_observations,kendall_tau,p_value
367,93,483,1172,501,14,0.137145,0.506814
