In [None]:
import pandas as pd
from scipy.stats import kendalltau

df = pd.read_csv("raw_data.csv")
pkup_df = df[df['site_name'] == 'GRSM-PKnob Upper Plot']
pkup_plants = pkup_df[pkup_df["kingdom"] == "Plantae"]

def get_valid_combinations(original_df):
    valid_combinations = []
    
    # Generate all possible combinations of species and phenophases
    all_species_combinations = original_df.merge(
        original_df, how='cross', suffixes=('_one', '_two')
    )
    
    # Iterate through each combination
    for _, row in all_species_combinations.iterrows():
        # Ensure they are not the same species
        if row['species_id_one'] != row['species_id_two']:
            # Get the years for species one and species two
            plant_one_years = set(
                original_df[
                    (original_df['species_id'] == row['species_id_one']) & 
                    (original_df['phenophase_id'] == row['phenophase_id_one'])
                ]['first_yes_year']
            )
            plant_two_years = set(
                original_df[
                    (original_df['species_id'] == row['species_id_two']) & 
                    (original_df['phenophase_id'] == row['phenophase_id_two'])
                ]['first_yes_year']
            )

            # Check if the years match
            if plant_one_years == plant_two_years:
                species_one, species_two = sorted([row['species_id_one'], row['species_id_two']])
                valid_combinations.append({
                    'species_one': species_one,
                    'species_two': species_two,
                    'phenophase_one': row['phenophase_id_one'],
                    'phenophase_two': row['phenophase_id_two'],
                    'matching_years': tuple(plant_one_years)
                })

    final_valid_combos = pd.DataFrame(valid_combinations).drop_duplicates()
    final_valid_combos = final_valid_combos.sort_values(
        by=['species_one', 'species_two', 'phenophase_one', 'phenophase_two']
    ).reset_index(drop=True)
    return final_valid_combos

combos = get_valid_combinations(pkup_plants)
combos

In [None]:
def filter_by_observations(df, min_years = 10):
    return df[df['matching_years'].apply(len) >= min_years]

filtered_combos = filter_by_observations(combos)
pd.set_option('display.max_rows', None)
filtered_combos
print("Hello")

In [49]:
def align_first_yes_days(valid_combos, original_df):
    """
    Align the `first_yes_day` values for species pairs based on shared years.

    Args:
    - valid_combos (DataFrame): DataFrame containing valid combinations of species pairs and shared years.
    - original_df (DataFrame): The original DataFrame with phenophase and species data.

    Returns:
    - DataFrame: A DataFrame with aligned `first_yes_day` values for each species pair.
    """
    aligned_data = []

    for _, row in valid_combos.iterrows():
        species_one = row['species_one']
        species_two = row['species_two']
        phenophase_one = row['phenophase_one']
        phenophase_two = row['phenophase_two']
        shared_years = row['matching_years']

        # Filter the original data for species_one and species_two in shared years
        species_one_data = original_df[
            (original_df['species_id'] == species_one) &
            (original_df['phenophase_id'] == phenophase_one) &
            (original_df['first_yes_year'].isin(shared_years))
        ]
        species_two_data = original_df[
            (original_df['species_id'] == species_two) &
            (original_df['phenophase_id'] == phenophase_two) &
            (original_df['first_yes_year'].isin(shared_years))
        ]

        # Create a mapping of year -> first_yes_day for both species
        species_one_days_map = species_one_data.set_index('first_yes_year')['first_yes_day'].to_dict()
        species_two_days_map = species_two_data.set_index('first_yes_year')['first_yes_day'].to_dict()

        # Only include years where both species have data
        aligned_years = [
            year for year in shared_years if year in species_one_days_map and year in species_two_days_map
        ]
        species_one_days = [species_one_days_map[year] for year in aligned_years]
        species_two_days = [species_two_days_map[year] for year in aligned_years]

        # Append aligned data to the results
        aligned_data.append({
            'phenophase_one': phenophase_one,
            'phenophase_two': phenophase_two,
            'species_one': species_one,
            'species_two': species_two,
            'matching_years': aligned_years,
            'species_one_days': species_one_days,
            'species_two_days': species_two_days
        })

    return pd.DataFrame(aligned_data)

aligned_df = align_first_yes_days(combos, pkup_plants)
pd.set_option('display.max_rows', None)
aligned_df

Unnamed: 0,phenophase_one,phenophase_two,species_one,species_two,matching_years,species_one_days,species_two_days
0,371,371,3,28,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 3, 21, 20, 4, 3, 9, 21, 13]","[11, 19, 18, 7, 10, 10, 17, 4, 3, 4, 3, 24, 16, 13]"
1,371,467,3,28,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 3, 21, 20, 4, 3, 9, 21, 13]","[13, 19, 2, 13, 17, 19, 17, 9, 8, 11, 10, 24, 23, 16]"
2,371,483,3,28,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 27, 18, 1, 12, 4, 3, 21, 20, 4, 3, 9, 21, 13]","[13, 19, 2, 13, 17, 14, 17, 9, 8, 11, 3, 24, 23, 16]"
3,371,500,3,28,"[2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024, 2012, 2013, 2015]","[27, 27, 18, 1, 12, 4, 21, 20, 3, 9, 13]","[24, 14, 14, 7, 1, 7, 10, 21, 28, 10, 11]"
4,467,371,3,28,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 3, 14, 1, 7, 5, 3, 25, 17, 4, 27, 10, 6, 27]","[11, 19, 18, 7, 10, 10, 17, 4, 3, 4, 3, 24, 16, 13]"
5,467,467,3,28,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 3, 14, 1, 7, 5, 3, 25, 17, 4, 27, 10, 6, 27]","[13, 19, 2, 13, 17, 19, 17, 9, 8, 11, 10, 24, 23, 16]"
6,467,483,3,28,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2011, 2012, 2013, 2014, 2015]","[27, 3, 14, 1, 7, 5, 3, 25, 17, 4, 27, 10, 6, 27]","[13, 19, 2, 13, 17, 14, 17, 9, 8, 11, 3, 24, 23, 16]"
7,467,500,3,28,"[2016, 2017, 2018, 2019, 2020, 2021, 2023, 2024, 2012, 2013, 2015]","[27, 3, 14, 1, 7, 5, 25, 17, 27, 10, 27]","[24, 14, 14, 7, 1, 7, 10, 21, 28, 10, 11]"
8,471,471,3,28,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2012, 2013, 2014, 2015]","[12, 6, 25, 15, 26, 6, 13, 25, 24, 5, 14, 22, 20]","[12, 23, 20, 15, 11, 25, 13, 19, 24, 30, 14, 15, 17]"
9,471,498,3,28,"[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2012, 2013, 2014, 2015]","[12, 6, 25, 15, 26, 6, 13, 25, 24, 5, 14, 22, 20]","[12, 23, 20, 12, 1, 27, 18, 21, 3, 23, 14, 24, 17]"


In [43]:
def compute_kendall_tau(aligned_combos):
    """
    Computes Kendall's tau for aligned species pairs.

    Args:
    - aligned_combos (DataFrame): DataFrame with aligned first_yes_day values.

    Returns:
    - DataFrame: Results with tau, p-value, and metadata.
    """
    results = []

    for _, row in aligned_combos.iterrows():
        species_one_days = row['species_one_days']
        species_two_days = row['species_two_days']

        # Compute Kendall's tau only if both species have data
        if species_one_days and species_two_days:
            tau, p_value = kendalltau(species_one_days, species_two_days)
        else:
            tau, p_value = None, None

        results.append({
            'phenophase_one': row['phenophase_one'],
            'phenophase_two': row['phenophase_two'],
            'species_one': row['species_one'],
            'species_two': row['species_two'],
            'tau': tau,
            'p_value': p_value,
            'num_years': len(row['matching_years'])
        })

    return pd.DataFrame(results)

final_analysis = compute_kendall_tau(aligned_df)
pd.set_option('display.max_rows', None)
final_analysis

Unnamed: 0,phenophase_one,phenophase_two,species_one,species_two,tau,p_value,num_years
0,371,371,3,28,0.205718,0.319392,14
1,371,467,3,28,0.0,1.0,14
2,371,483,3,28,0.079551,0.699012,14
3,371,500,3,28,0.358554,0.134353,11
4,467,371,3,28,-0.160003,0.438203,14
5,467,467,3,28,-0.18286,0.375629,14
6,467,483,3,28,-0.215923,0.29346,14
7,467,500,3,28,0.519231,0.031982,11
8,471,471,3,28,-0.143794,0.499775,13
9,471,498,3,28,-0.315789,0.140187,13


In [None]:
import pandas as pd

def get_valid_combinations(original_df):
    # Initialize a list to store valid combinations
    valid_combinations = []
    
    # Generate all possible combinations of species
    all_species_combinations = original_df.merge(
        original_df, how='cross', suffixes=('_one', '_two')
    )
    
    # Iterate through each combination
    for _, row in all_species_combinations.iterrows():
        # Ensure they are not the same species
        if row['species_id_one'] != row['species_id_two']:
            # Get the years and first_yes_days for species one
            plant_one_data = original_df[
                (original_df['species_id'] == row['species_id_one'])
            ][['first_yes_year', 'first_yes_day']].drop_duplicates().sort_values('first_yes_year')
            
            # Get the years and first_yes_days for species two
            plant_two_data = original_df[
                (original_df['species_id'] == row['species_id_two'])
            ][['first_yes_year', 'first_yes_day']].drop_duplicates().sort_values('first_yes_year')
            
            # Extract years and days as sets
            plant_one_years = set(plant_one_data['first_yes_year'])
            plant_two_years = set(plant_two_data['first_yes_year'])
            
            # Find the intersection of years
            matching_years = plant_one_years.intersection(plant_two_years)
            
            # Check if there are at least 10 matching years
            if len(matching_years) >= 10:
                matching_years = sorted(matching_years)
                
                # Extract corresponding first_yes_days for the matching years
                plant_one_days = [
                    plant_one_data[plant_one_data['first_yes_year'] == year]['first_yes_day'].iloc[0]
                    for year in matching_years
                ]
                plant_two_days = [
                    plant_two_data[plant_two_data['first_yes_year'] == year]['first_yes_day'].iloc[0]
                    for year in matching_years
                ]
                
                species_one, species_two = sorted([row['species_id_one'], row['species_id_two']])
                valid_combinations.append({
                    'species_one': species_one,
                    'species_two': species_two,
                    'matching_years': tuple(matching_years),
                    'first_yes_days_species_one': plant_one_days,
                    'first_yes_days_species_two': plant_two_days
                })
    
    # Convert to DataFrame and remove duplicates
    final_valid_combos = pd.DataFrame(valid_combinations).drop_duplicates()
    final_valid_combos = final_valid_combos.sort_values(
        by=['species_one', 'species_two']
    ).reset_index(drop=True)
    return final_valid_combos

# Apply the function to get valid combinations
combos = get_valid_combinations(pkup_plants)
combos
