In [4]:
import pandas as pd
from scipy.stats import kendalltau

# Load the raw observation table from disk.
df = pd.read_csv("raw_data.csv")

# Narrow to the rows recorded at the 'GRSM-PKnob Upper Plot' site ...
pkup_df = df.loc[df['site_name'] == 'GRSM-PKnob Upper Plot']

# ... and, within that site, to the rows whose kingdom is 'Plantae'.
pkup_plants = pkup_df.loc[pkup_df["kingdom"] == "Plantae"]


In [5]:
def get_valid_combinations(original_df, min_shared_years=10):
    """
    Finds species/phenophase pairs that share enough observation years.

    Every distinct (`species_id`, `phenophase_id`) series in `original_df` is
    compared with every other one; a pair is kept when the two series have at
    least `min_shared_years` distinct `first_yes_year` values in common.

    Args:
    - original_df (DataFrame): Observations with `species_id`, `phenophase_id`
      and `first_yes_year` columns.
    - min_shared_years (int): Minimum number of shared years required to keep
      a pair. Defaults to 10.

    Returns:
    - DataFrame: One row per unordered pair with the columns `phenophase_one`,
      `phenophase_two`, `species_one`, `species_two` and `matching_years`
      (a tuple of the shared years in ascending order). Rows are sorted by
      `species_one`, `species_two`, `phenophase_one`, `phenophase_two`. When
      no pair qualifies the frame is empty but still has these columns.
    """
    columns = [
        'phenophase_one', 'phenophase_two',
        'species_one', 'species_two',
        'matching_years',
    ]

    # Collect the distinct years of each (species, phenophase) series once,
    # instead of re-filtering the whole frame for every pair of rows.
    # Rows without a year cannot contribute to a shared year, so drop them.
    observed = original_df.dropna(subset=['first_yes_year'])
    years_by_series = {
        series_key: set(years)
        for series_key, years in observed.groupby(
            ['species_id', 'phenophase_id']
        )['first_yes_year']
    }

    # Sorting the (species, phenophase) keys and only pairing each key with
    # the keys after it yields every unordered pair exactly once, never pairs
    # a series with itself, and keeps each phenophase attached to its own
    # species (sorting the species ids alone would swap the labels).
    series_keys = sorted(years_by_series)

    valid_combinations = []
    for position, (species_one, phenophase_one) in enumerate(series_keys):
        years_one = years_by_series[(species_one, phenophase_one)]
        for species_two, phenophase_two in series_keys[position + 1:]:
            matching_years = years_one & years_by_series[(species_two, phenophase_two)]
            if len(matching_years) < min_shared_years:
                continue
            valid_combinations.append({
                'phenophase_one': phenophase_one,
                'phenophase_two': phenophase_two,
                'species_one': species_one,
                'species_two': species_two,
                # Sorted so the stored tuple does not depend on set ordering.
                'matching_years': tuple(sorted(matching_years)),
            })

    # Passing `columns` keeps the schema stable even when nothing qualified,
    # so the sort below (and downstream consumers) never hit a missing column.
    final_valid_combos = pd.DataFrame(valid_combinations, columns=columns)
    final_valid_combos = final_valid_combos.sort_values(
        by=['species_one', 'species_two', 'phenophase_one', 'phenophase_two']
    ).reset_index(drop=True)
    return final_valid_combos


In [6]:
def align_first_yes_days(valid_combos, original_df):
    """
    Builds year-aligned `first_yes_day` lists for each species/phenophase pair.

    Args:
    - valid_combos (DataFrame): Pairs to align, with the columns `species_one`,
      `species_two`, `phenophase_one`, `phenophase_two` and `matching_years`.
    - original_df (DataFrame): Observations with `species_id`, `phenophase_id`,
      `first_yes_year` and `first_yes_day` columns.

    Returns:
    - DataFrame: One row per input pair carrying the pair's identifiers, the
      years present for both members (`matching_years`) and the per-year
      minimum `first_yes_day` of each member (`species_one_days`,
      `species_two_days`), all in the same year order.
    """

    def earliest_day_per_year(species_id, phenophase_id, years):
        # Restrict to one species/phenophase within the candidate years, then
        # take the smallest first_yes_day recorded in each year.
        selector = (
            (original_df['species_id'] == species_id)
            & (original_df['phenophase_id'] == phenophase_id)
            & original_df['first_yes_year'].isin(years)
        )
        return original_df[selector].groupby('first_yes_year')['first_yes_day'].min()

    records = []
    for _, combo in valid_combos.iterrows():
        candidate_years = combo['matching_years']

        first_series = earliest_day_per_year(
            combo['species_one'], combo['phenophase_one'], candidate_years
        )
        second_series = earliest_day_per_year(
            combo['species_two'], combo['phenophase_two'], candidate_years
        )

        # Only years that survive for both members are kept, so the two day
        # lists line up element by element.
        common_years = first_series.index.intersection(second_series.index)

        records.append({
            'phenophase_one': combo['phenophase_one'],
            'phenophase_two': combo['phenophase_two'],
            'species_one': combo['species_one'],
            'species_two': combo['species_two'],
            'matching_years': list(common_years),
            'species_one_days': first_series.loc[common_years].tolist(),
            'species_two_days': second_series.loc[common_years].tolist(),
        })

    return pd.DataFrame(records)


In [7]:
def compute_kendall_tau(aligned_combos):
    """
    Runs Kendall's tau on the two day lists of every aligned pair.

    Args:
    - aligned_combos (DataFrame): One row per pair with `species_one_days` and
      `species_two_days` lists, a `matching_years` list, and the identifying
      `phenophase_one`, `phenophase_two`, `species_one`, `species_two` columns.

    Returns:
    - DataFrame: One row per pair with the identifying columns followed by
      `tau`, `p_value` and `num_years` (the length of `matching_years`).
    """
    identifier_keys = ('phenophase_one', 'phenophase_two', 'species_one', 'species_two')
    records = []

    for _, combo in aligned_combos.iterrows():
        tau, p_value = kendalltau(combo['species_one_days'], combo['species_two_days'])

        # Copy the identifiers first so the output column order is
        # identifiers, then the statistics.
        record = {key: combo[key] for key in identifier_keys}
        record['tau'] = tau
        record['p_value'] = p_value
        record['num_years'] = len(combo['matching_years'])
        records.append(record)

    return pd.DataFrame(records)


In [8]:
# Marker printed before the pipeline starts.
print("Hi")

# Pipeline: qualifying pairs -> year-aligned day lists -> Kendall's tau per pair.
# Each intermediate frame stays bound to its own name for later inspection.
valid_combos = get_valid_combinations(pkup_plants)
aligned_combos = align_first_yes_days(valid_combos, pkup_plants)
final_analysis = compute_kendall_tau(aligned_combos)

# Marker printed after the pipeline finishes.
# NOTE(review): because the cell ends with a print, no DataFrame is rendered
# here; evaluate `final_analysis` in its own cell to view the results.
print("Hi")

Hi
Hi
