In [7]:
import pandas as pd
from scipy.stats import kendalltau

df = pd.read_csv("raw_data.csv")
pkup_df = df[df['site_name'] == 'GRSM-PKnob Upper Plot']
pkup_plants = pkup_df[pkup_df["kingdom"] == "Plantae"]

def filter_species_with_min_days(df, min_days=10):
    """
    Filters species (and phenophases) that have at least `min_days` unique first_yes_days 
    corresponding to the first observed day per year.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing columns:
                           'species_id', 'phenophase_id', 'first_yes_year', 'first_yes_day'.
        min_days (int): Minimum number of unique first_yes_days required to keep a species.

    Returns:
        pd.DataFrame: Filtered DataFrame containing only species/phenophases with at least `min_days` valid entries.
    """
    # Ensure we're only working with the first observed day per year
    first_observed_df = df.sort_values(['species_id', 'phenophase_id', 'first_yes_year', 'first_yes_day']).drop_duplicates(
        subset=['species_id', 'phenophase_id', 'first_yes_year'], keep='first'
    )

    # Count the number of unique first_yes_days for each species/phenophase
    valid_counts = first_observed_df.groupby(['species_id', 'phenophase_id'])['first_yes_day'].nunique()

    # Filter for species/phenophases with at least min_days unique first_yes_days
    valid_species = valid_counts[valid_counts >= min_days].index

    # Return the filtered DataFrame
    filtered_df = first_observed_df[
        first_observed_df.set_index(['species_id', 'phenophase_id']).index.isin(valid_species)
    ]
    
    return filtered_df.reset_index(drop=True)

# Usage
filtered_pkup_plants = filter_species_with_min_days(pkup_plants, min_days=10)
filtered_pkup_plants
print(len(filtered_pkup_plants)

Hello


In [None]:
def find_species_phenophase_pairs(data):
    """
    Finds all species pairs with all possible phenophase pairs based on the first_yes_day.
    Ensures valid matches only include years where both species have observations.

    Parameters:
        data (DataFrame): The filtered dataset containing columns for species_id, phenophase_id,
                          first_yes_day, and first_yes_year.

    Returns:
        DataFrame: Contains columns for species pairs, phenophase pairs, years, and first_yes_days.
    """
    # Filter required columns
    filtered_data = data[['species_id', 'phenophase_id', 'first_yes_day', 'first_yes_year']].dropna()

    # Ensure only the first `first_yes_day` for each species-phenophase-year combination is considered
    filtered_data = filtered_data.sort_values('first_yes_day').drop_duplicates(
        subset=['species_id', 'phenophase_id', 'first_yes_year']
    )

    # Initialize a list to store pairs
    pairs = []

    # Create all possible pairs of species and phenophases
    grouped_data = filtered_data.groupby(['species_id', 'phenophase_id'])
    for (species_a, phenophase_a), group_a in grouped_data:
        for (species_b, phenophase_b), group_b in grouped_data:
            # Avoid duplicate pairs (A-B and B-A are the same) and self-pairs unless explicitly allowed
            if (species_a, phenophase_a) < (species_b, phenophase_b):
                # Find common years between the two groups
                common_years = set(group_a['first_yes_year']).intersection(group_b['first_yes_year'])

                # If there are common years, add the pair details
                for year in common_years:
                    day_a = group_a[group_a['first_yes_year'] == year]['first_yes_day'].values[0]
                    day_b = group_b[group_b['first_yes_year'] == year]['first_yes_day'].values[0]
                    pairs.append({
                        'species_a': species_a,
                        'phenophase_a': phenophase_a,
                        'species_b': species_b,
                        'phenophase_b': phenophase_b,
                        'year': year,
                        'first_yes_day_a': day_a,
                        'first_yes_day_b': day_b
                    })

    # Return a DataFrame of the pairs
    return pd.DataFrame(pairs)
filtered = find_species_phenophase(filtered_pkup_plants)
filtered

In [3]:
def get_valid_combinations(original_df):
    # Initialize a list to store valid combinations
    valid_combinations = []
    
    # Generate all possible combinations of species
    all_species_combinations = original_df.merge(
        original_df, how='cross', suffixes=('_one', '_two')
    )
    
    # Iterate through each combination
    for _, row in all_species_combinations.iterrows():
        # Ensure they are not the same species
        if row['species_id_one'] != row['species_id_two']:
            # Get the years and first_yes_days for species one
            plant_one_data = original_df[
                (original_df['species_id'] == row['species_id_one'])
            ][['first_yes_year', 'first_yes_day']].drop_duplicates().sort_values('first_yes_year')
            
            # Get the years and first_yes_days for species two
            plant_two_data = original_df[
                (original_df['species_id'] == row['species_id_two'])
            ][['first_yes_year', 'first_yes_day']].drop_duplicates().sort_values('first_yes_year')
            
            # Extract years and days as sets
            plant_one_years = set(plant_one_data['first_yes_year'])
            plant_two_years = set(plant_two_data['first_yes_year'])
            
            # Find the intersection of years
            matching_years = plant_one_years.intersection(plant_two_years)
            
            # Check if there are at least 10 matching years
            if len(matching_years) >= 10:
                matching_years = sorted(matching_years)
                
                # Extract corresponding first_yes_days for the matching years
                plant_one_days = [
                    plant_one_data[plant_one_data['first_yes_year'] == year]['first_yes_day'].iloc[0]
                    for year in matching_years
                ]
                plant_two_days = [
                    plant_two_data[plant_two_data['first_yes_year'] == year]['first_yes_day'].iloc[0]
                    for year in matching_years
                ]
                
                species_one, species_two = sorted([row['species_id_one'], row['species_id_two']])
                valid_combinations.append({
                    'species_one': species_one,
                    'species_two': species_two,
                    'matching_years': tuple(matching_years),
                    'first_yes_days_species_one': plant_one_days,
                    'first_yes_days_species_two': plant_two_days
                })
    
    # Convert to DataFrame and remove duplicates
    final_valid_combos = pd.DataFrame(valid_combinations).drop_duplicates()
    final_valid_combos = final_valid_combos.sort_values(
        by=['species_one', 'species_two']
    ).reset_index(drop=True)
    return final_valid_combos

# Apply the function to get valid combinations
combos = get_valid_combinations(pkup_plants)
combos
print("Hello")

TypeError: unhashable type: 'list'