In [3]:
import pandas as pd
from scipy.stats import kendalltau

# Load the dataset
df = pd.read_csv("kendall_tau_results")
df

Unnamed: 0.1,Unnamed: 0,site_id,species_a,phenophase_a,species_b,phenophase_b,number_of_observations,kendall_tau,p_value
0,0,8180,444,483,444,498,13,-0.526361,0.013950
1,1,8181,444,483,444,498,13,0.091521,0.667584
2,2,8182,3,371,3,467,14,0.333333,0.107792
3,3,8182,3,371,3,483,14,0.329567,0.109187
4,4,8182,3,371,3,500,14,0.205718,0.319392
...,...,...,...,...,...,...,...,...,...
1389,1389,17582,61,467,1212,467,10,0.022222,1.000000
1390,1390,17582,61,483,82,483,11,0.462963,0.050225
1391,1391,17582,61,483,1212,483,11,0.537037,0.023127
1392,1392,17582,82,467,1212,467,10,0.022222,1.000000


In [4]:
# Remove rows where species_a and species_b are the same
filtered_df = df[df['species_a'] != df['species_b']]
filtered_df

Unnamed: 0.1,Unnamed: 0,site_id,species_a,phenophase_a,species_b,phenophase_b,number_of_observations,kendall_tau,p_value
5,5,8182,3,371,28,371,14,0.205718,0.319392
6,6,8182,3,371,28,467,14,0.000000,1.000000
7,7,8182,3,371,28,483,14,0.079551,0.699012
8,8,8182,3,371,93,371,14,0.057471,0.781787
9,9,8182,3,371,93,483,14,0.091954,0.657631
...,...,...,...,...,...,...,...,...,...
1389,1389,17582,61,467,1212,467,10,0.022222,1.000000
1390,1390,17582,61,483,82,483,11,0.462963,0.050225
1391,1391,17582,61,483,1212,483,11,0.537037,0.023127
1392,1392,17582,82,467,1212,467,10,0.022222,1.000000


In [5]:
significant_df = filtered_df[filtered_df["p_value"] <= 0.05]
significant_df

Unnamed: 0.1,Unnamed: 0,site_id,species_a,phenophase_a,species_b,phenophase_b,number_of_observations,kendall_tau,p_value
14,14,8182,3,371,1172,467,14,0.411435,0.046435
28,28,8182,3,467,102,371,14,0.514294,0.012709
34,34,8182,3,467,1172,500,14,0.443210,0.031051
36,36,8182,3,467,1174,371,14,0.483268,0.017756
41,41,8182,3,471,93,498,13,0.431382,0.042916
...,...,...,...,...,...,...,...,...,...
1363,1363,12003,3,498,98,498,12,0.558156,0.012918
1365,1365,12003,12,467,98,371,11,0.537037,0.023127
1382,1382,17582,3,483,61,483,11,0.629630,0.007746
1388,1388,17582,61,467,82,467,10,0.555556,0.028609


In [6]:
# Create a set of unique species pairs
unique_pairs = significant_df.apply(
    lambda row: tuple(sorted([row['species_a'], row['species_b']])), axis=1
).drop_duplicates()

# Count occurrences of each species in unique pairs
species_counts = pd.Series([species for pair in unique_pairs for species in pair]).value_counts()

# Create a new DataFrame for the results
species_counts_df = species_counts.reset_index()
species_counts_df.columns = ['species', 'observed']

# Display the resulting DataFrame
species_counts_df

Unnamed: 0,species,observed
0,3.0,24
1,61.0,16
2,1172.0,14
3,98.0,14
4,102.0,13
5,60.0,13
6,93.0,13
7,970.0,13
8,12.0,13
9,81.0,13


In [7]:
# Filter for rows where species 97 appears in either species_a or species_b
species_97_filtered = significant_df[(significant_df['species_a'] == 97) | (significant_df['species_b'] == 97)]

# Display the filtered DataFrame
species_97_filtered


Unnamed: 0.1,Unnamed: 0,site_id,species_a,phenophase_a,species_b,phenophase_b,number_of_observations,kendall_tau,p_value
437,437,8901,91,483,97,483,13,0.562104,0.008339035
454,454,8901,97,471,1189,471,13,0.724849,0.0008331759
463,463,8901,97,483,1199,483,13,0.552632,0.009839661
466,466,8901,97,498,1189,483,13,-0.450341,0.0362981
1302,1302,11895,82,483,97,483,11,0.574074,0.01518352
1306,1306,11895,97,483,98,483,11,0.888889,0.0001702938
1307,1307,11895,97,483,970,483,11,0.733976,0.001787289
1308,1308,11895,97,483,1172,483,11,0.759259,0.001321871
1309,1309,11895,97,498,98,498,10,1.0,5.511464e-07


In [8]:
pairs = pd.read_csv("valid_pairs")
pairs

Unnamed: 0.1,Unnamed: 0,site_id,species_a,phenophase_a,species_b,phenophase_b,matching_years,first_yes_days_a,first_yes_days_b
0,0,8180,444,483,444,498,"[2012, 2013, 2014, 2015, 2016, 2017, 2018, 201...","[3, 24, 29, 17, 18, 17, 2, 22, 10, 15, 6, 6, 10]","[26, 4, 1, 8, 26, 14, 28, 16, 30, 13, 31, 18, 10]"
1,1,8181,444,483,444,498,"[2012, 2013, 2014, 2015, 2016, 2017, 2018, 201...","[3, 24, 29, 17, 18, 17, 30, 22, 10, 15, 18, 6, 3]","[30, 4, 22, 24, 20, 6, 28, 5, 14, 13, 31, 1, 10]"
2,2,8182,3,371,3,467,"[2011, 2012, 2013, 2014, 2015, 2016, 2017, 201...","[4, 3, 9, 21, 13, 27, 27, 18, 1, 12, 4, 3, 21,...","[4, 27, 10, 6, 27, 27, 3, 14, 1, 7, 5, 3, 25, 17]"
3,3,8182,3,371,3,483,"[2011, 2012, 2013, 2014, 2015, 2016, 2017, 201...","[4, 3, 9, 21, 13, 27, 27, 18, 1, 12, 4, 3, 21,...","[4, 27, 10, 6, 23, 27, 3, 14, 1, 7, 5, 3, 18, 17]"
4,4,8182,3,371,3,500,"[2011, 2012, 2013, 2014, 2015, 2016, 2017, 201...","[4, 3, 9, 21, 13, 27, 27, 18, 1, 12, 4, 3, 21,...","[6, 16, 9, 16, 13, 25, 29, 11, 13, 24, 29, 30,..."
...,...,...,...,...,...,...,...,...,...
1389,1389,17582,61,467,1212,467,"[2015, 2016, 2017, 2018, 2019, 2020, 2021, 202...","[13, 14, 9, 16, 17, 7, 6, 11, 10, 12]","[28, 6, 9, 1, 17, 7, 11, 19, 10, 18]"
1390,1390,17582,61,483,82,483,"[2014, 2015, 2016, 2017, 2018, 2019, 2020, 202...","[30, 13, 6, 9, 16, 17, 7, 11, 11, 10, 12]","[16, 13, 6, 9, 16, 17, 7, 28, 1, 8, 30]"
1391,1391,17582,61,483,1212,483,"[2014, 2015, 2016, 2017, 2018, 2019, 2020, 202...","[30, 13, 6, 9, 16, 17, 7, 11, 11, 10, 12]","[30, 13, 1, 9, 1, 17, 7, 11, 19, 10, 18]"
1392,1392,17582,82,467,1212,467,"[2015, 2016, 2017, 2018, 2019, 2020, 2021, 202...","[13, 6, 9, 16, 17, 7, 2, 1, 8, 12]","[28, 6, 9, 1, 17, 7, 11, 19, 10, 18]"


In [9]:
filtered_pairs = df[df['species_a'] != df['species_b']]
filtered_pairs

Unnamed: 0.1,Unnamed: 0,site_id,species_a,phenophase_a,species_b,phenophase_b,number_of_observations,kendall_tau,p_value
5,5,8182,3,371,28,371,14,0.205718,0.319392
6,6,8182,3,371,28,467,14,0.000000,1.000000
7,7,8182,3,371,28,483,14,0.079551,0.699012
8,8,8182,3,371,93,371,14,0.057471,0.781787
9,9,8182,3,371,93,483,14,0.091954,0.657631
...,...,...,...,...,...,...,...,...,...
1389,1389,17582,61,467,1212,467,10,0.022222,1.000000
1390,1390,17582,61,483,82,483,11,0.462963,0.050225
1391,1391,17582,61,483,1212,483,11,0.537037,0.023127
1392,1392,17582,82,467,1212,467,10,0.022222,1.000000


In [10]:
valid_pairs_copy = filtered_pairs.copy()
# Step 1: Create unique species pairs (ignoring phenophases)
valid_pairs_copy['species_pair'] = valid_pairs_copy.apply(
    lambda row: tuple(sorted([row['species_a'], row['species_b']])), axis=1
)
unique_pairs = valid_pairs_copy.drop_duplicates(subset=['species_pair'])

# Step 2: Count occurrences of each species
species_counts = pd.concat([
    unique_pairs['species_a'],
    unique_pairs['species_b']
]).value_counts().reset_index()

species_counts.columns = ['species', 'raw_expected']

# Step 3: Load observed results (if available) and merge
# Assuming observed_results is the DataFrame with observed counts
observed_results = species_counts_df 

# Merge the total observations with observed results
merged_results = observed_results.merge(species_counts, on='species', how='left')

# Display the final DataFrame
merged_results


Unnamed: 0,species,observed,raw_expected
0,3.0,24,31
1,61.0,16,22
2,1172.0,14,18
3,98.0,14,21
4,102.0,13,22
5,60.0,13,22
6,93.0,13,21
7,970.0,13,16
8,12.0,13,22
9,81.0,13,23


In [11]:
# Filter for rows where species 97 appears in either species_a or species_b
species_1174_filtered = significant_df[(significant_df['species_a'] == 1174) | (significant_df['species_b'] == 1174)]

# Display the filtered DataFrame
species_1174_filtered

Unnamed: 0.1,Unnamed: 0,site_id,species_a,phenophase_a,species_b,phenophase_b,number_of_observations,kendall_tau,p_value
36,36,8182,3,467,1174,371,14,0.483268,0.017756
63,63,8182,3,483,1174,371,14,0.477807,0.01822
150,150,8182,93,371,1174,371,14,0.415835,0.041569
178,178,8182,102,371,1174,371,14,0.491689,0.015534
188,188,8182,102,467,1174,467,14,0.416192,0.045352
194,194,8182,102,471,1174,471,13,0.451622,0.032412
202,202,8182,102,483,1174,467,14,0.439314,0.034638
203,203,8182,102,483,1174,483,14,0.420563,0.040763
224,224,8182,1172,471,1174,471,13,0.802701,0.000177
233,233,8182,1172,498,1174,498,13,0.447368,0.036643


In [12]:
# Filter for rows where species 97 appears in either species_a or species_b
species_1174_filtered_raw = filtered_pairs[(filtered_pairs['species_a'] == 1174) | (filtered_pairs['species_b'] == 1174)]

# Display the filtered DataFrame

species_1174_filtered_raw
species_1174_filtered_raw.to_csv("testing")

In [13]:
# Step 1: Calculate the total raw expected counts
total_raw_expected = merged_results['raw_expected'].sum()

# Step 2: Calculate the proportion of unique pairings for each species
merged_results['proportion'] = merged_results['raw_expected'] / total_raw_expected

# Step 3: Scale the proportions to match the total observed significant pairings
total_observed = merged_results['observed'].sum()
merged_results['expected'] = merged_results['proportion'] * total_observed

# Display the updated DataFrame
pd.set_option('display.max_rows', None)     # Show all rows

merged_results


Unnamed: 0,species,observed,raw_expected,proportion,expected
0,3.0,24,31,0.060311,18.575875
1,61.0,16,22,0.042802,13.182879
2,1172.0,14,18,0.035019,10.785992
3,98.0,14,21,0.040856,12.583658
4,102.0,13,22,0.042802,13.182879
5,60.0,13,22,0.042802,13.182879
6,93.0,13,21,0.040856,12.583658
7,970.0,13,16,0.031128,9.587549
8,12.0,13,22,0.042802,13.182879
9,81.0,13,23,0.044747,13.782101


In [28]:
from scipy.stats import chisquare

# Step 1: Calculate the contribution of each species to the chi-square statistic
merged_results['chi_square_contribution'] = (
    (merged_results['observed'] - merged_results['expected'])**2 
    / merged_results['expected']
)

# Step 2: Calculate the overall chi-square statistic and p-value
chi_square_stat = merged_results['chi_square_contribution'].sum()
degrees_of_freedom = len(merged_results) - 1  # Number of species - 1
chi_square_results = chisquare(
    f_obs=merged_results['observed'], 
    f_exp=merged_results['expected']
)

# Step 3: Extract the p-value
p_value = chi_square_results.pvalue

# Step 4: Print the summary results
print("Chi-Square Statistic:", chi_square_stat)
print("Degrees of Freedom:", degrees_of_freedom)
print("P-Value:", p_value)

# Display the updated DataFrame
merged_results

Chi-Square Statistic: 26.968824482866534
Degrees of Freedom: 34
P-Value: 0.7987868326079247


Unnamed: 0,species,observed,raw_expected,proportion,expected,chi_square_contribution
0,3.0,24,31,0.060311,18.575875,1.583835
1,61.0,16,22,0.042802,13.182879,0.602006
2,1172.0,14,18,0.035019,10.785992,0.957709
3,98.0,14,21,0.040856,12.583658,0.159415
4,102.0,13,22,0.042802,13.182879,0.002537
5,60.0,13,22,0.042802,13.182879,0.002537
6,93.0,13,21,0.040856,12.583658,0.013775
7,970.0,13,16,0.031128,9.587549,1.214578
8,12.0,13,22,0.042802,13.182879,0.002537
9,81.0,13,23,0.044747,13.782101,0.044382
