In [1]:
import pandas as pd
from itertools import combinations
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import ScalarFormatter
import numpy as np
import geopandas as gpd
import os
import contextily as ctx
from shapely.geometry import Point

In [2]:
# Location of the cleaned, clustered dataset.
# The default below is a machine-specific absolute path; to run the notebook
# elsewhere, set the DATA_CLUSTERED_CSV environment variable to the CSV's
# location instead of editing this cell.
input_path = os.environ.get(
    "DATA_CLUSTERED_CSV",
    r"C:\Users\sagni\Documents\Personal Files\Research\doi_10_5061_dryad_k0p2ngfhn__v20250410\Data_Clustered_Cleaned.csv",
)

# Read the CSV file into a DataFrame.
# low_memory=False parses the file in a single pass so column dtypes are
# inferred from the whole column rather than chunk by chunk.
df_merged = pd.read_csv(input_path, low_memory=False)

## Year Coverage per Cluster and Jaccard Computation

In [3]:
# Step 1: Keep one row per distinct (Deployment_ID, Cluster_Agglo, Year) triple,
# then summarise each cluster (Cluster_Agglo) by:
#   - record_count:       number of those rows with a non-null Year
#   - unique_years_count: number of distinct years the cluster appears in
key_columns = ['Deployment_ID', 'Cluster_Agglo', 'Year']
unique_combinations = df_merged.drop_duplicates(subset=key_columns)[key_columns]

years_per_cluster = unique_combinations.groupby('Cluster_Agglo')['Year']
deployment_stats = years_per_cluster.agg(
    record_count='count',
    unique_years_count='nunique',
).reset_index()

# Step 2: Total record_count over all clusters that share the same number of
# distinct years, ordered by that number of years.
records_by_year_span = deployment_stats.groupby('unique_years_count')['record_count']
summary = records_by_year_span.sum().sort_index()

print(summary)

unique_years_count
1    1417
2     971
3    1387
4    1264
5    4550
Name: record_count, dtype: int64


In [4]:
# Work on an independent (deep) copy so changes to `final` do not affect df_merged.
final = df_merged.copy(deep=True)

In [9]:
def _build_year_membership(frame, entity_col, member_col):
    """Map each entity to the set of members recorded with it in each year.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``entity_col``, ``member_col`` and ``'Year'``.
    entity_col : str
        Column whose values become the outer keys (e.g. ``'Species'``).
    member_col : str
        Column whose values are collected into the per-year sets
        (e.g. ``'Cluster_Agglo'``).

    Returns
    -------
    defaultdict
        ``entity -> year -> set(member)``.
    """
    membership = defaultdict(lambda: defaultdict(set))
    # Iterate the three columns directly; iterrows() would build a full
    # Series object per row just to read three values.
    for entity, year, member in zip(frame[entity_col], frame['Year'], frame[member_col]):
        membership[entity][year].add(member)
    return membership


def _consecutive_year_jaccard(membership, years, entity_col):
    """Compute Jaccard similarity of member sets for consecutive year pairs.

    Parameters
    ----------
    membership : mapping
        ``entity -> year -> set(member)`` as built by ``_build_year_membership``.
    years : sequence
        Sorted years; adjacent elements are treated as consecutive pairs.
    entity_col : str
        Key under which the entity is stored in each output record.

    Returns
    -------
    list of dict
        One record per (entity, year pair) where the entity has at least one
        member in either year, with keys ``entity_col``, ``'Year1'``,
        ``'Year2'`` and ``'JaccardSimilarity'``.

    Notes
    -----
    An entity present in only one of the two years scores 0.0 (empty
    intersection), so "not recorded that year" is indistinguishable from
    "completely changed" in the output.
    """
    year_pairs = list(zip(years, years[1:]))
    records = []
    for entity, year_map in membership.items():
        for y1, y2 in year_pairs:
            # .get() avoids inserting empty sets into the defaultdict.
            set1 = year_map.get(y1, set())
            set2 = year_map.get(y2, set())
            if not (set1 or set2):
                # Entity absent in both years: no score is recorded.
                continue
            # The union is non-empty here, so the division is always defined.
            records.append({
                entity_col: entity,
                'Year1': y1,
                'Year2': y2,
                'JaccardSimilarity': len(set1 & set2) / len(set1 | set2)
            })
    return records


# Step 1: Filter and dedupe
df = final[['Year', 'Species', 'Cluster_Agglo']].drop_duplicates()

# Ensure Year is sorted as int
df['Year'] = df['Year'].astype(int)
years = sorted(df['Year'].unique())  # [2019, 2020, 2021, 2022, 2023]

# ------------------ Species Movement: Jaccard of Clusters ------------------

# Mapping: Species → Year → Set of Clusters
species_year_clusters = _build_year_membership(df, 'Species', 'Cluster_Agglo')

species_jaccard_scores = _consecutive_year_jaccard(species_year_clusters, years, 'Species')

df_species_movement = pd.DataFrame(species_jaccard_scores)

# ------------------ Location Movement: Jaccard of Species ------------------

# Mapping: Cluster → Year → Set of Species
cluster_year_species = _build_year_membership(df, 'Cluster_Agglo', 'Species')

cluster_jaccard_scores = _consecutive_year_jaccard(cluster_year_species, years, 'Cluster_Agglo')

df_cluster_movement = pd.DataFrame(cluster_jaccard_scores)

## Jaccard Index - Species Movement

#### Most Mobile Species = Lowest Average Jaccard

- **Low Jaccard scores over time** → Highly mobile species  
  Species whose set of occupied clusters (`Cluster_Agglo`) changes substantially from one year to the next.

- **High Jaccard scores over time** → Stationary species with consistent habitat use  
  Species that tend to be recorded in the same clusters across consecutive years.