In [4]:
import pandas as pd
import os
from sklearn.metrics import jaccard_score
from itertools import combinations

# ==== Setup ====
# Folder that holds the two input CSV files; the CSV outputs written further
# down in this script are saved into the same folder.
data_dir = r'C:\Users\sagni\Documents\Personal Files\Research\doi_10_5061_dryad_k0p2ngfhn__v20250410'
seq_path, dep_path = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('ssusa_finalsequences.csv', 'ssusa_finaldeployments.csv')
)

# ==== Load Data ====
# `sequences` is read from the sequences file, `deployments` from the
# deployments file; both are loaded with pandas' default CSV settings.
sequences, deployments = (pd.read_csv(csv_path) for csv_path in (seq_path, dep_path))

# ==== Use Common Columns Only ====
# Columns present in both tables, kept in the order they appear in `sequences`
# so the result is deterministic (a set intersection has arbitrary order).
common_cols = [col for col in sequences.columns if col in deployments.columns]
sequences_common = sequences[common_cols]

# ==== Merge Site & Year from Deployments, keep Species and Group_Size ====
# Site_Name and Year are taken from `deployments` only. If they were also
# carried over from `sequences`, the merge would emit suffixed columns
# (Site_Name_x / Site_Name_y, ...) and the pivots below would raise KeyError.
site_year_cols = ['Site_Name', 'Year']

# Species and Group_Size are selected from the sequence rows directly.
# Joining them back in with a second merge on Deployment_ID would pair every
# sequence of a deployment with every other sequence of that deployment
# (N rows -> N*N rows), inflating the Group_Size sums in the abundance view.
# dict.fromkeys() removes repeated column names while preserving order.
seq_cols = list(dict.fromkeys(
    [col for col in common_cols if col not in site_year_cols]
    + ['Deployment_ID', 'Species', 'Group_Size']
))

# Exact duplicate deployment rows would duplicate every matching sequence,
# so they are collapsed before the join.
deployment_lookup = deployments[['Deployment_ID'] + site_year_cols].drop_duplicates()

# Left join: every sequence row is kept; sequences whose Deployment_ID has no
# match in `deployments` get NaN for Site_Name and Year.
merged = pd.merge(
    sequences[seq_cols],
    deployment_lookup,
    on='Deployment_ID',
    how='left',
)

# ==== Clean Data ====
# Drop rows without a species label, normalise the labels (lower-case, no
# surrounding whitespace) so that spelling variants of one species do not end
# up as separate columns in the pivots, and add a constant presence flag.
#
# The cleaned frame is built with .loc + .assign() rather than by assigning
# columns onto a boolean-filtered slice, which can trigger pandas'
# SettingWithCopyWarning.
has_species = merged['Species'].notna()
merged = merged.loc[has_species].assign(
    Species=lambda df: df['Species'].str.lower().str.strip(),
    presence=1,
)

# ==== View 1: Site-Year Presence Matrix ====
# Rows are (Site_Name, Year) pairs and columns are species. A cell holds the
# maximum `presence` value for that combination; combinations with no rows
# are filled with 0.
site_year_matrix = pd.pivot_table(
    merged,
    values='presence',
    index=['Site_Name', 'Year'],
    columns='Species',
    aggfunc='max',
    fill_value=0,
)
site_year_csv = os.path.join(data_dir, 'community_view_site_year.csv')
site_year_matrix.to_csv(site_year_csv)

# ==== View 2: Site-Year Abundance Matrix ====
# Same layout as the presence matrix, but each cell is the sum of Group_Size
# over the rows of `merged` for that site-year and species (0 when there are
# no such rows).
abundance_matrix = pd.pivot_table(
    merged,
    values='Group_Size',
    index=['Site_Name', 'Year'],
    columns='Species',
    aggfunc='sum',
    fill_value=0,
)
abundance_csv = os.path.join(data_dir, 'community_view_site_year_abundance.csv')
abundance_matrix.to_csv(abundance_csv)

# ==== View 3: Site Multi-Year Presence Matrix ====
# Presence pooled over all years: one row per site, one column per species,
# with the maximum `presence` value per cell and 0 where there are no rows.
multi_year_matrix = pd.pivot_table(
    merged,
    values='presence',
    index='Site_Name',
    columns='Species',
    aggfunc='max',
    fill_value=0,
)
multi_year_csv = os.path.join(data_dir, 'community_view_site_multi_year.csv')
multi_year_matrix.to_csv(multi_year_csv)

# ==== View 4: Species Richness Per Site-Year ====
# Row sums of the site-year presence matrix, flattened into a table with the
# columns Site_Name, Year and Species_Richness.
richness = site_year_matrix.sum(axis=1).reset_index(name='Species_Richness')
richness_csv = os.path.join(data_dir, 'species_richness_per_site_year.csv')
richness.to_csv(richness_csv, index=False)

# ==== View 5: Species Richness Per Site (Multi-Year) ====
# Row sums of the multi-year presence matrix, flattened into a table with the
# columns Site_Name and Species_Richness_MultiYear.
richness_multi_year = multi_year_matrix.sum(axis=1).reset_index(
    name='Species_Richness_MultiYear'
)
richness_multi_year_csv = os.path.join(data_dir, 'species_richness_per_site_multiyear.csv')
richness_multi_year.to_csv(richness_multi_year_csv, index=False)

# ==== View 6: Species List Per Site-Year ====
# Reduce `merged` to its distinct (Site_Name, Year, Species) rows first, then
# collect the species of each site-year into a Python list.
site_year_triples = merged[['Site_Name', 'Year', 'Species']].drop_duplicates()
species_by_site_year = site_year_triples.groupby(['Site_Name', 'Year'])['Species']
site_year_species = species_by_site_year.apply(list).reset_index()

site_year_species_csv = os.path.join(data_dir, 'species_list_site_year.csv')
site_year_species.to_csv(site_year_species_csv, index=False)

# ==== View 7: Species List Per Site (Multi-Year) ====
# Reduce `merged` to its distinct (Site_Name, Species) rows first, then
# collect the species of each site into a Python list.
site_pairs = merged[['Site_Name', 'Species']].drop_duplicates()
species_by_site = site_pairs.groupby('Site_Name')['Species']
site_species = species_by_site.apply(list).reset_index()

site_species_csv = os.path.join(data_dir, 'species_list_site_multiyear.csv')
site_species.to_csv(site_species_csv, index=False)

# ==== View 8: Jaccard Similarity Between Years Per Site ====
# For every site, compare the species presence vectors (rows of
# `site_year_matrix`) of each pair of years available for that site.
# Sites with a single year produce no pairs and therefore no rows.
jaccard_columns = ['Site_Name', 'Year_1', 'Year_2', 'Jaccard_Similarity']
jaccard_results = []
for site, group in site_year_matrix.groupby(level=0):
    years = group.index.get_level_values(1).tolist()
    # Fetch each year's presence vector once instead of once per pair.
    presence_by_year = {year: group.loc[(site, year)].values for year in years}
    for y1, y2 in combinations(years, 2):
        score = jaccard_score(presence_by_year[y1], presence_by_year[y2])
        jaccard_results.append({
            'Site_Name': site,
            'Year_1': y1,
            'Year_2': y2,
            'Jaccard_Similarity': score,
        })

# Passing the column list keeps the expected schema even when no site has two
# or more years. Without it the frame would have no columns at all and the
# turnover step that reads 'Jaccard_Similarity' would raise a KeyError.
jaccard_df = pd.DataFrame(jaccard_results, columns=jaccard_columns)
jaccard_df.to_csv(os.path.join(data_dir, 'jaccard_similarity_between_years.csv'), index=False)

# ==== View 9: Species Turnover Between Years (1 - Jaccard) ====
# Turnover is the complement of the Jaccard similarity. The column is added
# to `jaccard_df` in place, so this second file contains the similarity
# columns plus Turnover.
similarity = jaccard_df['Jaccard_Similarity']
jaccard_df['Turnover'] = 1 - similarity
turnover_csv = os.path.join(data_dir, 'species_turnover_between_years.csv')
jaccard_df.to_csv(turnover_csv, index=False)

print("✅ All community views, richness, species lists, and temporal metrics created successfully.")


  site_year_matrix = merged.pivot_table(
  merged.groupby(['Site_Name', 'Species'])['presence']


✅ Done: Created species-community matrices (1-year and multi-year).
