# Create Species List for Each Camera Trap Array

The species list will have the following columns:
- Site Name (Array Name)
- Year
- Habitat Types 
- Mean Longitude 
- Mean Latitude 
- A comma-separated list of species names found at the given site

In [1]:
import ast
import os
import warnings

import numpy as np
import pandas as pd

In [2]:
# Silence every Python warning for the rest of the session.
# NOTE: this is a blanket filter, so library deprecation warnings are hidden as well.
warnings.filterwarnings('ignore')

In [3]:
# Machine-specific absolute paths (adjust before running elsewhere).
# data_dir holds the cleaned observation CSV; out_dir holds the derived
# CSVs read by this notebook and receives the file it writes.
data_dir = r'/Users/neelima/Documents/ML_Projects/data'
out_dir = r'/Users/neelima/Documents/ML_Projects/output'

In [4]:
# Load the cleaned observation table from data_dir.
cleaned_df = pd.read_csv(f'{data_dir}/Data_Clustered_Cleaned_50km_WithSciName.csv')
# Trailing expression: the notebook displays (rows, columns).
cleaned_df.shape

(772478, 30)

In [5]:
# List the column names of the cleaned observation table.
cleaned_df.columns

Index(['Year', 'Project', 'Camera_Trap_Array', 'Deployment_ID', 'Sequence_ID',
       'Start_Time', 'End_Time', 'Class', 'Order', 'Family', 'Genus',
       'Species', 'Common_Name', 'Age', 'Sex', 'Group_Size', 'Site_Name',
       'Start_Date', 'End_Date', 'Survey_Nights', 'Latitude', 'Longitude',
       'Habitat', 'Development_Level', 'Feature_Type', 'Cluster_Agglo',
       'Centroid_Latitude', 'Centroid_Longitude', 'City_State', 'Sci_Name'],
      dtype='object')

In [6]:
# Load the proximity table (one row per Camera_Trap_Array / Year pair) from out_dir.
array_proximity_with_year_df = pd.read_csv(f'{out_dir}/camera_array_5km_proximity_sites.csv')
# Trailing expression: the notebook displays (rows, columns).
array_proximity_with_year_df.shape

(591, 5)

In [7]:
# Preview the first rows; Proximity_List arrives from the CSV as text, not as a set.
array_proximity_with_year_df.head()

Unnamed: 0,Camera_Trap_Array,Year,Mean_Latitude,Mean_Longitude,Proximity_List
0,ARNWR,2020,35.8094,-75.9057,"{'NC_Forest_Alligator_River_NWR_20_02', 'NC_Fo..."
1,ARNWR,2021,35.8236,-75.8931,"{'NC_Wetland_ARNWR_21_dep_2', 'NC_Wetland_ARNW..."
2,ARNWR,2022,35.8263,-75.8911,"{'NC_Wetland_ARNWR_22_Cam15', 'NC_Wetland_ARNW..."
3,AandM,2022,26.2209,-97.4273,"{'TX_Shrubland_A&M_CCR004', 'TX_Shrubland_A&M_..."
4,Abilene,2019,32.2393,-99.8829,"{'TX_Grassland_Abilene_8', 'TX_Grassland_Abile..."


In [8]:
# List the column names of the proximity table.
array_proximity_with_year_df.columns

Index(['Camera_Trap_Array', 'Year', 'Mean_Latitude', 'Mean_Longitude',
       'Proximity_List'],
      dtype='object')

In [9]:
# Number of distinct camera trap arrays (the same array can appear in several years,
# so this is smaller than the row count).
array_proximity_with_year_df['Camera_Trap_Array'].nunique()

262

In [10]:
# Load the Camera_Trap_Array -> Habitat lookup table from out_dir and preview it.
habitat_df = pd.read_csv(f'{out_dir}/array_habitats.csv')
habitat_df.head()

Unnamed: 0,Camera_Trap_Array,Habitat
0,ARNWR,Wetland
1,AandM,Forest
2,Abilene,Grassland
3,AbitaFlats,Forest
4,Afognak,Forest


### Build the Species List per Site and Year

In [None]:
def parse_proximity_list(s):
    """Convert the text form of a collection of IDs back into a set of strings.

    The value is parsed with ``ast.literal_eval``, which only evaluates
    Python literals. String members are stripped of surrounding whitespace;
    non-string members are ignored.

    Args:
        s: The cell value to parse, e.g. ``"{'id_1', 'id_2'}"``.

    Returns:
        A set of stripped strings. An empty set is returned when ``s`` is
        not a string, is not a valid literal, or does not evaluate to an
        iterable.
    """
    if not isinstance(s, str):
        return set()
    try:
        parsed = ast.literal_eval(s)
        # Iteration stays inside the try: a non-iterable literal such as "5"
        # raises TypeError here and is treated as "no IDs".
        return {item.strip() for item in parsed if isinstance(item, str)}
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only parsing/iteration failures are swallowed; KeyboardInterrupt and
        # SystemExit now propagate instead of being silently converted.
        return set()

# Work on a copy so the table loaded from CSV keeps its raw text column.
proximity_df = array_proximity_with_year_df.copy()
# Proximity_List is read from CSV as text; turn each cell back into a set of IDs.
proximity_df['Proximity_List'] = proximity_df['Proximity_List'].apply(parse_proximity_list)

# Per-array roll-up across all years: union of the ID sets and mean coordinates.
# NOTE(review): `grouped` is not referenced by the loop below; it is kept in
# case other cells rely on it.
grouped = proximity_df.groupby('Camera_Trap_Array').agg({
    'Proximity_List': lambda lists: set().union(*lists),
    'Mean_Latitude': 'mean',
    'Mean_Longitude': 'mean',
}).reset_index()

richness_records = []

# One output record per row of proximity_df (a Camera_Trap_Array / Year pair).
for _, row in proximity_df.iterrows():
    array_id = row['Camera_Trap_Array']
    deployment_ids = row['Proximity_List']

    # Observations in cleaned_df whose Deployment_ID is in this row's Proximity_List.
    # NOTE(review): there is no explicit Year condition here; this presumably
    # relies on the IDs in Proximity_List being year-specific - confirm.
    subset = cleaned_df[cleaned_df['Deployment_ID'].isin(deployment_ids)]

    # Drop missing values first: NaN cannot be sorted against or joined with strings.
    species_list = sorted(subset['Species'].dropna().unique())

    # Distinct habitats recorded for this array in habitat_df.
    array_habitats = habitat_df.loc[habitat_df['Camera_Trap_Array'] == array_id, 'Habitat']
    unique_habitats = sorted(array_habitats.dropna().unique())

    richness_records.append({
        'Site_Name': array_id,
        'Year': row['Year'],
        'Habitat_Types': ', '.join(unique_habitats),
        'Mean_Latitude': round(row['Mean_Latitude'], 4),
        'Mean_Longitude': round(row['Mean_Longitude'], 4),
        'Species_List': ', '.join(species_list),
    })

result_df = pd.DataFrame(richness_records)

In [16]:
# Order the records by site, then by year within each site, and preview the result.
result_df = result_df.sort_values(['Site_Name', 'Year'])
result_df.head()

Unnamed: 0,Site_Name,Year,Habitat_Types,Mean_Latitude,Mean_Longitude,Species_List
0,ARNWR,2020,Wetland,35.8094,-75.9057,"americanus, canadensis, carolinensis, latrans,..."
1,ARNWR,2021,Wetland,35.8236,-75.8931,"americanus, caballus, canadensis, carolinensis..."
2,ARNWR,2022,Wetland,35.8263,-75.8911,"americanus, caballus, canadensis, carolinensis..."
3,AandM,2022,Forest,26.2209,-97.4273,"californicus, familiaris, latrans, lotor, meph..."
4,Abilene,2019,Grassland,32.2393,-99.8829,"catus, cinereoargenteus, latrans, lotor, mephi..."


In [17]:
# Destination of the per-site, per-year species list.
output_path = out_dir + r'/species_list/species_list_by_sites.csv'
# to_csv does not create missing parent directories, so make sure the
# 'species_list' sub-directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index=False: leave the DataFrame index out of the file.
result_df.to_csv(output_path, index=False)