# Generate Species List with following columns 
- array name 
- year 
- longitude 
- latitude 
- radius 
- habitats  
- sum of all group sizes detected 
- species columns: one column for each species found in the SnapShotUSA dataset(130). It can have value 1 (species present) or 0(species absent)

Notes: 
- The table will have 137 columns 
- For 5 km radius used for the camera trap array sites, we have used mean longitude and mean latitude as the centroid of the compiled single site. for a 5 km radius, using the mean latitude and longitude as the centroid is a reasonable and commonly used approximation, especially in ecological studies like Snapshot USA. This is because for such small distances the Earth's curvature and projection distortions are minimal. 


In [77]:
import pandas as pd 
import warnings
import ast
warnings.filterwarnings("ignore")

In [78]:
data_dir = '/Users/neelima/Documents/ML_Projects/data'
out_dir = '/Users/neelima/Documents/ML_Projects/output'

In [79]:
cleaned_df = pd.read_csv(data_dir + r'/Data_Clustered_Cleaned_50km_WithSciName.csv')
cleaned_df.shape

(772478, 30)

In [80]:
cleaned_df.head()

Unnamed: 0,Year,Project,Camera_Trap_Array,Deployment_ID,Sequence_ID,Start_Time,End_Time,Class,Order,Family,...,Latitude,Longitude,Habitat,Development_Level,Feature_Type,Cluster_Agglo,Centroid_Latitude,Centroid_Longitude,City_State,Sci_Name
0,2019,Snapshot USA 2019,Crupi,AK_Forest_Chilkat_Preserve_1,d58722s1,2019-08-31 06:50:00,2019-08-31 06:50:00,mammalia,carnivora,ursidae,...,59.42643,-136.2225,forest,wild,water source,55,59.413873,-136.116876,Haines Borough - Alaska,Ursus arctos
1,2019,Snapshot USA 2019,Crupi,AK_Forest_Chilkat_Preserve_1,d58722s2,2019-08-31 14:15:00,2019-08-31 14:17:00,mammalia,carnivora,ursidae,...,59.42643,-136.2225,forest,wild,water source,55,59.413873,-136.116876,Haines Borough - Alaska,Ursus arctos
2,2019,Snapshot USA 2019,Crupi,AK_Forest_Chilkat_Preserve_1,d58722s3,2019-08-31 18:22:00,2019-08-31 18:22:00,mammalia,carnivora,ursidae,...,59.42643,-136.2225,forest,wild,water source,55,59.413873,-136.116876,Haines Borough - Alaska,Ursus arctos
3,2019,Snapshot USA 2019,Crupi,AK_Forest_Chilkat_Preserve_1,d58722s4,2019-08-31 20:58:00,2019-08-31 20:58:00,mammalia,carnivora,ursidae,...,59.42643,-136.2225,forest,wild,water source,55,59.413873,-136.116876,Haines Borough - Alaska,Ursus arctos
4,2019,Snapshot USA 2019,Crupi,AK_Forest_Chilkat_Preserve_1,d58722s4,2019-08-31 20:58:00,2019-08-31 20:58:00,mammalia,carnivora,ursidae,...,59.42643,-136.2225,forest,wild,water source,55,59.413873,-136.116876,Haines Borough - Alaska,Ursus arctos


In [82]:
#get list of all unique species in the cleaned_df
unique_species = cleaned_df['Sci_Name'].unique()

# print count of unique species
print(f"Number of unique species: {len(unique_species)}")
#print 10 unique species
#print("10 unique species:", unique_species[:10])

Number of unique species: 130


In [83]:
# read the array proximity data from camera_array_5km_proximity_sites
array_proximity_df = pd.read_csv(out_dir + r'/camera_array_5km_proximity_sites.csv')   
array_proximity_df.head()

Unnamed: 0,Camera_Trap_Array,Year,Mean_Latitude,Mean_Longitude,Proximity_List
0,ARNWR,2020,35.8094,-75.9057,"{'NC_Forest_Alligator_River_NWR_20_02', 'NC_Fo..."
1,ARNWR,2021,35.8236,-75.8931,"{'NC_Wetland_ARNWR_21_dep_2', 'NC_Wetland_ARNW..."
2,ARNWR,2022,35.8263,-75.8911,"{'NC_Wetland_ARNWR_22_Cam15', 'NC_Wetland_ARNW..."
3,AandM,2022,26.2209,-97.4273,"{'TX_Shrubland_A&M_CCR004', 'TX_Shrubland_A&M_..."
4,Abilene,2019,32.2393,-99.8829,"{'TX_Grassland_Abilene_8', 'TX_Grassland_Abile..."


In [84]:
# array habitats from csv file 
habitat_df = pd.read_csv(out_dir + r'/array_habitats.csv')
habitat_df.head()

Unnamed: 0,Camera_Trap_Array,Habitat
0,ARNWR,Wetland
1,AandM,Forest
2,Abilene,Grassland
3,AbitaFlats,Forest
4,Afognak,Forest


In [87]:
# Generate species list for each array id in the array proximity data

def parse_proximity_list(s):
        if isinstance(s, str):
            try:
                parsed = ast.literal_eval(s)
                return set(i.strip() for i in parsed if isinstance(i, str))
            except:
                return set()
        return set()


proximity_df = array_proximity_df.copy()
proximity_df['Proximity_List'] = proximity_df['Proximity_List'].apply(parse_proximity_list)

proximity_df.head()



Unnamed: 0,Camera_Trap_Array,Year,Mean_Latitude,Mean_Longitude,Proximity_List
0,ARNWR,2020,35.8094,-75.9057,"{NC_Forest_Alligator_River_NWR_20_03, NC_Fores..."
1,ARNWR,2021,35.8236,-75.8931,"{NC_Wetland_ARNWR_21_dep_22, NC_Wetland_ARNWR_..."
2,ARNWR,2022,35.8263,-75.8911,"{NC_Wetland_ARNWR_22_Cam11, NC_Wetland_ARNWR_2..."
3,AandM,2022,26.2209,-97.4273,"{TX_Shrubland_A&M_CCR018, TX_Shrubland_A&M_CCR..."
4,Abilene,2019,32.2393,-99.8829,"{TX_Grassland_Abilene_5, TX_Grassland_Abilene_..."


In [95]:
# Iterate through each row in proximity_df to create species list
species_list = []
for _, row in proximity_df.iterrows():
    array_id = row['Camera_Trap_Array']
    deployment_ids = row['Proximity_List']
    mean_lat = row['Mean_Latitude']
    mean_lon = row['Mean_Longitude']
    
    # Filter merged_df for matching deployments and year
    subset = cleaned_df[
        (cleaned_df['Deployment_ID'].isin(deployment_ids)) 
    ]
    
    radius = 5 # Assuming a fixed radius of 5km for simplicity
    
    unique_habitats = habitat_df[habitat_df['Camera_Trap_Array'] == array_id]['Habitat'].values

    #sum of group sizes from subset
    group_size = subset['Group_Size'].sum() 
    
    #unique_habitats = list(unique_habitats)
    
    unique_habitats.sort()
    habitat_list = ', '.join(unique_habitats)   

    species_presence = {}
    for species in unique_species:
        species_presence[species] = 1 if species in subset['Sci_Name'].values else 0 

    # create different columns in species list for each species presence
    # this will create a dictionary with species names as keys and presence (1) or absence (0) as values
    species_presence = {f'{species}': presence for species, presence in species_presence.items()}
    # create dataframe with required columns 
    species_list.append({
        'Array_Name': array_id,
        'Year': row['Year'],
        'Latitude': mean_lat,
        'Longitude': mean_lon,
        'Radius': radius,
        'Habitats': habitat_list,
        'Group_Size': group_size,
        **species_presence
    })

In [96]:
# convert species_list to a dataframe
species_list_df = pd.DataFrame(species_list)
species_list_df.shape

(591, 137)

In [90]:
species_list_df.columns

Index(['Array_Name', 'Year', 'Mean_Latitude', 'Mean_Longitude', 'Radius',
       'Habitats', 'Group_Size', 'Ursus arctos', 'Alces alces',
       'Canis latrans',
       ...
       'Lama glama', 'Microtus ochrogaster', 'Zalophus californianus',
       'Thomomys bottae', 'Neotamias obscurus', 'Urocitellus armatus',
       'Myodes gapperi', 'Neotamias ruficaudus', 'Brachylagus idahoensis',
       'Cynomys leucurus'],
      dtype='object', length=137)

In [91]:
species_list_df.head()

Unnamed: 0,Array_Name,Year,Mean_Latitude,Mean_Longitude,Radius,Habitats,Group_Size,Ursus arctos,Alces alces,Canis latrans,...,Lama glama,Microtus ochrogaster,Zalophus californianus,Thomomys bottae,Neotamias obscurus,Urocitellus armatus,Myodes gapperi,Neotamias ruficaudus,Brachylagus idahoensis,Cynomys leucurus
0,ARNWR,2020,35.8094,-75.9057,5,Wetland,279,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,ARNWR,2021,35.8236,-75.8931,5,Wetland,509,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,ARNWR,2022,35.8263,-75.8911,5,Wetland,645,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,AandM,2022,26.2209,-97.4273,5,Forest,960,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,Abilene,2019,32.2393,-99.8829,5,Grassland,1372,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [92]:
# write the file to csv
species_list_df.to_csv(out_dir + r'/species_list_5km.csv', index=False)
print(f"Species list saved to {out_dir + r'/species_list_5km.csv'}")

Species list saved to /Users/neelima/Documents/ML_Projects/output/species_list_5km.csv


### Possible Focal Species (Species present at more than 100 sites)

In [None]:
# Find Top Species from the species list - species found at more than 100 arrays 
top_species = species_list_df.drop(columns=['Array_Name', 'Year', 'Latitude', 'Longitude', 'Radius', 'Habitats', 'Group_Size']).sum().sort_values(ascending=False)


top_species = top_species[top_species > 100]
print(f"Top species found in more than 100 arrays: {top_species}")

Top species found in more than 100 arrays: Canis latrans               548
Procyon lotor               486
Odocoileus virginianus      485
Canis familiaris            382
Didelphis virginiana        379
Sciurus carolinensis        367
Lynx rufus                  336
Sylvilagus floridanus       298
Mephitis mephitis           261
Vulpes vulpes               236
Tamias striatus             191
Ursus americanus            185
Felis catus                 185
Sciurus niger               177
Urocyon cinereoargenteus    156
Dasypus novemcinctus        135
Tamiasciurus hudsonicus     125
Odocoileus hemionus         122
Sus scrofa                  106
dtype: int64


In [None]:
species_list_df[(species_list_df['Array_Name'] == 'ARNWR') & (species_list_df['Year'] == 2020)].head()
print("Species list for ARNWR in 2020:",species_list_df )
# print names species columns that have value 1 
species_columns = species_list_df.columns[7:]  # Assuming first 7 columns are not species
print(species_columns)

Index(['Ursus arctos', 'Alces alces', 'Canis latrans', 'Lynx canadensis',
       'Canis familiaris', 'Ursus americanus', 'Canis lupus',
       'Tamiasciurus hudsonicus', 'Sciurus carolinensis',
       'Odocoileus virginianus',
       ...
       'Lama glama', 'Microtus ochrogaster', 'Zalophus californianus',
       'Thomomys bottae', 'Neotamias obscurus', 'Urocitellus armatus',
       'Myodes gapperi', 'Neotamias ruficaudus', 'Brachylagus idahoensis',
       'Cynomys leucurus'],
      dtype='object', length=130)
