In [1]:
import pandas as pd
import os

In [2]:
# Working folder for this notebook. Later cells read their input CSVs from it
# and write the merged CSV back into it.
save_dir = r'C:\Users\sagni\Documents\Personal Files\Research\doi_10_5061_dryad_k0p2ngfhn__v20250410'
# exist_ok=True makes re-running this cell harmless once the folder exists.
os.makedirs(name=save_dir, exist_ok=True)

In [3]:
# Both input tables are read from the notebook's working folder.
data_dir = save_dir

# Build the file paths with os.path.join instead of appending a hard-coded
# backslash, so the separator always matches the platform running the notebook.
sequences_path = os.path.join(data_dir, 'ssusa_finalsequences.csv')
deployments_path = os.path.join(data_dir, 'ssusa_finaldeployments.csv')

# Load the sequences table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids columns ending up with mixed types.
# NOTE(review): the recorded output of this cell echoes this read_csv line, which
# looks like a mixed-dtype warning - confirm; passing an explicit dtype= mapping
# for the affected columns would be the stricter fix.
sequences_df = pd.read_csv(sequences_path, low_memory=False)
print(f"Number of records in sequences_df: {len(sequences_df)}")

# Load the deployments table.
deployments_df = pd.read_csv(deployments_path)
print(f"Number of records in deployments_df: {len(deployments_df)}")

  sequences_df = pd.read_csv(sequences_path)


Number of records in sequences_df: 987979
Number of records in deployments_df: 9679


In [4]:
# Fields that must be filled in for a sequence row to be kept
required_cols = ['Class', 'Order', 'Family', 'Genus', 'Species', 'Common_Name']

# Row count going in, taken before anything is removed
initial_count = sequences_df.shape[0]

# A cell holding exactly one space is treated as missing: convert it to NA so
# that dropna below can see it (other blank forms are left untouched here)
sequences_df[required_cols] = sequences_df[required_cols].replace(to_replace=' ', value=pd.NA)

# Keep only the rows where every required field is present
sequences_df = sequences_df.dropna(subset=required_cols, how='any')

# Row count coming out, and how many rows the filter removed
final_count = sequences_df.shape[0]
deleted_count = initial_count - final_count

# Report the effect of the filter
print(f"Number of records deleted: {deleted_count}")
print(f"Final number of records: {final_count}")

Number of records deleted: 96032
Final number of records: 891947


In [5]:
# Column names present in both tables (an unordered set)
common_cols = set(sequences_df.columns) & set(deployments_df.columns)
print(common_cols)

{'Camera_Trap_Array', 'Deployment_ID', 'Year', 'Project'}


In [6]:
# Inner-join the sequences to the deployments on every column the two tables
# share (the contents of common_cols, not just 'Deployment_ID' and 'Year'),
# then discard rows that are exact duplicates of an earlier row.
merged_df = (
    sequences_df
    .merge(deployments_df, how='inner', on=list(common_cols))
    .drop_duplicates()
)

print(f"Number of records in merged_df: {len(merged_df)}")


Number of records in merged_df: 885087


In [None]:
# Text columns whose values are case-normalised at the end of this cell.
# NOTE(review): the list name suggests "proper case", but the loop below applies
# .str.lower() - confirm which casing is actually wanted.
prop_case_cols = [
    'Class', 'Order', 'Family', 'Genus', 'Species',
    'Habitat', 'Development_Level', 'Feature_Type',
    'Common_Name', 'Site_Name', 'Age', 'Sex',
]

# Empty or whitespace-only strings in Age, Sex and Group_Size count as missing
blankable_cols = ['Age', 'Sex', 'Group_Size']
merged_df[blankable_cols] = merged_df[blankable_cols].replace(r'^\s*$', pd.NA, regex=True)

# Group_Size: missing becomes 0, the rest is parsed as numbers; anything that
# cannot be parsed is coerced to NaN and therefore also ends up as 0
group_size_filled = merged_df['Group_Size'].fillna(0)
merged_df['Group_Size'] = (
    pd.to_numeric(group_size_filled, errors='coerce')
    .fillna(0)
    .astype(int)
)

# Age and Sex: missing becomes the literal label 'Unknown'
for label_col in ('Age', 'Sex'):
    merged_df[label_col] = merged_df[label_col].fillna('Unknown')

# Lower-case every listed text column (this also turns 'Unknown' into 'unknown')
for col in prop_case_cols:
    merged_df[col] = merged_df[col].str.lower()

In [None]:
# Optional: write the merged table into data_dir, next to the source CSVs.
# os.path.join supplies the platform's separator instead of a hard-coded
# backslash; index=False keeps the DataFrame index out of the file.
merged_df.to_csv(os.path.join(data_dir, 'merged_snapshot_usa.csv'), index=False)

In [None]:
# Percentage share of each value in the 'Sex' column, rounded to 2 decimals.
# NOTE(review): the variable is called age_distribution although it is computed
# from 'Sex' - confirm which column was intended before relying on the name.
age_distribution = (
    merged_df['Sex']
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
)

print(age_distribution)