In [9]:
# !pip install pandas matplotlib seaborn geopandas contextily shapely
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import geopandas as gpd
from shapely.geometry import Point
import contextily as ctx

In [10]:
import os

save_dir = r'C:\Users\sagni\Documents\Personal Files\Research\doi_10_5061_dryad_k0p2ngfhn__v20250410\bivariate_plots'
os.makedirs(save_dir, exist_ok=True)

In [1]:
# Define the directory path
data_dir = r'C:\Users\sagni\Documents\Personal Files\Research\doi_10_5061_dryad_k0p2ngfhn__v20250410'

# Define file paths
sequences_path = data_dir + r'\ssusa_finalsequences.csv'
deployments_path = data_dir + r'\ssusa_finaldeployments.csv'

# Load the CSV files
sequences_df = pd.read_csv(sequences_path)
deployments_df = pd.read_csv(deployments_path)

# Merge the dataframes on 'Deployment_ID'
merged_df = pd.merge(sequences_df, deployments_df, on='Deployment_ID', how='left')

# Optional: Save merged result to a new CSV
merged_df.to_csv(data_dir + r'\merged_snapshot_usa.csv', index=False)

# Print first few rows to verify
print(merged_df.head())


  sequences_df = pd.read_csv(sequences_path)


   Year_x          Project_x Camera_Trap_Array_x  \
0    2019  Snapshot USA 2019               Crupi   
1    2019  Snapshot USA 2019               Crupi   
2    2019  Snapshot USA 2019               Crupi   
3    2019  Snapshot USA 2019               Crupi   
4    2019  Snapshot USA 2019               Crupi   

                  Deployment_ID Sequence_ID           Start_Time  \
0  AK_Forest_Chilkat_Preserve_1    d58722s1  2019/08/31 06:50:00   
1  AK_Forest_Chilkat_Preserve_1    d58722s2  2019/08/31 14:15:00   
2  AK_Forest_Chilkat_Preserve_1    d58722s3  2019/08/31 18:22:00   
3  AK_Forest_Chilkat_Preserve_1    d58722s4  2019/08/31 20:58:00   
4  AK_Forest_Chilkat_Preserve_1    d58722s4  2019/08/31 20:58:00   

              End_Time     Class      Order   Family  ... Camera_Trap_Array_y  \
0  2019/08/31 06:50:00  Mammalia  Carnivora  Ursidae  ...               Crupi   
1  2019/08/31 14:17:00  Mammalia  Carnivora  Ursidae  ...               Crupi   
2  2019/08/31 18:22:00  Mammalia  C

In [2]:
print(merged_df.dtypes)

Year_x                   int64
Project_x               object
Camera_Trap_Array_x     object
Deployment_ID           object
Sequence_ID             object
Start_Time              object
End_Time                object
Class                   object
Order                   object
Family                  object
Genus                   object
Species                 object
Common_Name             object
Age                     object
Sex                     object
Group_Size              object
Year_y                 float64
Project_y               object
Camera_Trap_Array_y     object
Site_Name               object
Start_Date              object
End_Date                object
Survey_Nights          float64
Latitude               float64
Longitude              float64
Habitat                 object
Development_Level       object
Feature_Type            object
dtype: object


In [13]:
# --- Univariate EDA ---

# 1️⃣ Summary statistics — keep in console
print("\n--- Numeric Columns Summary ---")
print(merged_df[numeric_columns].describe())

print("\n--- Categorical Columns Summary ---")
for col in categorical_columns:
    print(f"\nColumn: {col}")
    print(merged_df[col].value_counts().head(10))

# 2️⃣ Visuals

# Numeric columns: histograms
for col in numeric_columns:
    plt.figure()
    sns.histplot(merged_df[col].dropna(), kde=True)
    plt.title(f'Histogram of {col}')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, f'univariate_histogram_{col}.png'))
    plt.close()

# Categorical columns: bar plots (top 10 categories)
for col in categorical_columns:
    plt.figure()
    vc = merged_df[col].value_counts().head(10)
    sns.barplot(x=vc.values, y=vc.index, palette='viridis')
    plt.title(f'Top 10 categories of {col}')
    plt.xlabel('Count')
    plt.ylabel(col)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, f'univariate_barplot_{col}.png'))
    plt.close()

# Date columns: time trends (number of records per month)
for col in datetime_columns:
    if merged_df[col].notna().sum() > 0:
        plt.figure()
        ts = merged_df[col].dt.to_period('M').value_counts().sort_index()
        ts.plot(kind='bar')
        plt.title(f'Record count over time: {col}')
        plt.xlabel('Month')
        plt.ylabel('Number of records')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(save_dir, f'univariate_time_trend_{col}.png'))
        plt.close()

# Example scatterplot: Latitude vs Longitude
plt.figure()
sns.scatterplot(x='Longitude', y='Latitude', hue='Habitat', data=merged_df, alpha=0.6)
plt.title('Spatial Distribution of Deployments')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend(loc='best')
plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'univariate_scatterplot_Latitude_vs_Longitude.png'))
plt.close()

# Example boxplot: Group_Size by Habitat
if 'Group_Size' in merged_df.columns:
    plt.figure()
    sns.boxplot(x='Habitat', y='Group_Size', data=merged_df)
    plt.title('Group Size by Habitat')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'univariate_boxplot_Group_Size_by_Habitat.png'))
    plt.close()


--- Numeric Columns Summary ---
              Year_x     Group_Size         Year_y  Survey_Nights  \
count  997999.000000  997969.000000  997972.000000  997972.000000   
mean     2021.392110       1.201248    2021.391918      45.142141   
std         1.399937       1.181973       1.401106      19.205563   
min      2019.000000       0.000000    2019.000000       1.000000   
25%      2020.000000       1.000000    2020.000000      31.000000   
50%      2022.000000       1.000000    2022.000000      45.000000   
75%      2023.000000       1.000000    2023.000000      60.000000   
max      2023.000000     245.000000    2024.000000     142.000000   

            Latitude      Longitude  
count  997972.000000  997972.000000  
mean       38.432281     -88.519664  
std         5.289608      14.554896  
min        21.355811    -157.749620  
25%        35.309600     -95.198300  
50%        39.011940     -84.432783  
75%        41.800909     -77.219527  
max        59.452635     -68.611593  

--


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=vc.values, y=vc.index, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=vc.values, y=vc.index, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=vc.values, y=vc.index, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=vc.values, y=vc.index, palette='viridis')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the 

In [20]:
# Convert dates
date_columns = ['Start_Time', 'End_Time', 'Start_Date', 'End_Date']
for col in date_columns:
    merged_df[col] = pd.to_datetime(merged_df[col], errors='coerce')

# Group_Size to numeric
merged_df['Group_Size'] = pd.to_numeric(merged_df['Group_Size'], errors='coerce')

# Identify columns
numeric_columns = merged_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = merged_df.select_dtypes(include=['object']).columns.tolist()

# --- Bivariate EDA ---

# 1️⃣ Correlation heatmap
corr_matrix = merged_df[numeric_columns].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap (Numeric Columns)')
plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'correlation_heatmap.png'))
plt.close()

# 2️⃣ Numeric vs Numeric scatterplots
numeric_pairs = [('Year_x', 'Group_Size'), 
                 ('Survey_Nights', 'Group_Size'),
                 ('Latitude', 'Longitude'),
                 ('Year_x', 'Survey_Nights')]

for x_col, y_col in numeric_pairs:
    plt.figure()
    sns.scatterplot(x=x_col, y=y_col, data=merged_df, alpha=0.5)
    plt.title(f'Scatterplot: {x_col} vs {y_col}')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, f'scatter_{x_col}_vs_{y_col}.png'))
    plt.close()

# 3️⃣ Numeric vs Categorical boxplots
cat_for_boxplot = ['Habitat', 'Development_Level', 'Feature_Type', 'Age', 'Sex']

for cat_col in cat_for_boxplot:
    for num_col in numeric_columns:
        plt.figure()
        sns.boxplot(x=cat_col, y=num_col, data=merged_df)
        plt.title(f'Boxplot of {num_col} by {cat_col}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(save_dir, f'boxplot_{num_col}_by_{cat_col}.png'))
        plt.close()

# --- Preprocess Age and Sex ---
merged_df['Age'] = merged_df['Age'].fillna('unknown').replace('', 'unknown').str.lower()
merged_df['Sex'] = merged_df['Sex'].fillna('unknown').replace('', 'unknown').str.lower()

# 4️⃣ Categorical vs Categorical heatmaps
cat_pairs = [('Habitat', 'Development_Level'),
             ('Habitat', 'Feature_Type'),
             ('Age', 'Sex'),
             ('Class', 'Habitat'),
             ('Project_x', 'Project_y')]

for cat1, cat2 in cat_pairs:
    ct = pd.crosstab(merged_df[cat1], merged_df[cat2])
    plt.figure(figsize=(10, 6))
    sns.heatmap(ct, annot=True, fmt='d', cmap='YlGnBu')
    plt.title(f'Crosstab Heatmap: {cat1} vs {cat2}')
    plt.ylabel(cat1)
    plt.xlabel(cat2)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, f'heatmap_{cat1}_vs_{cat2}.png'))
    plt.close()

# 5️⃣ Latitude and Longitude on USA Map

# Prepare GeoDataFrame
geo_df = merged_df[['Latitude', 'Longitude']].dropna()
geometry = [Point(xy) for xy in zip(geo_df['Longitude'], geo_df['Latitude'])]
geo_df = gpd.GeoDataFrame(geo_df, geometry=geometry, crs="EPSG:4326")

# Reproject to Web Mercator for plotting with basemap
geo_df = geo_df.to_crs(epsg=3857)

# Plot
fig, ax = plt.subplots(figsize=(12, 8))
geo_df.plot(ax=ax, alpha=0.5, markersize=10, color='red')

# Add basemap — use a working provider
ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)

ax.set_title('Spatial Distribution of Camera Deployments (USA Map)')
ax.set_axis_off()

plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'usa_map_latitude_longitude.png'))
plt.close()

print(f"Bivariate EDA plots saved in folder:\n{save_dir}")


Bivariate EDA plots saved in folder:
C:\Users\sagni\Documents\Personal Files\Research\doi_10_5061_dryad_k0p2ngfhn__v20250410\bivariate_plots


In [14]:
# Spatial-temporal scatterplot: Latitude vs Longitude, colored by Year_x

plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='Longitude', y='Latitude',
    hue='Year_x', palette='viridis',
    data=merged_df, alpha=0.7, s=50
)

plt.title('Spatial Distribution of Camera Deployments Colored by Year')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend(title='Year', loc='best')
plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'combined_scatterplot_Latitude_Longitude_by_Year.png'))
plt.close()


# Spatial scatterplot: Latitude vs Longitude, colored by Habitat

plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='Longitude', y='Latitude',
    hue='Habitat',
    data=merged_df, alpha=0.7, s=50
)

plt.title('Spatial Distribution of Camera Deployments Colored by Habitat')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend(title='Habitat', loc='best', bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'combined_scatterplot_Latitude_Longitude_by_Habitat.png'))
plt.close()

In [16]:
###### 1️⃣ Spatial-temporal scatterplot: Latitude vs Longitude → colored by Year_x
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='Longitude', y='Latitude',
    hue='Year_x', palette='viridis',
    data=merged_df, alpha=0.7, s=50
)

plt.title('Spatial Distribution of Camera Deployments Colored by Year')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend(title='Year', loc='best')
plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'combined_scatterplot_Latitude_Longitude_by_Year.png'))
plt.close()

### 2️⃣ Spatial scatterplot: Latitude vs Longitude → colored by Habitat
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='Longitude', y='Latitude',
    hue='Habitat',
    data=merged_df, alpha=0.7, s=50
)

plt.title('Spatial Distribution of Camera Deployments Colored by Habitat')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend(title='Habitat', loc='best', bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'combined_scatterplot_Latitude_Longitude_by_Habitat.png'))
plt.close()

### 3️⃣ Top 10 Common Species → barplot
plt.figure(figsize=(10, 6))
vc_species = merged_df['Common_Name'].value_counts().head(10)
sns.barplot(x=vc_species.values, y=vc_species.index, palette='magma')
plt.title('Top 10 Most Detected Species')
plt.xlabel('Number of Observations')
plt.ylabel('Common Name')
plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'top10_common_species_barplot.png'))
plt.close()

### 4️⃣ Faceted time trends → species or habitat vs month

# Prep data: parse month from Start_Time
merged_df['Month'] = merged_df['Start_Time'].dt.month

# Species vs Month → Facet by Species (top 5 species)
top5_species = merged_df['Common_Name'].value_counts().head(5).index.tolist()
df_top5_species = merged_df[merged_df['Common_Name'].isin(top5_species)]

# Plot
g = sns.catplot(
    data=df_top5_species,
    x='Month', kind='count', col='Common_Name',
    col_wrap=3, height=4, aspect=1.2,
    palette='muted'
)

g.set_titles('{col_name}')
g.set_axis_labels('Month', 'Number of Observations')
g.fig.suptitle('Time Trends of Top 5 Species (by Month)', y=1.02)
g.tight_layout()
g.savefig(os.path.join(save_dir, 'faceted_time_trends_top5_species_by_month.png'))
plt.close()

# Habitat vs Month → Single plot
plt.figure(figsize=(10, 6))
sns.countplot(
    x='Month', hue='Habitat',
    data=merged_df,
    palette='Set2'
)

plt.title('Habitat-wise Number of Observations per Month')
plt.xlabel('Month')
plt.ylabel('Number of Observations')
plt.legend(title='Habitat', loc='best')
plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'time_trend_habitat_vs_month.png'))
plt.close()

print(f"\n✅ All 4 key storytelling visuals saved in:\n{save_dir}")


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=vc_species.values, y=vc_species.index, palette='magma')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  g = sns.catplot(



✅ All 4 key storytelling visuals saved in:
C:\Users\sagni\Documents\Personal Files\Research\doi_10_5061_dryad_k0p2ngfhn__v20250410\bivariate_plots


In [19]:
import matplotlib.ticker as mticker

# Prepare GeoDataFrame from merged_df → drop NAs first
geo_df = merged_df[['Latitude', 'Longitude', 'Year_x']].dropna()

# Create geometry column → Points from Lon/Lat
geometry = [Point(xy) for xy in zip(geo_df['Longitude'], geo_df['Latitude'])]
geo_df = gpd.GeoDataFrame(geo_df, geometry=geometry, crs="EPSG:4326")  # WGS84 CRS

# Project to Web Mercator (for using web tiles like CartoDB or OSM)
geo_df = geo_df.to_crs(epsg=3857)

# Plot on North America map → colored by Year_x
fig, ax = plt.subplots(figsize=(14, 10))

# Plot, and capture the collection so we can adjust colorbar
collection = geo_df.plot(
    ax=ax, column='Year_x',
    cmap='viridis', alpha=0.7, markersize=50, legend=True
)

# Add basemap — safe provider
ctx.add_basemap(ax, source=ctx.providers.CartoDB.Positron)

# Set title
ax.set_title('Spatial Distribution of Camera Deployments on North America Map (colored by Year)')
ax.set_axis_off()

# Force integer ticks on colorbar
cbar = collection.get_figure().axes[-1]  # colorbar axis
cbar.yaxis.set_major_locator(mticker.MaxNLocator(integer=True))  # force integer ticks

# Save the plot
plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'north_america_map_spatial_distribution_by_Year.png'))
plt.close()

print("\n✅ Spatial Distribution plotted on North America map (integer colorbar) and saved.")



✅ Spatial Distribution plotted on North America map (integer colorbar) and saved.


In [18]:
# Version 1 → by Year_x
geo_df.plot(ax=ax, column='Year_x', cmap='viridis', alpha=0.7, markersize=50, legend=True)
# save as ..._by_Year.png

# Version 2 → by Habitat
geo_df2 = merged_df[['Latitude', 'Longitude', 'Habitat']].dropna()
geometry2 = [Point(xy) for xy in zip(geo_df2['Longitude'], geo_df2['Latitude'])]
geo_df2 = gpd.GeoDataFrame(geo_df2, geometry=geometry2, crs="EPSG:4326")
geo_df2 = geo_df2.to_crs(epsg=3857)

fig2, ax2 = plt.subplots(figsize=(14, 10))
geo_df2.plot(ax=ax2, column='Habitat', cmap='Set2', alpha=0.7, markersize=50, legend=True)
ctx.add_basemap(ax2, source=ctx.providers.CartoDB.Positron)
ax2.set_title('Spatial Distribution of Camera Deployments on North America Map (colored by Habitat)')
ax2.set_axis_off()
plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'north_america_map_spatial_distribution_by_Habitat.png'))
plt.close()