In [None]:
import pandas as pd
from pathlib import Path
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import re
from matplotlib.legend_handler import HandlerTuple
from matplotlib.patches import Patch
import numpy as np
from pathlib import Path

sys.path.append("/home/psa_images/SemiF-AnnotationPipeline")
sys.path.append("/home/psa_images/SemiF-AnnotationPipeline/segment")
from utils.viz import (
    compile_cutout_csvs,
    plot_processed_batches,
    plot_images_by_state,
    plot_sub_images,
    plot_cutouts_by_state_and_season,
    read_container_list_summary,
    plot_images_by_location,
    plot_sub_images_subplots,
    custom_sort,
    plot_cutouts_by_state_and_season_all
)

In [None]:
# Cutout directory on the research-projects mount.
# Earlier location, kept for reference:
#   /home/psa_images/SemiF-AnnotationPipeline/data/semifield-cutouts
cutout_dir = Path(
    "/mnt/research-projects/s/screberg/longterm_images/semifield-cutouts"
)
# batch_dir = f"/mnt/research-projects/s/screberg/longterm_images/semifield-developed-images"

# Read the compiled table from the local CSV rather than recompiling it.
# Disabled alternative: df = compile_cutout_csvs(cutout_dir)
df = pd.read_csv("./temp_compiled_data.csv", low_memory=True)

In [None]:
import pandas as pd

# Abbreviated labels for the longer common names; names not listed are kept as-is.
abbreviations = {
    "Common lambsquarters": "C. lambsquarters",
    "Common ragweed": "C. ragweed",
    "Crimson clover": "C. clover",
    "Large crabgrass": "L. crabgrass",
    "Palmer amaranth": "P. amaranth",
    "Smooth pigweed": "S. pigweed",
    "Purple nutsedge": "P. nutsedge",
    "cultivated radish": "C. radish",
}

# Define the processing function
def process_chunk(chunk):
    """Clean one chunk of the compiled cutout table.

    The input frame is not modified; all work happens on a copy.

    Steps:
      * abbreviate ``common_name`` using ``abbreviations``;
      * parse the second ``_``-separated token of ``batch_id`` as a date;
      * fill missing ``season`` values with ``"summer_weeds_2022"`` for rows
        dated before 2022-10-11;
      * fold the two ``cool_season_cover_2022_2023_MD_pos_*`` seasons into
        ``"cool_season_covers_2022_2023"``;
      * keep only ``cutout_id``, ``batch_id``, ``common_name`` and ``season``
        and drop rows whose ``cutout_id`` repeats within this chunk.

    Args:
        chunk: DataFrame with at least the columns ``cutout_id``,
            ``batch_id``, ``common_name`` and ``season``.

    Returns:
        A new DataFrame with the four columns listed above, one row per
        ``cutout_id`` (first occurrence kept).
    """
    # Copy first so the caller's frame does not silently gain or lose values.
    chunk = chunk.copy()

    # Split batch_id once and reuse both pieces.
    batch_parts = chunk["batch_id"].str.split("_", expand=True)
    chunk["state_id"] = batch_parts[0]
    chunk["common_name"] = chunk["common_name"].map(lambda x: abbreviations.get(x, x))
    chunk["date"] = pd.to_datetime(batch_parts[1])

    # Rows with no season recorded before the cutoff date get the summer-weeds label.
    condition = (chunk["date"] < "2022-10-11") & (chunk["season"].isna())
    chunk.loc[condition, "season"] = "summer_weeds_2022"

    # Both MD position-specific labels collapse into the shared season name.
    md_position_seasons = [
        "cool_season_cover_2022_2023_MD_pos_2",
        "cool_season_cover_2022_2023_MD_pos_3",
    ]
    chunk.loc[
        chunk["season"].isin(md_position_seasons), "season"
    ] = "cool_season_covers_2022_2023"

    chunk = chunk[["cutout_id", "batch_id", "common_name", "season"]]
    return chunk.drop_duplicates(subset=["cutout_id"])

# Output file that receives the processed chunks.
output_file = "processed_data.csv"

# Read the compiled CSV in 10,000-row chunks and process each chunk in turn.
chunk_iter = pd.read_csv('temp_compiled_data.csv', low_memory=False, chunksize=10000)

# process_chunk() only removes duplicate cutout_ids inside a single chunk.
# Remember every id already written so a cutout that appears in two different
# chunks is not written twice. Only the ids are kept in memory, not the rows.
seen_cutout_ids = set()
for i, chunk in enumerate(chunk_iter):
    processed_chunk = process_chunk(chunk)
    processed_chunk = processed_chunk[
        ~processed_chunk["cutout_id"].isin(seen_cutout_ids)
    ]
    seen_cutout_ids.update(processed_chunk["cutout_id"])

    # First chunk (re)creates the file with a header row; later chunks append.
    if i == 0:
        processed_chunk.to_csv(output_file, index=False)
    else:
        processed_chunk.to_csv(output_file, mode='a', header=False, index=False)

    print(f"Processed chunk {i + 1}")


In [None]:
# Every project directory is read as <dir>/<dir stem>.csv.
projects_root = Path("/home/mkutuga/SemiF-ImageSelection/data/projects")

# Sort the directories so the concatenation order is reproducible: the
# drop_duplicates(subset=["cutout_id"]) applied to df afterwards keeps the first
# occurrence, so an unordered listing could change which row survives.
# Non-directory entries are skipped because they cannot contain a CSV.
batches = sorted(p for p in projects_root.glob("*") if p.is_dir())
csv_files = [Path(x, x.stem + ".csv") for x in batches]

dfs = [pd.read_csv(f) for f in csv_files]
if not dfs:
    # pd.concat([]) would fail with a much less helpful message.
    raise FileNotFoundError(f"No project directories found under {projects_root}")

# ignore_index gives the combined frame one unique, continuous row index
# instead of repeating each file's 0..n labels.
df = pd.concat(dfs, ignore_index=True)
# df = pd.read_csv("./processed_data.csv")

# Preprocess the dataframe

In [None]:
# Split batch_id once: token 0 becomes state_id, token 1 is parsed as the date.
batch_parts = df["batch_id"].str.split("_", expand=True)
df["state_id"] = batch_parts[0]

# Abbreviate long common names; anything not listed keeps its original name.
abbreviations = {
    "Common lambsquarters": "C. lambsquarters",
    "Common ragweed": "C. ragweed",
    "Crimson clover": "C. clover",
    "Large crabgrass": "L. crabgrass",
    "Palmer amaranth": "P. amaranth",
    "Smooth pigweed": "S. pigweed",
    "Purple nutsedge": "P. nutsedge",
    "cultivated radish": "C. radish",
}
df["common_name"] = df["common_name"].map(
    lambda name: abbreviations.get(name, name)
)

# Normalize season values.
df["date"] = pd.to_datetime(batch_parts[1])
# Rows with no season recorded before 2022-10-11 are labelled as summer weeds.
condition = df["season"].isna() & (df["date"] < "2022-10-11")
df.loc[condition, "season"] = "summer_weeds_2022"
# The two MD position-specific labels share one season name.
md_position_seasons = [
    "cool_season_cover_2022_2023_MD_pos_2",
    "cool_season_cover_2022_2023_MD_pos_3",
]
df.loc[
    df["season"].isin(md_position_seasons), "season"
] = "cool_season_covers_2022_2023"

# One row per cutout (first occurrence kept).
df = df.drop_duplicates(subset=["cutout_id"])
df.head()

In [None]:
# Distinct cutouts per (common_name, season) pair.
cuts = (
    df.groupby(["common_name", "season"])["cutout_id"]
    .nunique()
    .reset_index(name="count")
)

# Drop the colorchecker / unknown labels and the summer_cash_crops_2023 season.
# (summer_weeds_2022 stays in: its filter is disabled.)
excluded_names = ["colorchecker", "unknown"]
keep = ~cuts["common_name"].isin(excluded_names) & (
    cuts["season"] != "summer_cash_crops_2023"
)
cuts = cuts[keep].sort_values(by="count")

cuts

In [None]:
# One call sets style, context and font scale; separate set_style/set_context
# calls before it would just be overridden.
sns.set(style="dark", context="notebook", font_scale=1.2)

# Facet order: seasons sorted with the project's custom_sort key.
col_strings = cuts.season.unique()
col_order = sorted(col_strings, key=custom_sort)
g = sns.catplot(
    data=cuts,
    x="common_name",
    y="count",
    col="season",
    kind="bar",
    # hue="is_primary",
    sharex=False,
    errorbar=None,
    # palette=palette,
    # col_order must be passed: the titles below are built from col_order, so the
    # facets have to be drawn in that same order or titles end up on the wrong panel.
    col_order=col_order,
    height=8,
)

# Replace underscores between numbers with '/'
modified_strings = [re.sub(r"(\d)_(\d)", r"\1/\2", s) for s in col_order]

# Replace remaining underscores with spaces
new_titles = [s.replace("_", " ") for s in modified_strings]

for ax, facet_title in zip(g.axes.flat, new_titles):
    # Set the title once per facet, even if the facet has no bars.
    ax.set_title(facet_title)

    # Annotate the bars with their heights (the 'count' values)
    for p in ax.patches:
        height = p.get_height()
        if pd.isna(height):
            # Treat a missing bar as zero so both the label and its position are valid.
            height = 0
        ax.annotate(
            f"{int(height)}",
            xy=(p.get_x() + p.get_width() / 2.0, height),
            xytext=(0, 10),  # 10 points vertical offset
            textcoords="offset points",
            ha="center",
            va="bottom",
            fontsize=9,
        )
    # ax.axis(False)
g.set_xticklabels(rotation=85)
# Blank out both axis labels
g.set_axis_labels(x_var="", y_var="")
g.set_yticklabels(None)

# g.fig.suptitle(title, y=1.13, fontsize=18)
# if save:
    # g.savefig(f"plots/{title}.png", dpi=300)
plt.show()

# Total number of images

In [None]:
# Unique images are counted per state and then summed.
image_total = df.groupby(["state_id"])["image_id"].nunique().sum()
primary_total = int(df["is_primary"].eq(True).sum())

print("\nTotal images: ", image_total)
print("\nTotal cutouts: ", len(df))
print("\nTotal primary cutouts: ", primary_total)

# Row counts per state; a state with no rows counts as 0.
state_totals = df["state_id"].value_counts()
tx_cuts = int(state_totals.get("TX", 0))
nc_cuts = int(state_totals.get("NC", 0))
md_cuts = int(state_totals.get("MD", 0))

print("\nTotal cutouts by tx: ", tx_cuts)
print("Total cutouts by nc: ", nc_cuts)
print("Total cutouts by md: ", md_cuts)

# Analyze Processed Batches

In [None]:
# Batch-log summary file read by read_container_list_summary.
path = "/home/psa_images/SemiF-AnnotationPipeline/.batchlogs/summary.txt"
df_summ = read_container_list_summary(path)

# Plot processed vs. not-processed batches and save the figure.
plot_processed_batches(
    df_summ, plot_title="Processed vs Not Processed Batches", save=True, height=7, aspect=0.7
)

In [None]:
# Show only the batches whose status is "Not Processed".
df_summ.loc[df_summ["Processed status"].eq("Not Processed")]

# Total number of Full-Res Images by Location

In [None]:
# One row per image, ordered by state, then plotted by location.
df_imgs = (
    df.drop_duplicates(subset="image_id")
    .sort_values(by=["state_id"], ascending=True)
)
plot_images_by_location(df_imgs)

# Image totals by species, season, and location

The number of full-resolution images by species, season, and location.

## Preprocess df_imgs
Count unique image_ids by state, common name, and season, dropping images in which only the colorchecker was present.

In [None]:
# Flag colorchecker rows (1) so that, within an image, they sort after every
# other row (0).
df_temp = df.assign(sort_key=df["common_name"].eq("colorchecker").astype(int))
df_temp = df_temp.sort_values(by=["image_id", "sort_key"])

# Keep the first row per image, so a colorchecker row survives only when the
# image has no other row, then discard the helper column.
df_dropped = df_temp.drop_duplicates(subset="image_id", keep="first").drop(
    columns=["sort_key"]
)

# Remove images when only the colorchecker was present
df_filtered = df_dropped.loc[df_dropped["common_name"].ne("colorchecker")]

# Unique images per (state, species, season), ordered by season then state.
df_imgs = (
    df_filtered.groupby(["state_id", "common_name", "season"])["image_id"]
    .nunique()
    .reset_index(name="count")
    .sort_values(by=["season", "state_id"])
)

In [None]:
# NC image counts by species and season.
nc_imgs = df_imgs.loc[df_imgs["state_id"].eq("NC")]
plot_images_by_state(
    nc_imgs,
    "#82C09A",
    plot_title="NC Images by species and season",
    save=True,
)

In [None]:
# MD image counts by species and season.
md_imgs = df_imgs.loc[df_imgs["state_id"].eq("MD")]
plot_images_by_state(
    md_imgs,
    "#7BACD6",
    plot_title="MD Images by species and season",
    save=True,
)

In [None]:
# TX image counts by species and season, without the "unknown" label.
is_tx = df_imgs["state_id"].eq("TX")
is_known = df_imgs["common_name"].ne("unknown")
tx_imgs = df_imgs.loc[is_tx & is_known]
plot_images_by_state(
    tx_imgs,
    "#FF8383",
    plot_title="TX Images by species and season",
    save=True,
)

# Total Cutouts by Location and Species

## Preprocess subimages `cuts`
Group by state, common name, season, and primary status. Exclude colorchecker and unknown sub-images.

In [None]:
# Distinct cutouts per (state, species, season, primary flag).
cuts = (
    df.groupby(["state_id", "common_name", "season", "is_primary"])["cutout_id"]
    .nunique()
    .reset_index(name="count")
)
# Drop the colorchecker and unknown labels.
cuts = cuts[~cuts["common_name"].isin(["colorchecker", "unknown"])]

In [None]:
# NC sub-image counts, smallest first; palette keyed by the is_primary flag.
nc_cuts = cuts.loc[cuts["state_id"].eq("NC")].sort_values(by="count")
binary_palettes = {
    False: "#A5D6A7",
    True: "#388E3C",
}  # Shades of Green
title = "NC Sub-images by species and season"
plot_sub_images(nc_cuts, binary_palettes, title, save=True)

In [None]:
# MD sub-image counts, smallest first; palette keyed by the is_primary flag.
md_cuts = cuts.loc[cuts["state_id"].eq("MD")].sort_values(by="count")
binary_palettes = {
    False: "#90CAF9",
    True: "#1565C0",
}  # Shades of Blue
title = "MD Sub-images by species and season"
plot_sub_images(md_cuts, binary_palettes, title, save=True)

In [None]:
# TX sub-image counts, smallest first, without the "unknown" label;
# palette keyed by the is_primary flag.
tx_cuts = cuts.loc[cuts["state_id"].eq("TX")].sort_values(by="count")
tx_cuts = tx_cuts.loc[tx_cuts["common_name"].ne("unknown")]
binary_palettes = {
    False: "#FFCDD2",
    True: "#C62828",
}  # Shades of Red
title = "TX Sub-images by species and season"
plot_sub_images(tx_cuts, binary_palettes, title, save=True)

# Plot cutouts by state

In [None]:
# Distinct cutouts per (state, season, primary flag), ordered by state and then count.
df_cuts = (
    df.groupby(["state_id", "season", "is_primary"])["cutout_id"]
    .nunique()
    .reset_index(name="count")
    .sort_values(by=["state_id", "count"])
)
plot_title = "Total Cutouts by State"
plot_cutouts_by_state_and_season(df_cuts, plot_title, save=True)

In [None]:
# Abbreviations applied for the large subplot; commented-out entries are
# deliberately left unabbreviated by this mapping.
abbreviations = {
    "Common lambsquarters": "C. lambsquarters",
    # "Common ragweed": "C. ragweed",
    "Crimson clover": "C. clover",
    "Large crabgrass": "L. crabgrass",
    "Palmer amaranth": "P. amaranth",
    # "Smooth pigweed": "S. pigweed",
    "Purple nutsedge": "P. nutsedge",
    "cultivated radish": "C. radish",
    "Giant foxtail": "G. Foxtail",
    "Hairy vetch": "H. Vetch",
    # "Jimson weed": "Jim. weed",
    "Yellow foxtail": "Y. Foxtail",
    "Texas millet": "T. Millet",
    # "Winter wheat": "W. Wheat",
    # "Black oats": "B. Oats",
}

# `cuts` comes from boolean filtering, so assigning a column into it directly
# triggers pandas' chained-assignment (SettingWithCopy) warning. assign()
# returns a new frame instead and keeps the column order unchanged.
cuts = cuts.assign(
    common_name=cuts["common_name"].map(lambda x: abbreviations.get(x, x))
)
cuts = cuts[cuts["common_name"] != "unknown"]
cuts = cuts.sort_values(by=["count"])
title = "Large subplot: Cutouts by state, season, and species"

plot_sub_images_subplots(
    cuts,
    title,
    suptitle_fontsize=12,
    xtick_fontsize=10,
    bar_fontsize=7,
    hspace=0.5,
    wspace=0.1,
    plot_height=4,
    aspect=1.2,
    save=True,
)