In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os
import sys
from tqdm import tqdm

In [None]:
# Location of the cutout CSVs. `cutout_dir` is kept as a plain string; the
# cells that use it wrap it in Path() themselves.
# An alternative, pipeline-local location used to be assigned here and was
# immediately overwritten (dead store); it is kept only for reference:
#   "/home/psa_images/SemiF-AnnotationPipeline/data/semifield-cutouts/"
cutout_dir = "/mnt/research-projects/s/screberg/longterm_images/semifield-cutouts"

# Stems (file names without the ".csv" suffix) of every CSV found anywhere
# below cutout_dir.
cutout_csvs = [x.stem for x in Path(cutout_dir).rglob("*.csv")]

# False: the loading cell concatenates the first CSV of each sub-directory.
# True:  the loading cell reads a single CSV located directly in cutout_dir.
batch = False

In [None]:
# Build `df`, one row per cutout, from the CSV(s) under cutout_dir.
if not batch:
    # Take the first CSV found in each entry directly under cutout_dir.
    # Sorting makes both the chosen file and the concatenation order
    # reproducible (glob order is otherwise filesystem-dependent), which
    # matters for any later keep-first de-duplication.
    csvs = []
    for entry in sorted(Path(cutout_dir).glob("*")):
        found = sorted(entry.glob("*.csv"))
        if found:
            csvs.append(found[0])
    if not csvs:
        raise FileNotFoundError(f"No cutout CSVs found under {cutout_dir}")
    print(f"Loading {len(csvs)} cutout CSVs from {cutout_dir}")
    # ignore_index=True gives the combined frame a unique RangeIndex instead
    # of repeating each file's 0..n-1 labels.
    df = pd.concat(
        [pd.read_csv(x, low_memory=False) for x in csvs], ignore_index=True
    )
else:
    x = sorted(Path(cutout_dir).glob("*.csv"))
    if not x:
        raise FileNotFoundError(f"No cutout CSV found directly in {cutout_dir}")
    df = pd.read_csv(x[0], low_memory=False)

# batch_id is split on "_": the first piece becomes state_id and the second
# is parsed as a date (assumes batch_id looks like "<state>_<date>" — TODO confirm).
batch_parts = df["batch_id"].str.split("_", expand=False)
df["state_id"] = batch_parts.str[0]
df["date"] = batch_parts.str[1]
# Use the shorter "Lambsquarter" label for this species.
df["common_name"] = np.where(
    df.common_name == "Common lambsquarters", "Lambsquarter", df.common_name
)
df["dt"] = pd.to_datetime(df["date"])

# Cell output: the distinct USDA symbols present in the data.
df.USDA_symbol.unique()

## Stats

## All images

In [None]:
# Number of distinct images per (species, state).
# Counting unique image_ids inside each group, after excluding colorchecker
# rows, avoids the keep-first pitfall of collapsing df to one row per image
# up front: with that approach an image is attributed only to whichever row
# happens to come first, and is lost entirely if that row is a colorchecker
# (presumably colorchecker rows share image_ids with plant rows — TODO confirm).
sdf = (
    df[df["common_name"] != "colorchecker"]
    .groupby(["common_name", "state_id"])
    .image_id.nunique()
    .reset_index()
    .sort_values("image_id")
)
print("Number of images by common name and state")
print(sdf.reset_index(drop=True))

In [None]:
# Horizontal bars: image count per species, coloured by state.
state_palette = sns.color_palette(["#66c2a5", "#fc8d62"])  # spare colour: #8da0cb
g = sns.catplot(
    data=sdf,
    kind="bar",
    x="image_id",
    y="common_name",
    hue="state_id",
    palette=state_palette,
    height=4,
    aspect=1.5,
)

# No row/col faceting was requested, so the only Axes sits at position (0, 0).
ax = g.facet_axis(0, 0)

# Annotate the end of every bar with its width, shown as a whole number.
for container in ax.containers:
    ax.bar_label(
        container,
        labels=[f"{(bar.get_width()):.0f}" for bar in container],
        label_type="edge",
    )

g.set(xlabel="", ylabel="")
g.fig.subplots_adjust(top=0.9)  # leave room above the axes for the figure title
g.fig.suptitle("Number of images by common name and state")

## All cutouts

In [None]:
# Count cutout rows for each (species, state) pair, smallest group first.
cutouts_per_group = df.groupby(["common_name", "state_id"])["cutout_id"].count()
sdf = cutouts_per_group.reset_index().sort_values("cutout_id")

# The colorchecker is not a plant species; leave it out of the summary.
sdf = sdf.loc[sdf["common_name"].ne("colorchecker")]

print("Number of cutouts (sub-images) by common name and state")
print(sdf)

In [None]:
# Horizontal bars: cutout count per species, coloured by state.
state_palette = sns.color_palette(["#66c2a5", "#fc8d62"])  # spare colour: #8da0cb
g = sns.catplot(
    data=sdf,
    kind="bar",
    x="cutout_id",
    y="common_name",
    hue="state_id",
    palette=state_palette,
    height=4,
    aspect=1.5,
)

# No row/col faceting was requested, so the only Axes sits at position (0, 0).
ax = g.facet_axis(0, 0)

# Annotate the end of every bar with its width expressed in thousands.
for container in ax.containers:
    ax.bar_label(
        container,
        labels=[f"{(bar.get_width()/1000):.1f}K" for bar in container],
        label_type="edge",
    )

g.set(xlabel="", ylabel="")
g.fig.subplots_adjust(top=0.9)  # leave room above the axes for the figure title
g.fig.suptitle("Number of cutouts (sub-images) by common name and state")

## Primary Cutouts (unique)

In [None]:
# Keep only the rows flagged is_primary, then count them per (species, state).
primary_rows = df.loc[df["is_primary"].eq(True)]
primary_per_group = primary_rows.groupby(["common_name", "state_id"])["cutout_id"].count()
sdf = primary_per_group.reset_index().sort_values("cutout_id")

# The colorchecker is not a plant species; leave it out of the summary.
sdf = sdf.loc[sdf["common_name"].ne("colorchecker")]

print("Number of unique cutouts (sub-images) by common name and state")
print(sdf)

In [None]:
# Horizontal bars: number of primary (unique) cutouts per species, by state.
g = sns.catplot(
    data=sdf,
    x="cutout_id",
    y="common_name",
    hue="state_id",
    height=4,
    aspect=1.5,
    kind="bar",
    palette=sns.color_palette(["#66c2a5", "#fc8d62"]),
)  # spare colour: #8da0cb

# No row/col faceting was requested, so the only Axes sits at position (0, 0).
ax = g.facet_axis(0, 0)

# Annotate each bar with its count. The values are whole numbers, so they are
# formatted without decimals (previously ".1f", which rendered e.g. "123.0").
for c in ax.containers:
    labels = [f"{(v.get_width()):.0f}" for v in c]
    ax.bar_label(c, labels=labels, label_type="edge")

g.set(xlabel="", ylabel="")
g.fig.subplots_adjust(top=0.9)  # leave room above the axes for the figure title

g.fig.suptitle("Number of unique cutouts (sub-images) by common name and state")

# Number of full bench images by batch

In [None]:
# Global seaborn styling for this and the following figures (large fonts).
sns.set_context("notebook")
sns.set(font_scale=2)

# One row per image, ordered by state.
one_row_per_image = df.drop_duplicates("image_id")
tdf = one_row_per_image.sort_values("state_id")

# Count plot of images per batch, one column of the grid per state; each
# facet gets its own y axis (sharey=False).
count_plot_options = dict(
    kind="count",
    y="batch_id",
    col="state_id",
    sharey=False,
    height=14,
    aspect=0.8,
)
g = sns.catplot(data=tdf, **count_plot_options)
g.set(xlabel="", ylabel="")
g.set_titles(col_template="{col_name}")

g.fig.subplots_adjust(top=0.9)  # leave room above the axes for the figure title
g.fig.suptitle("Number of Full Bench Images by Batch")
# Optional export:
# g.savefig("../assets/Images_by_batch_summer_weeds_2022.png", dpi=300)

# Number of images per species

In [None]:
sns.set(font_scale=2)

# Number of distinct images per (species, state).
# Unique image_ids are counted inside each group after dropping colorchecker
# rows. Collapsing df to one row per image first would attribute each image
# only to whichever row comes first, and lose it entirely when that row is a
# colorchecker (presumably colorchecker rows share image_ids with plant
# rows — TODO confirm).
sdf = (
    df[df["common_name"] != "colorchecker"]
    .groupby(["common_name", "state_id"])
    .image_id.nunique()
    .reset_index()
    .sort_values("image_id")
)

# One facet per state, each with its own y axis (sharey=False).
g = sns.catplot(
    data=sdf,
    x="image_id",
    y="common_name",
    orient="horizontal",
    col="state_id",
    sharey=False,
    kind="bar",
    height=10,
)
g.set(xlabel="", ylabel="")
g.set_titles(col_template="{col_name}")
g.fig.subplots_adjust(top=0.9)  # leave room above the axes for the figure title
g.fig.suptitle("Full Bench Images by Species")

# Annotate every bar in every facet with its integer count.
for ax in g.axes.ravel():
    for container in ax.containers:
        # A bar may have a NaN width (e.g. a species absent from this facet's
        # state, depending on the seaborn version); int(NaN) would raise, so
        # such bars get an empty label instead.
        labels = [
            "" if np.isnan(bar.get_width()) else f"{bar.get_width():.0f}"
            for bar in container
        ]
        ax.bar_label(container, labels=labels, label_type="edge", fontsize=14)
    ax.margins(y=0.2)  # extra padding along the y axis of each facet
# Optional export:
# g.savefig("../assets/Images_by_species_summer_weeds_2022_labeled.png", dpi=300)
plt.show()

## Number of Cutout sub-images by batch

In [None]:
# Legend entries, in hue order: non-primary cutouts first, primary second.
new_labels = ["duplicates", "unique"]

# Cutout count per (batch, state, is_primary), ordered by batch.
c = (
    df.groupby(["batch_id", "state_id", "is_primary"])
    .cutout_id.count()
    .reset_index()
    .sort_values("batch_id")
)
# Give each row an explicit, readable hue label instead of renaming the legend
# texts positionally afterwards: that silently mislabels the legend whenever
# one of the two is_primary values is missing from the data.
c = c.assign(
    cutout_type=np.where(c["is_primary"].eq(True), new_labels[1], new_labels[0])
)
sns.set(font_scale=2)

# One facet per state, each with its own y axis (sharey=False).
g = sns.catplot(
    data=c,
    x="cutout_id",
    y="batch_id",
    hue="cutout_type",
    hue_order=new_labels,
    orient="horizontal",
    col="state_id",
    sharey=False,
    kind="bar",
    aspect=0.8,
    height=14,
)
g.set(xlabel="", ylabel="")
g.set_titles(col_template="{col_name}")
g._legend.set_title("")  # the entries are self-explanatory; drop the title

sns.move_legend(g, "center right", bbox_to_anchor=(0.9, 0.55), frameon=False)

g.fig.subplots_adjust(top=0.9)  # leave room above the axes for the figure title
g.fig.suptitle("Number of Cutouts (sub-images) by Batch")
# Optional export:
# g.savefig("../assets/Cutouts_by_batch_summer_weeds_2022.png", dpi=300)

In [None]:
# Legend entries, in hue order: non-primary cutouts first, primary second.
new_labels = ["duplicates", "unique"]

# Cutout count per (species, state, is_primary), smallest group first.
c = (
    df.groupby(["common_name", "state_id", "is_primary"])
    .cutout_id.count()
    .reset_index()
    .sort_values("cutout_id")
)
c = c[c["common_name"] != "colorchecker"]
# Explicit, readable hue labels (same wording as the by-batch figure) so the
# legend does not show bare "False"/"True" under an empty title.
c = c.assign(
    cutout_type=np.where(c["is_primary"].eq(True), new_labels[1], new_labels[0])
)

sns.set(font_scale=2)

# One facet per state, each with its own y axis (sharey=False).
g = sns.catplot(
    data=c,
    x="cutout_id",
    y="common_name",
    hue="cutout_type",
    hue_order=new_labels,
    orient="horizontal",
    col="state_id",
    sharey=False,
    kind="bar",
    aspect=0.9,
    height=10,
)
g.set(xlabel="", ylabel="")
g.set_titles(col_template="{col_name}")
g._legend.set_title("")  # the entries are self-explanatory; drop the title

sns.move_legend(g, "center right", bbox_to_anchor=(0.9, 0.55), frameon=False)

g.fig.subplots_adjust(top=0.9)  # leave room above the axes for the figure title
g.fig.suptitle("Number of Cutouts (sub-images) by Species")

# Annotate every bar in every facet with its count in thousands.
# The loop variable is deliberately not named `c`, so the summary DataFrame
# `c` is still available after this cell has run.
for ax in g.axes.ravel():
    for container in ax.containers:
        labels = [f"{(bar.get_width() / 1000):.2f}K" for bar in container]
        ax.bar_label(container, labels=labels, label_type="edge", fontsize=14)
    ax.margins(y=0.2)  # extra padding along the y axis of each facet
# Optional export:
# g.savefig("../assets/Cutouts_by_species_summer_weeds_2022_labeled.png", dpi=300)
plt.show()