In [None]:
import os 
import cv2

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns 

%matplotlib inline

In [None]:
# Folder holding the raw training images, relative to the notebook's directory.
BASE_PATH = os.path.join(os.pardir, "data", "raw", "train")

In [None]:
# Walk the raw training images once. For each readable image:
#   * record its height and width,
#   * save a 256x256 copy under ../data/resized/train,
#   * keep an RGBA-ordered copy in `images` and display it.
RESIZED_DIR = os.path.join("..", "data", "resized", "train")
# cv2.imwrite does not create missing folders (it only reports failure), so
# make sure the destination exists before the loop starts.
os.makedirs(RESIZED_DIR, exist_ok=True)

heights = []
widths = []
images = []

# sorted() makes the processing order (and therefore `images`) reproducible;
# os.listdir returns entries in arbitrary order.
for file in sorted(os.listdir(BASE_PATH)):
    image = cv2.imread(os.path.join(BASE_PATH, file), cv2.IMREAD_UNCHANGED)
    if image is None:
        # cv2.imread returns None instead of raising when it cannot decode an
        # entry (sub-folder, hidden file, corrupt image).
        print(f"{file}: skipped, not a readable image")
        continue

    print(f"{file}: shape={image.shape}, dtype={image.dtype}")
    heights.append(image.shape[0])
    widths.append(image.shape[1])

    # Resize and save the array exactly as OpenCV decoded it: cv2.imwrite
    # expects OpenCV's native B,G,R(,A) order, so writing the RGB-converted
    # copy would store files with red and blue swapped.
    image_resized = cv2.resize(image, (256, 256))
    success = cv2.imwrite(os.path.join(RESIZED_DIR, file), image_resized)
    print(f"resize and save successful: {success}")

    # matplotlib expects R,G,B(,A) order, so convert only the in-memory copy.
    if image.ndim == 2:
        # Single-channel file: IMREAD_UNCHANGED yields a 2-D array.
        image_rgb = cv2.cvtColor(image, cv2.COLOR_GRAY2RGBA)
    else:
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA)
    images.append(image_rgb)

    plt.imshow(image_rgb)
    plt.axis("off")
    plt.show()


In [None]:
# Distribution of the image heights and widths (array rows / columns)
# collected while loading the training images.
fig, (ax_height, ax_width) = plt.subplots(1, 2, figsize=(12, 5))

ax_height.hist(heights, bins=20)
ax_height.set(title="Height Distribution", xlabel="height (px)", ylabel="number of images")

ax_width.hist(widths, bins=20)
ax_width.set(title="Width Distribution", xlabel="width (px)", ylabel="number of images")

plt.show()

In [None]:
# Per-channel intensity histogram aggregated over every loaded image.
#
# Two things to be aware of:
#   * `images` holds arrays converted to R,G,B,A order when they were loaded,
#     so channel 0 is red, 1 is green and 2 is blue.
#   * cv2.calcHist numbers channels across the whole list it receives
#     (0..c-1 belong to the first image, the next c to the second, ...), so
#     asking for channels 0/1/2 of `images` only ever covers the first image.
#     One histogram is therefore computed per image and the counts are summed.
channel_totals = {}
for channel, key in enumerate(("r", "g", "b")):
    # float64 accumulator: per-image counts are float32 and would lose
    # precision once the running total gets large.
    total = np.zeros((256, 1), dtype=np.float64)
    for img in images:
        total += cv2.calcHist([img], [channel], None, [256], [0, 256])
    channel_totals[key] = total

hist_r = channel_totals["r"]
hist_g = channel_totals["g"]
hist_b = channel_totals["b"]

plt.figure(figsize=(10, 5))
plt.plot(hist_b, color="b", label="Blue Channel")
plt.plot(hist_g, color="g", label="Green Channel")
plt.plot(hist_r, color="r", label="Red Channel")
plt.title("Color Histogram")
plt.xlabel("pixel intensity")
plt.ylabel("pixel count")
plt.legend()  # the labels above are only shown once a legend is requested
plt.grid()
plt.show()

In [None]:
# Load the training labels and derive each image's path relative to the
# train/ image folder by dropping the literal "train/" text from image_path.
# The vectorised .str accessor replaces the row-by-row apply; regex=False keeps
# "train/" a plain substring, matching Python's str.replace.
df = pd.read_csv(os.path.join("..", "data", "raw", "train.csv")).assign(
    local_img_path=lambda frame: frame["image_path"].str.replace("train/", "", regex=False)
)
df.head()

In [None]:
# Overlay one histogram of `target` values per target name on a shared axis.
fig, ax = plt.subplots(figsize=(12, 5))
for target_name, rows in df.groupby("target_name"):
    ax.hist(rows["target"], label=target_name)

ax.legend()
plt.show()

In [None]:
# For every target name, show the five images with the largest `target`
# values (top row) and the five with the smallest (bottom row).
targets = df["target_name"].unique()
img_dir = os.path.join("..", "data", "raw", "train")

for target in targets:
    target_rows = df[df["target_name"] == target]
    top_5 = target_rows.nlargest(5, "target")
    bot_5 = target_rows.nsmallest(5, "target")

    fig, axes = plt.subplots(2, 5, figsize=(20, 8))
    fig.suptitle(f"{target}: high biomass vs low biomass", fontsize=24)

    # Switch the frames off everywhere so a slot left empty (fewer than five
    # rows, or an unreadable file) stays blank instead of showing empty axes.
    for ax in axes.ravel():
        ax.axis("off")

    # Row 0 of the grid is the top five, row 1 the bottom five.
    for grid_row, subset in enumerate((top_5, bot_5)):
        for col, (_, row) in enumerate(subset.iterrows()):
            img = cv2.imread(os.path.join(img_dir, row["local_img_path"]), cv2.IMREAD_COLOR)
            if img is None:
                # cv2.imread returns None rather than raising when the file
                # is missing or cannot be decoded.
                print(f"could not read image: {row['local_img_path']}")
                continue
            ax = axes[grid_row, col]
            # OpenCV decodes to B,G,R; matplotlib expects R,G,B.
            ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            ax.set_title(f"{float(row['target']):.2f}")
    plt.show()

# Printed once after all figures; it used to sit inside the loop and repeat
# after every figure.
print(targets)
    
    




In [None]:
# Another look at the first five rows of df before it is reshaped below.
df.head(5)

In [None]:
# Pivot df so each image_path becomes one row and each target_name becomes a
# column holding its `target` value; "first" keeps the first value whenever an
# (image_path, target_name) pair occurs more than once.
wide_df = (
    df.pivot_table(
        values="target",
        index="image_path",
        columns="target_name",
        aggfunc="first",
    )
    .reset_index()
)
wide_df.head()

In [None]:
# Pairwise correlation between the per-target columns of the wide table,
# shown as an annotated heatmap (image_path is an identifier, not a feature).
target_values = wide_df.drop(columns=["image_path"])
corr = target_values.corr()
sns.heatmap(corr, cmap="coolwarm", annot=True)

In [None]:
from pathlib import Path

def count_files(folder):
    """Return the number of regular files under ``folder``, searched recursively.

    ``folder`` may be a string or a ``Path``. Directories themselves are not
    counted, only entries for which ``is_file()`` is true.
    """
    total = 0
    for entry in Path(folder).rglob("*"):
        if entry.is_file():
            total += 1
    return total

# Sanity check: the resized folder should contain as many files as the raw one.
dir1, dir2 = Path("../data/raw/train"), Path("../data/resized/train")
count1, count2 = count_files(dir1), count_files(dir2)

for folder, n_files in ((dir1, count1), (dir2, count2)):
    print(f"{folder}: {n_files} files")
print("Same count?", count1 == count2)