
## 1. Importing Libraries and Defining Paths

This cell imports essential libraries for the project and sets up configuration paths.

### Libraries Imported:
- `os`: For interacting with the operating system.
- `numpy`: For numerical operations.
- `pandas`: For data manipulation.
- `rasterio`: For reading and writing geospatial raster data.
- `subprocess`: For running subprocesses.

In [1]:
import os
import cv2
import json
import math
import pathlib
import rasterio
import subprocess
import keras
import numpy as np
import pandas as pd
from config import *
import earthpy.plot as ep
import earthpy.spatial as es
from dataset import read_img, transform_data
from rasterio.plot import show
import matplotlib.pyplot as plt
from rasterio.windows import Window
from matplotlib import pyplot as plt
from dataset import data_csv_gen, patch_images

dir_name

'vv'

### Paths:
- Sets paths for training, validation, and testing datasets.
- Sets paths for storing outputs and logging.


In [12]:
if not (os.path.exists(train_dir)):
    data_csv_gen() # create csv files if there is no csv

# provide csv directory
train_df = pd.read_csv(train_dir)
test_df = pd.read_csv(test_dir)
valid_df = pd.read_csv(valid_dir)

TypeError: stat: path should be string, bytes, os.PathLike or integer, not dict

In [9]:
if not (os.path.exists(p_train_dir)):
    # create json file if there is no csv
    patch_images(train_df, "train_patch_phr_cb_") 
    patch_images(valid_df, "valid_patch_phr_cb_")
    patch_images(test_df, "test_patch_phr_cb_")

# provide json directory
p_train_dir = p_train_dir
p_valid_dir = p_train_dir
p_test_dir = p_test_dir

## 2. Counting Images in Datasets

This cell prints the total number of images in the training, testing, and validation datasets.

### Outputs:
- Total number of training images.
- Total number of test images.
- Total number of validation images.

In [10]:
print("Total Number of images before patchify")
print(f"Training images = {len(train_df)}")
print(f"Validation images = {len(valid_df)}")
print(f"Test images = {len(test_df)}")

Total Number of images before patchify
Training images = 46
Validation images = 6
Test images = 6


In [11]:
with p_train_dir.open() as j:
    train_dir = json.loads(j.read())
with p_valid_dir.open() as j:
    valid_dir = json.loads(j.read())
with p_test_dir.open() as j:
    test_dir = json.loads(j.read())
            
train_features = train_dir["feature_ids"]
valid_features = valid_dir["feature_ids"]
test_features = valid_dir["feature_ids"]

print("Total Number of images after patchify")
print(f"Training images = {len(train_features)}")
print(f"Validation images = {len(valid_features)}")
print(f"Test images = {len(test_features)}")

Total Number of images after patchify
Training images = 26
Validation images = 26
Test images = 26


## 3. Checking Class Balance & Unique Values of Mask

This cell defines a function to check the class percentage in the full dataset.

### Function: `class_balance_check(patchify, data_dir)`
- **Parameters**:
  - `patchify` (bool): TRUE if class balance is to be checked for patchify experiments.
  - `data_dir` (str): Directory where data files are saved.
- **Returns**: Class percentage.
- **Prints**:
  - Class pixel percentage.
  - Unique values in the mask.

In [6]:
def class_balance_check(patchify, data_dir):
    """
    Summary:
        Checking class percentage in the full dataset
    Arguments:
        patchify (bool): TRUE if want to check class balance for patchify experiments
        data_dir (str): directory where data files are saved 
    Return:
        Class percentage
    """
    if patchify:
        with open(data_dir, "r") as j:
            train_data = json.loads(j.read())
        labels = train_data["masks"]
        patch_idx = train_data["patch_idx"]
    else:
        train_data = data_dir
        labels = train_data.masks.values
        patch_idx = None

    total = 0
    class_name = {}

    for i in range(len(labels)):
        with rasterio.open(labels[i]) as msk:
            mask = msk.read(1)

        if patchify:
            idx = patch_idx[i]
            mask = mask[idx[0] : idx[1], idx[2] : idx[3]]

        total_pix = mask.shape[0] * mask.shape[1]
        total += total_pix

        dic = {}
        keys = np.unique(mask)
        for key in keys:
            dic[key] = np.count_nonzero(mask == key)

        for key, value in dic.items():
            if key in class_name.keys():
                class_name[key] = value + class_name[key]
            else:
                class_name[key] = value

    for key, val in class_name.items():
        class_name[key] = (val / total) * 100

    print("Class percentage:")
    for key, val in class_name.items():
        print("class pixel: {} = {}".format(key, val))
    print(f"Unique value in the mask {class_name.keys()}")


This cell runs the `class_balance_check` function on the dataset.

### Outputs:
- Class percentage for each class in the dataset.


In [7]:
print("--------------------------------------------------------------")
print("Class percentage of training data before patch")
class_balance_check(patchify = False,
                    data_dir = train_df)
print("--------------------------------------------------------------")
print("Class percentage of training data after patch")
class_balance_check(patchify = True,
                    data_dir = p_train_dir)
print("--------------------------------------------------------------")

--------------------------------------------------------------
Class percentage of training data before patch
Class percentage:
class pixel: 0.0 = 70.08874934652577
class pixel: 1.0 = 27.745041639908504
class pixel: 2.0 = 2.166209013565727
Unique value in the mask dict_keys([0.0, 1.0, 2.0])
--------------------------------------------------------------
Class percentage of training data after patch
Class percentage:
class pixel: 0.0 = 52.816244272085335
class pixel: 1.0 = 43.711702640240006
class pixel: 2.0 = 3.4720530876746545
Unique value in the mask dict_keys([0.0, 1.0, 2.0])
--------------------------------------------------------------


## 4. Checking Unique Height and Width of Images


This cell defines a function `check_height_width` to check and print unique heights and widths of images and masks in a dataset.

### Function: `check_height_width(data_dir)`
- **Parameters**: 
  - `data_dir` (str): Path to the CSV file.
- **Process**:
  - Reads the CSV file.
  - Extracts image and mask paths.
  - Iterates through the images and masks to find unique shapes.
  - Prints the shapes of the dataset, input images, and masks.

In [6]:
def check_height_width(data_dir):
    """
    Summary:
        check unique hight and width of images from dataset
    Arguments:
        data_dir (str): path to csv file
    Return:
        print all the unique height and width
    """

    data = pd.read_csv(data_dir)

    print("Dataset:  ", data.shape)

    input_img = data.feature_ids.values
    input_mask = data.masks.values

    input_img_shape = []
    input_mask_shape = []

    for i in range(len(input_img)):
        with rasterio.open(input_img[i]) as im:
            img = im.read()
        with rasterio.open(input_mask[i]) as msk:
            mask = msk.read()
        # img = cv2.imread(input_img[i])
        # mask = cv2.imread(input_mask[i])
        print(f"Shape for:{i} image Shape:{img.shape}    mask shape:{mask.shape}")

        if img.shape not in input_img_shape:
            input_img_shape.append(img.shape)

        if mask.shape not in input_mask_shape:
            input_mask_shape.append(mask.shape)
            
    print("Input image shapes: ", input_img_shape)
    print("Input mask shapes: ", input_mask_shape)

In [7]:
print("----------------------------------------------------------------------")
print("Unique height and width of training dataset")
check_height_width(train_dir)
print("----------------------------------------------------------------------")

print("Unique height and width of testing dataset")
check_height_width(test_dir)
print("----------------------------------------------------------------------")

print("Unique height and width of validation dataset")
check_height_width(valid_dir)
print("----------------------------------------------------------------------")


----------------------------------------------------------------------
Unique height and width of training dataset
Dataset:   (58, 2)
Shape for:0 image Shape:(3, 2048, 2048)    mask shape:(1, 2048, 2048)
Shape for:1 image Shape:(3, 2048, 2048)    mask shape:(1, 2048, 2048)
Shape for:2 image Shape:(3, 2048, 2048)    mask shape:(1, 2048, 2048)
Shape for:3 image Shape:(3, 2048, 2048)    mask shape:(1, 2048, 2048)
Shape for:4 image Shape:(3, 2048, 2048)    mask shape:(1, 2048, 2048)
Shape for:5 image Shape:(3, 2048, 2048)    mask shape:(1, 2048, 2048)
Shape for:6 image Shape:(3, 2048, 2048)    mask shape:(1, 2048, 2048)
Shape for:7 image Shape:(3, 2048, 2048)    mask shape:(1, 2048, 2048)
Shape for:8 image Shape:(3, 2048, 2048)    mask shape:(1, 2048, 2048)
Shape for:9 image Shape:(3, 2048, 2048)    mask shape:(1, 2048, 2048)
Shape for:10 image Shape:(3, 2048, 2048)    mask shape:(1, 2048, 2048)
Shape for:11 image Shape:(3, 2048, 2048)    mask shape:(1, 2048, 2048)
Shape for:12 image Shape

## 5. Plotting Metrics from CSV Files

This cell defines functions to handle CSV files and plot metrics against epochs.

### Functions:
- `return_csv_from_path`: Returns a list of CSV file paths from a directory.
- `_plot_from_csv`: Plots specified columns from a CSV file against epochs.
- `plot_metrics_vs_epochs`: Plots metrics from a CSV file against epochs using `_plot_from_csv`.
- `plot_metric_vs_epochs_vs_models`: Plots a specific metric against epochs for different models and saves the combined results.

In [7]:
def return_csv_from_path(csv_path=csv_logger_path):
    csv_list = []
    # Iterate through each subdirectory
    for folder in csv_path.iterdir():
        # Check if the entry is a directory
        if folder.is_dir():
            # Iterate through files in the subdirectory
            for file in folder.iterdir():
                # Check if the entry is a file
                if file.is_file():
                    csv_list.append(file)
    return csv_list

def _plot_from_csv(csv_path, name, x_axis_name, y_axis_name, columns_to_plot=None):
    pathlib.Path((root_dir / "logs" / "plots" / "metrics_plots")).mkdir(parents=True, exist_ok=True)
    df = pd.read_csv(csv_path)
    epochs = df['epoch']
    if columns_to_plot is not None:
        columns_to_plot = columns_to_plot
    else:
        columns_to_plot = df.columns.to_list()[1:]

    plt.figure(figsize=(12, 8))
    for column in columns_to_plot:
        plt.plot(epochs, df[column], label=column, linewidth=3.0, marker="o", markersize=5)

    plt.title(f"{y_axis_name}_over_{x_axis_name}")
    plt.xlabel(x_axis_name)
    plt.ylabel(y_axis_name)
    plt.xticks(epochs.astype(int))
    plt.legend()
    plt.savefig(root_dir / "logs" / "plots" / "metrics_plots" / name)
    plt.show()

def plot_metrics_vs_epochs(csv_path, name, x_axis_name="Epochs", y_axis_name="Metrics_score", columns_to_plot=None):
    _plot_from_csv(csv_path=csv_path, name=name, x_axis_name=x_axis_name, y_axis_name=y_axis_name, columns_to_plot=columns_to_plot)

def plot_metric_vs_epochs_vs_models(metric_name="my_mean_iou"):
    pathlib.Path((root_dir / "logs" / "plots" / "csv_for_plotting")).mkdir(parents=True, exist_ok=True)
    csv_list = return_csv_from_path()
    result_df = pd.DataFrame()
    for csv_path in csv_list:
        df = pd.read_csv(csv_path)
        result_df[os.path.basename(csv_path)] = df[metric_name]
    result_df.index.name = "epoch"
    result_df.to_csv(os.path.join(root_dir / "logs" / "plots" / "csv_for_plotting" / f"{metric_name}_vs_epoch.csv"), encoding='utf-8', index=True, header=True)
    _plot_from_csv(root_dir / "logs" / "plots" / "csv_for_plotting" / f"{metric_name}_vs_epoch.csv", x_axis_name="Epochs", y_axis_name=metric_name, name=metric_name)


In [None]:
plot_metrics_vs_epochs(csv_logger_path / "planet-2" / "planet-2_ex_2024-07-13_e_4000_p_2048_s_1024_nsr-1_dtype_nsr-1.csv", name='metrics')
plot_metrics_vs_epochs(csv_logger_path / "planet-2" / "planet-2_ex_2024-07-13_e_4000_p_2048_s_1024_nsr-1_dtype_nsr-1.csv", name='metrics', columns_to_plot=["my_mean_iou"])
plot_metric_vs_epochs_vs_models()
plot_metric_vs_epochs_vs_models(metric_name="my_mean_iou")

## 6. Displaying and Saving All Images and Masks

Defines `pct_clip` & `false_colour` to read an image with dynamic shape and apply percentage clipping to each channel.

only for csv files to read and plot


In [3]:
def pct_clip(array, pct=[2.5, 97.5]):
    array_min, array_max = np.nanpercentile(array, pct[0]), np.nanpercentile(array, pct[1])
    clip = (array - array_min) / (array_max - array_min)
    clip[clip > 1] = 1
    clip[clip < 0] = 0
    return clip

def false_colour(path):
    with rasterio.open(path) as src:
        h,w = src.shape
        img = np.zeros((3,h,w))
        print(img.shape)
        for i in range(3):
            img[i,:,:]= pct_clip(src.read(i+1))
            
    return img, src

- **Parameters**:
  - `data`: Data file holding image paths.
  - `name` (str): Path to save images.
- **Process**:
  - Reads and processes each image and mask.
  - Displays images and masks in a figure.
  - Saves the figure to the specified directory.


In [4]:
def display_all(data, name, visualization_dir):
    """
    Summary:
        save all images into single figure
    Arguments:
        data : data file holding images path
        directory (str) : path to save images
    Return:
        save images figure into directory
    """
    
    pathlib.Path(visualization_dir / 'display').mkdir(parents=True, exist_ok=True)
    pathlib.Path(visualization_dir / "display"/"train").mkdir(parents=True, exist_ok=True)
    pathlib.Path(visualization_dir / "display"/"test").mkdir(parents=True, exist_ok=True)
    pathlib.Path(visualization_dir / "display"/"valid").mkdir(parents=True, exist_ok=True)

    for i in range(len(data)):
        image,src = false_colour(data.feature_ids.values[i])
        print("................................")
        print(np.mean(image),np.std(image))
        print("................................")
        mask = read_img(data.masks.values[i], label=True)
        print("................................")
        print(f"image_shape: {image.shape}")
        print(f"mask_shape: {mask.shape}")
        print("................................")
        id = data.feature_ids.values[i].split("/")[-1]
        display_list = {"image": image, "label": mask}

        plt.figure(figsize=(12, 8))
        title = list(display_list.keys())

        for i in range(len(display_list)):
            plt.subplot(1, len(display_list), i + 1)
            plt.title(title[i])
            if title[i]=='image':
                ax = plt.gca()
                show(display_list[title[i]],transform=src.transform, ax=ax)
            else:
                # plt.imshow((display_list[title[i]]), cmap="gray")
                plt.imshow((display_list[title[i]]))
            plt.axis("off")

        img_name = "{}.png".format(id)  # create file name to save
        plt.savefig(
            os.path.join((visualization_dir / "display"/ name), img_name),
            bbox_inches="tight",
            dpi=800,
        )
        plt.clf()
        plt.cla()
        plt.close()

Displays and saves training images and masks using the `display_all` function.


In [5]:
visualization_dir = pathlib.Path(f"/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/signal_based_data/{dir_name}")

print(f"displaying training images and masks for {dir_name} \n")

# display_all(data = train_df,
#             name = "train",
#             visualization_dir = visualization_dir)

# display_all(data = valid_df,
#             name = "valid",
#             visualization_dir = visualization_dir)

display_all(data = test_df,
            name = "test",
            visualization_dir = visualization_dir)

displaying training images and masks for swir 

(3, 3379, 3286)
................................
0.5495073733089328 0.22675192383514214
................................
................................
image_shape: (3, 3379, 3286)
mask_shape: (3379, 3286)
................................


## 7. Calculating Mean & Standard Deviation

Loads training dataset CSV and defines a function to calculate mean and standard deviation for each band of the images.

### Actions:
- Loads training dataset CSV.
- Defines `calculate_stats` to:
  - Read and clip the first three bands of each image.
  - Calculate and print the mean and standard deviation for each band.
- Calls `calculate_stats` with the list of feature image paths.

In [6]:
features_path = train_df["feature_ids"].to_list()

def pct_clip(array, pct=[2.5, 97.5]):
    array_min, array_max = np.nanpercentile(array, pct[0]), np.nanpercentile(array, pct[1])
    clip = (array - array_min) / (array_max - array_min)
    clip[clip > 1] = 1
    clip[clip < 0] = 0
    return clip

def calculate_stats(file_paths):
    all_data1 = []
    all_data2 = []
    all_data3 = []
    for file_path in file_paths:
        with rasterio.open(file_path) as src:
            data1 = pct_clip(src.read(1))  # Read the first band
            all_data1.append(data1)
            data2 = pct_clip(src.read(2))  # Read the second band
            all_data2.append(data2)
            data3 = pct_clip(src.read(3))  # Read the third band
            all_data3.append(data3)

    # Stack all the data into a single numpy array
    stacked_data1 = np.stack(all_data1)
    stacked_data2 = np.stack(all_data2)
    stacked_data3 = np.stack(all_data3)

    # Calculate mean and standard deviation
    mean1 = np.mean(stacked_data1)
    std_dev1 = np.std(stacked_data1)
    mean2 = np.mean(stacked_data2)
    std_dev2 = np.std(stacked_data2)
    mean3 = np.mean(stacked_data3)
    std_dev3 = np.std(stacked_data3)
    
    return mean1, mean2, mean3, std_dev1, std_dev2, std_dev3
    
def calculate_average(file_paths):
    all_data = []
    for file_path in file_paths:
        with rasterio.open(file_path) as src:
            data = pct_clip(src.read()) 
            # print(data.shape)
            all_data.append(data)

    # Stack all the data into a single numpy array
    stacked_data = np.stack(all_data)

    # Calculate mean and standard deviation
    mean = np.mean(stacked_data)
    std_dev = np.std(stacked_data)

    return mean, std_dev

In [7]:
mean1, mean2, mean3, std_dev1, std_dev2, std_dev3 = calculate_stats(features_path)
mean, std_dev = calculate_average(features_path)


print("--------------------------------------------------")
print("Average mean across all files:", mean)
print("Standard deviation across all files:", std_dev)
print("--------------------------------------------------")
print("1st band (mean):", mean1)
print("1st band (std):", std_dev1)
print("--------------------------------------------------")
print("2nd band (mean):", mean2)
print("2nd band (std):", std_dev2)
print("--------------------------------------------------")
print("3rd band (mean):", mean3)
print("3rd band (std):", std_dev3)
print("--------------------------------------------------")

--------------------------------------------------
Average mean across all files: 0.500127772004659
Standard deviation across all files: 0.28518292573178594
--------------------------------------------------
1st band (mean): 0.47304764605328536
1st band (std): 0.2846765337200112
--------------------------------------------------
2nd band (mean): 0.5454271347146809
2nd band (std): 0.26668836613062186
--------------------------------------------------
3rd band (mean): 0.4577998437536517
3rd band (std): 0.28769463044461874
--------------------------------------------------


## 8. Tiles Generation

Defines `save_tiles` to split large images into smaller tiles and save them.

### Function: `save_tiles(path, out_path, tiles_size=2048, stride=1024)`
- **Parameters**:
  - `path`: Directory with original images.
  - `out_path`: Directory to save the tiles.
  - `tiles_size`: Size of each tile.
  - `stride`: Stride for tiling.
- **Process**: Iterates through images, splits them into tiles, and saves the tiles.

In [3]:
def save_tiles(path, out_path, tiles_size=2048, stride=1024):
    os.makedirs(out_path, exist_ok=True)
    
    # Iterate over each file in the path
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        with rasterio.open(file_path) as src:
            # Get metadata and calculate number of tiles in each dimension
            meta = src.meta
            meta["height"]= tiles_size
            meta["width"]= tiles_size
            # print(meta)
            height, width = src.shape
            num_rows = math.ceil((height - tiles_size) / stride + 1)
            num_cols = math.ceil((width - tiles_size) / stride + 1)
            total_tiles = num_rows* num_cols
            print(f"shape of the image before tiles : {src.shape}")
            print(f"number of tiles={total_tiles}")
            print("..................................................")
            # Iterate over each tile
            for row in range(num_rows):
                for col in range(num_cols):
                    # Calculate window coordinates
                    row_start = row * stride
                    row_stop = min(row_start + tiles_size, height)
                    col_start = col * stride
                    col_stop = min(col_start + tiles_size, width)
                    
                    # Read the tile data
                    # window = Window(x0, y0, x1 - x0, y1 - y0)
                    window = Window.from_slices((row_stop-stride, row_stop), (col_stop-stride, col_stop))
                    tile_data = src.read(window=window)
                    # print("...........")
                    # print(tile_data.shape)
                    # Save the tile with a suffix of tile id
                    out_filename = f"tile_{row}_{col}_{os.path.splitext(filename)[0]}.tif"
                    out_file_path = os.path.join(out_path, out_filename)
                    with rasterio.open(out_file_path, 'w', **meta) as dst:
                        dst.write(tile_data)

In [6]:
path = f"/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/signal_based_data/{dir_name}-full/input"
out_path = f"/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/signal_based_data/{dir_name}/input"

print(path)
print(out_path)

/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/signal_based_data/vv-full/input
/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/signal_based_data/vv/input


In [7]:
save_tiles(path, out_path)

shape of the image before tiles : (1962, 2769)
number of tiles=2
..................................................
shape of the image before tiles : (1962, 2769)
number of tiles=2
..................................................
shape of the image before tiles : (3391, 3260)
number of tiles=9
..................................................
shape of the image before tiles : (3379, 3286)
number of tiles=9
..................................................
shape of the image before tiles : (3406, 3286)
number of tiles=9
..................................................
shape of the image before tiles : (3379, 3286)
number of tiles=9
..................................................
shape of the image before tiles : (3391, 3260)
number of tiles=9
..................................................
shape of the image before tiles : (3406, 3286)
number of tiles=9
..................................................


In [None]:
def rename_files(datapath):
    # List all files in the directory
    files = os.listdir(datapath)
    
    for filename in files:
        # Extract the file extension
        _, ext = os.path.splitext(filename)
        
        # Check if the filename starts with DEM_ab.tif
        if filename.startswith("DEM_"):
            new_filename = filename.replace("DEM_", "").replace(".tif", "_nasadem.tif")
        
        # Check if the filename starts with VV_ab.tif
        elif filename.startswith("VV_"):
            new_filename = filename.replace("VV_", "").replace(".tif", "_vv.tif")
        
        # Check if the filename starts with VH_ab.tif
        elif filename.startswith("VH_"):
            new_filename = filename.replace("VH_", "").replace(".tif", "_vh.tif")
        
        # Check if the filename starts with GT_ab.tif
        elif filename.startswith("GT_"):
            new_filename = filename.replace("GT_", "")
        
        else:
            # If none of the conditions are met, skip this file
            raise ValueError("files_name_mismatch")
        
        # Construct the new filepath
        new_filepath = os.path.join(datapath, new_filename)
        
        # Rename the file
        os.rename(os.path.join(datapath, filename), new_filepath)
        print(f"Renamed {filename} to {new_filename}")


Calls the `rename_files` function to rename files in the specified dataset directory.


In [None]:
datapath = config.dataset_dir
rename_files(datapath)

## 9. Save all Output in Visualization in rtf

Runs a visualization script and saves the terminal output to an RTF file.

### Actions:
- Executes the `visualization.py` script using a terminal command.
- Captures the terminal output.
- Saves the output to `data_statistics.rtf`.

In [22]:
# Run the command in the terminal
command = "python visualization.ipynb"
result = subprocess.run(command, shell=True, capture_output=True, text=True)

# Get the terminal output
terminal_output = result.stdout

# Save the output to an RTF file
rtf_filename = "data_statistics.rtf"
with open(rtf_filename, "w") as rtf_file:
    # rtf_file.write("{\\rtf1\\ansi\n")
    rtf_file.write(terminal_output)
    # rtf_file.write("}")

print(f"Terminal output saved to {rtf_filename}")

Terminal output saved to data_statistics.rtf
