In [1]:
import os
import cv2
import json
import config
import pathlib
import math
import rasterio
from rasterio.windows import Window
import numpy as np
import pandas as pd
from config import *
import earthpy.plot as ep
import earthpy.spatial as es
from dataset import read_img
from matplotlib import pyplot as plt
import subprocess
import pyperclip

import rasterio as rio
from rasterio.plot import show
import matplotlib.pyplot as plt
import numpy as np

# train_df = pd.read_csv(config.train_dir)
# test_df =  pd.read_csv(config.test_dir)
# valid_df = pd.read_csv(config.valid_dir)
# p_train_json = config.p_train_dir
# p_test_json = config.p_test_dir
# p_valid_json = config.p_valid_dir

In [2]:
# Load the split tables in this cell so it also runs on a fresh kernel: the
# loading lines in the import cell are commented out, which would otherwise
# leave train_df / test_df / valid_df undefined here.
train_df = pd.read_csv(config.train_dir)
test_df = pd.read_csv(config.test_dir)
valid_df = pd.read_csv(config.valid_dir)

print(f"Total number of training images = {len(train_df)}")
print(f"Total number of test images = {len(test_df)}")
print(f"Total number of validation images = {len(valid_df)}")

Total number of training images = 430
Total number of test images = 54
Total number of validation images = 54


In [3]:
def class_balance_check(patchify, data_dir):
    """
    Summary:
        compute and print the percentage of pixels per class over a dataset
    Arguments:
        patchify (bool): TRUE to measure the patchified dataset; data_dir is
                         then a JSON file with "masks" and "patch_idx" entries
        data_dir (str): JSON file (patchify=True) or CSV file with a "masks"
                        column (patchify=False)
    Return:
        dict mapping every value found in the masks to its percentage of all
        counted pixels
    """
    if patchify:
        with open(data_dir, "r") as j:
            train_data = json.load(j)
        labels = train_data["masks"]
        patch_idx = train_data["patch_idx"]
    else:
        train_data = pd.read_csv(data_dir)
        labels = train_data.masks.values
        patch_idx = None

    total = 0
    pixel_counts = {}

    for sample, label_path in enumerate(labels):
        with rasterio.open(label_path) as msk:
            mask = msk.read(1)
        if patchify:
            # patch_idx entries are [row_start, row_stop, col_start, col_stop]
            idx = patch_idx[sample]
            mask = mask[idx[0] : idx[1], idx[2] : idx[3]]

        total += mask.size

        # One pass over the mask yields every class together with its count.
        values, counts = np.unique(mask, return_counts=True)
        for value, count in zip(values, counts):
            pixel_counts[value] = pixel_counts.get(value, 0) + int(count)

    class_name = {key: (val / total) * 100 for key, val in pixel_counts.items()}

    print("Class percentage:")
    for key, val in class_name.items():
        print("class pixel: {} = {}".format(key, val))
    print(f"unique value in the mask {class_name.keys()}")

    return class_name

In [5]:
# Class distribution of the training masks: first for the full images listed
# in the training CSV, then for the patches listed in the patchify JSON.
print("class percentage of traning data before patch")
class_balance_check(patchify=False, data_dir=config.train_dir)
print(".........................................................................................")
print("class percentage of traning data after patch")
class_balance_check(patchify=True, data_dir=config.p_train_dir)

class percentage of traning data before patch
Class percentage:
class pixel: 2.0 = 75.63027670217114
class pixel: 1.0 = 24.369723297828852
unique value in the mask dict_keys([2.0, 1.0])
.........................................................................................
class percentage of traning data after patch
Class percentage:
class pixel: 1.0 = 65.69205356629013
class pixel: 2.0 = 34.30794643370988
unique value in the mask dict_keys([1.0, 2.0])


In [7]:
def check_height_width(data_dir):
    """
    Summary:
        check unique (bands, height, width) shapes of the images and masks
        listed in a dataset CSV
    Arguments:
        data_dir (str): path to csv file with "feature_ids" and "masks" columns
    Return:
        None; prints the table shape and all unique image and mask shapes
    """

    data = pd.read_csv(data_dir)

    print("Dataset:  ", data.shape)

    input_img_shape = _unique_raster_shapes(data.feature_ids.values)
    input_mask_shape = _unique_raster_shapes(data.masks.values)

    print("Input image shapes: ", input_img_shape)
    print("Input mask shapes: ", input_mask_shape)


def _unique_raster_shapes(paths):
    """Return the distinct (bands, rows, cols) shapes of the rasters, in first-seen order."""
    shapes = {}
    for path in paths:
        with rasterio.open(path) as src:
            # The header already carries the shape that src.read().shape would
            # report, so no pixel data has to be decoded.
            shapes.setdefault((src.count, src.height, src.width))
    return list(shapes)

In [8]:
# Report the unique image/mask shapes of each split, using the CSV files
# configured for training, testing and validation.
print("Unique height and width of training dataset")
check_height_width(config.train_dir)
print(".........................................................................................")
print("Unique height and width of testing dataset")
check_height_width(config.test_dir)
print(".........................................................................................")

print("Unique height and width of validation dataset")
check_height_width(config.valid_dir)

Unique height and width of training dataset
Dataset:   (430, 2)
Input image shapes:  [(3, 512, 512)]
Input mask shapes:  [(1, 512, 512)]
.........................................................................................
Unique height and width of testing dataset
Dataset:   (54, 2)
Input image shapes:  [(3, 512, 512)]
Input mask shapes:  [(1, 512, 512)]
.........................................................................................
Unique height and width of validation dataset
Dataset:   (54, 2)
Input image shapes:  [(3, 512, 512)]
Input mask shapes:  [(1, 512, 512)]


In [7]:
def return_csv_from_path(csv_path=config.csv_logger_path):
    """
    Summary:
        collect the CSV files stored one folder level below csv_path
    Arguments:
        csv_path (pathlib.Path or str): folder whose sub-folders hold the logs
    Return:
        list of pathlib.Path, one per *.csv file, in sorted (deterministic) order
    """
    csv_list = []
    # Sorting keeps the result independent of the filesystem's listing order.
    for folder in sorted(pathlib.Path(csv_path).iterdir()):
        if not folder.is_dir():
            continue
        # Callers hand every returned path to pd.read_csv, so anything that is
        # not a CSV file (figures, checkpoints, ...) is left out.
        csv_list.extend(
            file
            for file in sorted(folder.iterdir())
            if file.is_file() and file.suffix.lower() == ".csv"
        )
    return csv_list
                    

def _plot_from_csv(csv_path, name, x_axis_name, y_axis_name, columns_to_plot=None):
    """
    Summary:
        plot columns of a CSV file against its "epoch" column, save and show the figure
    Arguments:
        csv_path (str or Path): CSV file containing an "epoch" column
        name (str): file name of the figure inside <root_dir>/logs/plots/metrics_plots
        x_axis_name (str): label of the x axis
        y_axis_name (str): label of the y axis
        columns_to_plot (list): columns to draw; None draws every column except "epoch"
    Return:
        None
    """
    plot_dir = config.root_dir / "logs" / "plots" / "metrics_plots"
    pathlib.Path(plot_dir).mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(csv_path)
    epochs = df["epoch"]
    if columns_to_plot is None:
        # Select by name rather than by position so the x values are never
        # plotted against themselves when "epoch" is not the first column.
        columns_to_plot = [column for column in df.columns if column != "epoch"]

    fig, ax = plt.subplots(figsize=(12, 8))
    for column in columns_to_plot:
        ax.plot(epochs, df[column], label=column, linewidth=3.0, marker="o", markersize=5)

    ax.set_title(f"{y_axis_name}_over_{x_axis_name}")
    ax.set_xlabel(x_axis_name)
    ax.set_ylabel(y_axis_name)
    ax.set_xticks(epochs.astype(int))
    if len(columns_to_plot) > 0:
        ax.legend()
    fig.savefig(plot_dir / name)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def plot_metrics_vs_epochs(csv_path, name, x_axis_name= "Epochs", y_axis_name="Metrics_score",columns_to_plot=None):
    """
    Summary:
        plot the metrics stored in one training log against the epoch
    Arguments:
        csv_path (str or Path): training log CSV
        name (str): file name used when saving the figure
        x_axis_name (str): label of the x axis
        y_axis_name (str): label of the y axis
        columns_to_plot (list): columns to draw; None is passed on unchanged
    Return:
        None
    """
    _plot_from_csv(
        csv_path,
        name,
        x_axis_name,
        y_axis_name,
        columns_to_plot,
    )

def plot_metric_vs_epochs_vs_models(metric_name="val_f1_score"):
    """
    Summary:
        gather one metric from every log returned by return_csv_from_path(),
        save the combined table as CSV and plot one curve per log file
    Arguments:
        metric_name (str): column to read from every log file
    Return:
        None
    """
    csv_dir = config.root_dir / "logs" / "plots" / "csv_for_plotting"
    pathlib.Path(csv_dir).mkdir(parents=True, exist_ok=True)
    combined_csv = csv_dir / f"{metric_name}_vs_epoch.csv"

    curves = {}
    for csv_path in return_csv_from_path():
        curves[os.path.basename(csv_path)] = pd.read_csv(csv_path)[metric_name]

    # Building the frame from all series at once aligns them on the union of
    # their row indices; assigning column by column would cut every run down to
    # the length of the first one.
    result_df = pd.DataFrame(curves)
    result_df.index.name = "epoch"
    result_df.to_csv(combined_csv, encoding="utf-8", index=True, header=True)
    _plot_from_csv(combined_csv, x_axis_name="Epochs", y_axis_name=metric_name, name=metric_name)

In [8]:
# Training log of the U-Net experiment; both single-model plots read from it.
unet_log = config.csv_logger_path/"unet"/"unet_ex_training_ep_20.csv"
plot_metrics_vs_epochs(unet_log,name='metrics')
# Saved under its own name: reusing 'metrics' would overwrite the figure
# written by the call above, because the name is the output file name.
plot_metrics_vs_epochs(unet_log,name='metrics_f1_score',columns_to_plot=["f1_score"])
plot_metric_vs_epochs_vs_models()
plot_metric_vs_epochs_vs_models(metric_name="recall")

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/hdd2/mdsamiul/project/imseg_rice/logs/csv_logger/unet/unet_ex_training_ep_20.csv'

In [13]:
def pct_clip(array,pct=(2.5,97.5)):
    """
    Summary:
        percentile stretch: map the pct[0]..pct[1] percentile range of the
        input linearly onto 0..1 and clip everything outside that range
    Arguments:
        array (array-like): values to rescale; NaNs are ignored when the
                            percentiles are computed and stay NaN in the result
        pct (sequence): lower and upper percentile (only the first two entries are used)
    Return:
        numpy.array with the shape of the input and values in [0, 1]
    """
    array = np.asarray(array)
    array_min, array_max = np.nanpercentile(array, [pct[0], pct[1]])
    span = array_max - array_min
    if span == 0 or np.isnan(span):
        # Flat (or all-NaN) input: there is no range to stretch, and dividing
        # by the zero span would fill the result with NaN/inf.
        return np.where(np.isnan(array), np.nan, 0.0)
    return np.clip((array - array_min) / span, 0, 1)

In [None]:
# with rio.open("/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/dataset/input/tile_0_0_vVvHdF_Area1_17-18_Beg.tif") as src:
#     with rio.open(
#             'RGB_Temp.tif', 'w+',
#             driver='GTiff',
#             dtype= rio.float32,
#             count=3,
#             crs = src.crs,
#             width=src.width,
#             height=src.height,
#             transform=src.transform,
#         ) as dst:
#         V = pct_clip(src.read(1))
#         dst.write(V.astype(rio.float32),1)
#         V = pct_clip(src.read(2))
#         dst.write(V.astype(rio.float32),2)
#         V = pct_clip(src.read(3))
#         dst.write(V.astype(rio.float32),3)

In [None]:
# with rio.open("/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/dataset/input/tile_0_0_vVvHdF_Area1_17-18_Beg.tif") as src:
#     # img= np.zeros((3,512,512))
#     img= pct_clip(src.read())

In [14]:
def false_colour_read(path):
    """
    Summary:
        read the first three bands of a raster and percentile-stretch each
        band with pct_clip for display
    Arguments:
        path (str): raster file to read
    Return:
        tuple (img, src): img is a (3, rows, cols) array, src is the dataset
        object the bands were read from
    """
    with rasterio.open(path) as src:
        # Size the buffer from the raster itself instead of assuming 512x512,
        # so rasters of any size can be read.
        img = np.zeros((3, src.height, src.width))
        for band in range(3):
            img[band, :, :] = pct_clip(src.read(band + 1))

    # NOTE(review): src has already left its with-block here, so callers should
    # presumably only use it for metadata such as the transform; confirm.
    return img, src

In [15]:
# A `global` statement at module level has no effect: it neither creates nor
# initialises h and w. In the visible notebook they are only assigned inside
# false_colour_read_bt().
global h,w

In [16]:
def false_colour_read_bt(path):
    """
    Summary:
        read the first three bands of a raster of any size and stretch each
        band with pct_clip; the raster's height and width are also stored in
        the module-level names h and w
    Arguments:
        path (str): raster file to read
    Return:
        tuple (img, src): img is a (3, h, w) array, src is the dataset object
        the bands were read from
    """
    global h, w
    with rasterio.open(path) as src:
        h, w = src.shape
        img = np.zeros((3, h, w))
        # rasterio band numbers start at 1
        for band_number in (1, 2, 3):
            img[band_number - 1] = pct_clip(src.read(band_number))

    return img, src
# print("displaying training images and masks")
# display_all(data=train_df,name="train")

In [85]:
# Channel-last shape of the most recently read raster. Building the tuple
# directly avoids allocating a full h x w x 3 array just to read its shape.
(h, w, 3)

(3379, 3286, 3)

In [None]:
# fig,ax=plt.subplots()
# with rio.open("RGB_Temp.tif") as src2:
#     show(src2.read(),transform=src2.transform,ax=ax)
#     ax.grid(False)  # Turn off gridlines along the borders
#     ax.set_xticks([])  # Remove x-axis ticks
#     ax.set_yticks([])  # Remove y-axis ticks
#     ax.set_xlabel('')  # Remove x-axis label
#     ax.set_ylabel('')  # Remove y-axis label
# plt.show()
# plt.savefig("false_color")

In [42]:
# Base folder for the figures written by display_all(), which reads this
# module-level name when it is called.
# NOTE(review): root_dir is not assigned above this cell in the visible
# notebook; presumably it comes from `from config import *` — confirm.
visualization_dir = root_dir / "data/dataset-nsr-1-bt"

In [18]:
def display_all(data, name):
    """
    Summary:
        save every image/mask pair of a split as one side-by-side figure
    Arguments:
        data : data frame holding the image paths ("feature_ids") and mask
               paths ("masks")
        name (str) : sub-folder of <visualization_dir>/display to save into,
                     e.g. "train", "test" or "valid"
    Return:
        None; one PNG per row is written to <visualization_dir>/display/<name>
    """

    display_root = visualization_dir / "display"
    # Also create the folder for `name` itself so saving works for split names
    # other than the three standard ones.
    for split in ("train", "test", "valid", name):
        pathlib.Path(display_root / split).mkdir(parents=True, exist_ok=True)

    for row in range(len(data)):
        image_path = data.feature_ids.values[row]
        image, src = false_colour_read_bt(image_path)
        mask = read_img(data.masks.values[row], label=True)
        print("................................")
        print(f"image_shape: {image.shape}")
        print(f"mask_shape: {mask.shape}")
        print("................................")
        image_id = image_path.split("/")[-1]
        panels = {"image": image, "label": mask}

        fig = plt.figure(figsize=(12, 8))

        for position, (title, panel) in enumerate(panels.items(), start=1):
            plt.subplot(1, len(panels), position)
            plt.title(title)
            if title == "image":
                # rasterio's show() takes the band-first array plus the
                # transform of the source raster
                show(panel, transform=src.transform, ax=plt.gca())
            else:
                plt.imshow(panel, cmap="gray")
            plt.axis("off")

        prediction_name = "img_id_{}.png".format(image_id)  # create file name to save
        plt.savefig(
            os.path.join((display_root / name), prediction_name),
            bbox_inches="tight",
            dpi=800,
        )
        # Close the figure so memory does not grow with the number of images.
        plt.close(fig)

In [40]:
# Load the train/test/valid split tables of the "dataset-nsr-1-bt" dataset.
# NOTE(review): absolute, machine-specific paths — this cell only runs where
# that directory layout exists.
train_df = pd.read_csv("/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/dataset-nsr-1-bt/data/csv/train.csv")
test_df =  pd.read_csv("/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/dataset-nsr-1-bt/data/csv/test.csv")
valid_df = pd.read_csv("/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/dataset-nsr-1-bt/data/csv/valid.csv")
# p_train_json = config.p_train_dir
# p_test_json = config.p_test_dir
# p_valid_json = config.p_valid_dir

In [20]:
def read_img(directory, in_channels=None, label=False, patch_idx=None, height=256, width=256):
    """
    Summary:
        read image with rasterio; masks are converted to a 0/1 label map,
        features are normalized with the module-level mean and std
    Arguments:
        directory (str): image path to read
        in_channels (int): currently unused; every band of the file is read
        label (bool): TRUE if the given directory is mask directory otherwise False
        patch_idx (list): [row_start, row_stop, col_start, col_stop] of the
                          patch to return; None returns the whole image
        height (int): currently unused
        width (int): currently unused
    Return:
        numpy.array: (rows, cols, 1) int32 mask when label is TRUE, otherwise
        (rows, cols, bands) normalized features
    """

    # for mask images
    if label:
        with rasterio.open(directory) as fmask:
            # rasterio bands are indexed from 1; a single index gives a 2D array
            mask = fmask.read(1)

        # mask value 2 becomes label 0; value 1 already is label 1
        mask[mask == 2.0] = 0
        mask = mask[..., np.newaxis].astype("int32")

        if patch_idx:
            # extract patch from original mask
            return mask[patch_idx[0]:patch_idx[1], patch_idx[2]:patch_idx[3]]
        return mask

    # for feature images: read every band as (bands, rows, cols)
    with rasterio.open(directory) as inp:
        X = inp.read()
    # Move the band axis to the end WITHOUT swapping rows and columns, so the
    # features share the (rows, cols, ...) layout of the mask and of patch_idx.
    X = np.moveaxis(X, 0, -1)
    # NOTE(review): mean and std are module-level names that are not defined in
    # this function; presumably they come from `from config import *` — confirm.
    X = (X - mean) / std
    if patch_idx:
        # extract patch from original features
        return X[patch_idx[0]:patch_idx[1], patch_idx[2]:patch_idx[3], :]
    return X

In [43]:
# Write one image/mask figure per training sample into the "train" sub-folder
# of the display directory.
print("displaying training images and masks")
display_all(data=train_df,name="train")

displaying training images and masks
................................
image_shape: (3, 3379, 3286)
mask_shape: (3379, 3286, 1)
................................
................................
image_shape: (3, 1962, 2769)
mask_shape: (1962, 2769, 1)
................................
................................
image_shape: (3, 3406, 3286)
mask_shape: (3406, 3286, 1)
................................
................................
image_shape: (3, 3379, 3286)
mask_shape: (3379, 3286, 1)
................................
................................
image_shape: (3, 1962, 2769)
mask_shape: (1962, 2769, 1)
................................
................................
image_shape: (3, 3406, 3286)
mask_shape: (3406, 3286, 1)
................................


In [48]:
# Scratch check: shows the resolved dataset path; the result is not assigned.
root_dir / "data/dataset-nsr-3-bt"

PosixPath('/mnt/hdd2/mdsamiul/project/imseg_rice/data/dataset-nsr-3-bt')

In [44]:
# Write one image/mask figure per test sample into the "test" sub-folder of
# the display directory.
print("displaying testing images and masks")
display_all(data=test_df, name = "test")

displaying testing images and masks
................................
image_shape: (3, 3391, 3260)
mask_shape: (3391, 3260, 1)
................................


In [45]:
# Write one image/mask figure per validation sample into the "valid"
# sub-folder of the display directory.
print("displaying validation images and masks")
display_all(data=valid_df, name= "valid")

displaying validation images and masks
................................
image_shape: (3, 3391, 3260)
mask_shape: (3391, 3260, 1)
................................


In [57]:
# Paths of the training feature rasters of "dataset-nsr-1"; they are the input
# of the statistics computed in this cell.
# NOTE(review): this rebinds train_df, which earlier cells use for another dataset.
train_df = pd.read_csv("/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/dataset-nsr-1/data/csv/train.csv")
features_path = train_df["feature_ids"].to_list()
def calculate_stats(file_paths):
    """
    Summary:
        print the mean and standard deviation of each of the first three
        bands, computed over all pixels of all given rasters
    Arguments:
        file_paths (list): raster files to read; they may differ in size
    Return:
        None; the statistics are printed
    """
    band_labels = ("1st", "2nd", "3rd")
    band_values = [[] for _ in band_labels]

    for file_path in file_paths:
        with rasterio.open(file_path) as src:
            # rasterio band numbers start at 1
            for band_number, values in enumerate(band_values, start=1):
                # Flatten each band so rasters of different sizes can be pooled.
                values.append(src.read(band_number).ravel())

    for label, values in zip(band_labels, band_values):
        pooled = np.concatenate(values)
        print(f"Average mean across {label} band:", np.mean(pooled))
        print(f"Standard deviation across {label} band:", np.std(pooled))

# Print the band statistics of the training feature rasters
calculate_stats(features_path)




Average mean across 1st band: 0.21080770438931093
Standard deviation across 1st band: 0.08680501088155813
Average mean across 2bd band: 0.16065762694462385
Standard deviation across 2nd band: 0.07915457728508787
Average mean across 3rd band: 0.07456150900220794
Standard deviation across 3rd band: 0.037746545995537006


In [58]:
# /mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/dataset-nsr-3/data/csv
# Paths of the training feature rasters of "dataset-nsr-1"; they are the input
# of the overall statistics computed in this cell.
train_df = pd.read_csv("/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/dataset-nsr-1/data/csv/train.csv")
features_path = train_df["feature_ids"].to_list()
def calculate_stats(file_paths):
    """
    Summary:
        mean and standard deviation over every pixel of every band of the
        given rasters, taken as one pooled population
    Arguments:
        file_paths (list): raster files to read; all must share one shape
    Return:
        tuple (mean, std_dev)
    """

    def _load_all_bands(file_path):
        # read() without an index returns all bands as one 3D array
        with rasterio.open(file_path) as src:
            return src.read()

    # One array holding every raster: (files, bands, rows, cols)
    stacked_data = np.stack([_load_all_bands(file_path) for file_path in file_paths])

    return np.mean(stacked_data), np.std(stacked_data)

# Overall statistics of the training feature rasters.
# The results get their own names: binding them to `mean` would replace the
# module-level `mean` that read_img() uses to normalise features.
overall_mean, overall_std = calculate_stats(features_path)

print("Average mean across all files:", overall_mean)
print("Standard deviation across all files:", overall_std)


Average mean across all files: 0.14867561344538044
Standard deviation across all files: 0.0907785301413935


In [None]:
# Output root for the tiling experiment. `pathlib` is imported at the top of
# the notebook, whereas a bare `Path` is not imported anywhere in the visible cells.
# NOTE(review): this rebinds root_dir for every later cell that uses it.
root_dir = pathlib.Path("/mnt/hdd2/mdsamiul/project/dataset/trial")

In [25]:
def save_tiles(path, out_path, tiles_size=2048, stride=1024):
    """
    Summary:
        cut every raster found in a folder into tiles and save each tile as
        its own GeoTIFF named tile_<row>_<col>_<source name>.tif
    Arguments:
        path (str): folder containing the rasters to tile
        out_path (str or Path): folder the tiles are written to (created if missing)
        tiles_size (int): height and width of a tile in pixels
        stride (int): step in pixels between the starts of neighbouring tiles
    Return:
        None; tiles are written to out_path

    The last tile of a row/column is shifted back so it ends on the image
    border. An image smaller than tiles_size along an axis yields a single
    tile that is clipped to the image extent along that axis.
    """
    os.makedirs(out_path, exist_ok=True)

    # Iterate over each file in the path
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        with rasterio.open(file_path) as src:
            base_meta = src.meta
            height, width = src.shape
            row_spans = _tile_spans(height, tiles_size, stride)
            col_spans = _tile_spans(width, tiles_size, stride)
            total_tiles = len(row_spans) * len(col_spans)
            print(f"shape of the image before tiles : {src.shape}")
            print(f"number of tiles={total_tiles}")
            print("..................................................")

            for row, (row_start, row_stop) in enumerate(row_spans):
                for col, (col_start, col_stop) in enumerate(col_spans):
                    # The window spans a full tile (tiles_size), not one stride.
                    window = Window.from_slices((row_start, row_stop), (col_start, col_stop))
                    tile_data = src.read(window=window)

                    # Describe the tile itself: its real size and the
                    # georeferencing of its own upper-left corner.
                    tile_meta = dict(
                        base_meta,
                        height=row_stop - row_start,
                        width=col_stop - col_start,
                        transform=src.window_transform(window),
                    )

                    out_filename = f"tile_{row}_{col}_{os.path.splitext(filename)[0]}.tif"
                    out_file_path = os.path.join(out_path, out_filename)
                    with rasterio.open(out_file_path, 'w', **tile_meta) as dst:
                        dst.write(tile_data)


def _tile_spans(length, tiles_size, stride):
    """Return the (start, stop) pixel span of every tile along one image axis."""
    count = max(1, math.ceil((length - tiles_size) / stride + 1))
    spans = []
    for index in range(count):
        stop = min(index * stride + tiles_size, length)
        spans.append((max(stop - tiles_size, 0), stop))
    return spans

In [50]:
# Folder with the full-size input rasters that are passed to save_tiles().
# NOTE(review): absolute, machine-specific path.
data = "/mnt/hdd2/mdsamiul/project/rice_crop_segmentation/data/dataset-nsr-1-bt/input/"

In [51]:
# Tile every raster in `data` with the default tile size and stride; the tiles
# are written below root_dir.
save_tiles(data,root_dir/"data/dataset-nsr-1/input/")

shape of the image before tiles : (3391, 3260)
number of tiles=9
..................................................
shape of the image before tiles : (3406, 3286)
number of tiles=9
..................................................
shape of the image before tiles : (3379, 3286)
number of tiles=9
..................................................
shape of the image before tiles : (1962, 2769)
number of tiles=2
..................................................
shape of the image before tiles : (3406, 3286)
number of tiles=9
..................................................
shape of the image before tiles : (3391, 3260)
number of tiles=9
..................................................
shape of the image before tiles : (1962, 2769)
number of tiles=2
..................................................
shape of the image before tiles : (3379, 3286)
number of tiles=9
..................................................


In [None]:
def rename_files(datapath):
    """
    Summary:
        rename the files of a folder from "<PREFIX>_<id>.tif" to the
        "<id><tag>.tif" scheme:
            DEM_<id>.tif -> <id>_nasadem.tif
            VV_<id>.tif  -> <id>_vv.tif
            VH_<id>.tif  -> <id>_vh.tif
            GT_<id>.tif  -> <id>.tif
    Arguments:
        datapath (str): folder whose entries are renamed in place
    Return:
        None; prints one line per renamed file
    Raises:
        ValueError: if any entry starts with none of the known prefixes. The
                    check runs before anything is renamed, so the folder is
                    left untouched in that case.
    """
    # prefix -> tag inserted in front of the ".tif" extension
    tag_for_prefix = {"DEM_": "_nasadem", "VV_": "_vv", "VH_": "_vh", "GT_": ""}

    # First pass: work out every new name without touching the disk.
    planned = []
    for filename in os.listdir(datapath):
        for prefix, tag in tag_for_prefix.items():
            if filename.startswith(prefix):
                break
        else:
            raise ValueError(f"files_name_mismatch: {filename}")

        # Strip only the leading prefix and tag only the trailing extension;
        # identical text elsewhere in the name is left alone.
        new_filename = filename[len(prefix):]
        if tag and new_filename.endswith(".tif"):
            new_filename = new_filename[: -len(".tif")] + tag + ".tif"
        planned.append((filename, new_filename))

    # Second pass: every name is valid, so rename.
    for filename, new_filename in planned:
        os.rename(os.path.join(datapath, filename), os.path.join(datapath, new_filename))
        print(f"Renamed {filename} to {new_filename}")


In [None]:
# Rename every file of the configured dataset folder in place.
datapath = config.dataset_dir
rename_files(datapath)

In [None]:
# Run the statistics script and capture what it prints
command = "python visualization.py"
result = subprocess.run(command, shell=True, capture_output=True, text=True)

# Surface a failed run instead of silently saving an empty or partial file.
if result.returncode != 0:
    print(f"'{command}' exited with code {result.returncode}")
    print(result.stderr)

# Get the terminal output
terminal_output = result.stdout

# Save the output to a file. The RTF wrapper lines are disabled, so the file
# holds plain text despite its .rtf extension.
rtf_filename = "data_statistics.rtf"
with open(rtf_filename, "w") as rtf_file:
    # rtf_file.write("{\\rtf1\\ansi\n")
    rtf_file.write(terminal_output)
    # rtf_file.write("}")

print(f"Terminal output saved to {rtf_filename}")

Terminal output saved to data_statistics.rtf
