In [1]:
import os
import cv2
import json
import config
import pathlib
import math
import rasterio
from rasterio.windows import Window
import numpy as np
import pandas as pd
from config import *
import earthpy.plot as ep
import earthpy.spatial as es
from dataset import read_img
from matplotlib import pyplot as plt
import subprocess
import pyperclip

In [24]:
train_df = pd.read_csv(config.train_dir)
test_df =  pd.read_csv(config.test_dir)
valid_df = pd.read_csv(config.valid_dir)

p_train_json = config.p_train_dir
p_test_json = config.p_test_dir
p_valid_json = config.p_valid_dir

In [25]:
print(f"Total number of training images = {len(train_df)}")
print(f"Total number of test images = {len(test_df)}")
print(f"Total number of validation images = {len(valid_df)}")

Total number of training images = 80
Total number of test images = 10
Total number of validation images = 10


In [26]:
def class_balance_check(patchify, data_dir):
    """
    Summary:
        checking class percentage in full dataset
    Arguments:
        patchify (bool): TRUE if want to check class balance for patchify experiments
        data_dir (str): directory where data files are saved 
    Return:
        class percentage
    """
    if patchify:
        with open(data_dir, "r") as j:
            train_data = json.loads(j.read())
        labels = train_data["masks"]
        patch_idx = train_data["patch_idx"]

    # commented out by manik (as patchify is false, it's CFR or CFR_CB, so it's deprecated)
    else:
        train_data = pd.read_csv(data_dir)
        labels = train_data.masks.values
        patch_idx = None

    total = 0
    class_name = {}

    for i in range(len(labels)):
        with rasterio.open(labels[i]) as l:
            mask = l.read(1)
        mask[mask == 2] = 0
        # mask[mask < 105] = 0
        # mask[mask > 104] = 1
        if patchify:
            idx = patch_idx[i]
            mask = mask[idx[0] : idx[1], idx[2] : idx[3]]

        total_pix = mask.shape[0] * mask.shape[1]
        total += total_pix

        dic = {}
        keys = np.unique(mask)
        for i in keys:
            dic[i] = np.count_nonzero(mask == i)

        for key, value in dic.items():
            if key in class_name.keys():
                #problems
                class_name[key] = value + class_name[key]
            else:
                class_name[key] = value

    for key, val in class_name.items():
        class_name[key] = (val / total) * 100

    print("Class percentage:")
    for key, val in class_name.items():
        print("class pixel: {} = {}".format(key, val))
    print(f"unique value in the mask {class_name.keys()}")

In [27]:
print("class percentage of traning data before patch")
class_balance_check(patchify=False, data_dir=config.train_dir)
print(".........................................................................................")
print("class percentage of traning data after patch")
class_balance_check(patchify=True, data_dir=config.p_train_dir)

class percentage of traning data before patch


Class percentage:
class pixel: 0.0 = 92.04329490661621
class pixel: 1.0 = 7.95670509338379
unique value in the mask dict_keys([0.0, 1.0])
.........................................................................................
class percentage of traning data after patch
Class percentage:
class pixel: 0.0 = 92.06820170084636
class pixel: 1.0 = 7.9317982991536455
unique value in the mask dict_keys([0.0, 1.0])


In [28]:
def check_height_width(data_dir):
    """
    Summary:
        check unique hight and width of images from dataset
    Arguments:
        data_dir (str): path to csv file
    Return:
        print all the unique height and width
    """

    data = pd.read_csv(data_dir)
    # removing UU or UMM or UM
    # data = data[data['feature_ids'].str.contains('uu_00') == False]
    #problems
    # data = data[data["feature_ids"].str.contains("umm_00") == False]
    # data = data[data["feature_ids"].str.contains("um_00") == False]

    print("Number of Datasets:  ", data.shape[0])

    input_img = data.feature_ids.values
    input_mask = data.masks.values

    vv_img_shape = []
    vh_img_shape = []
    dem_img_shape = []
    mask_img_shape = []

    for i in range(len(data)):
        with rasterio.open((data.feature_ids.values[i]+"_vv.tif")) as vv:
            vv_shape = vv.shape
        if vv_shape not in vv_img_shape:
            vv_img_shape.append(vv_shape)
        with rasterio.open((data.feature_ids.values[i]+"_vh.tif")) as vh:
            vh_shape = vh.shape
        if vh_shape not in vh_img_shape:
            vh_img_shape.append(vh_shape)
        with rasterio.open((data.feature_ids.values[i]+"_nasadem.tif")) as dem:
            dem_shape = dem.shape
        if dem_shape not in dem_img_shape:
            dem_img_shape.append(dem_shape)
        with rasterio.open((data.masks.values[i])) as mask:
            mask_shape = mask.shape
        if mask_shape not in mask_img_shape:
            # print(mask)
            mask_img_shape.append(mask_shape)

    print(f"vv_img_shape: {vv_img_shape}")
    print(f"vh_img_shape: {vh_img_shape}")
    print(f"dem_img_shape: {dem_img_shape}")
    print(f"mask_img_shape: {mask_img_shape}")

In [29]:
print("Unique height and width of training dataset")
check_height_width(config.train_dir)
print(".........................................................................................")
print("Unique height and width of testing dataset")
check_height_width(config.test_dir)
print(".........................................................................................")
print("Unique height and width of validation dataset")
check_height_width(config.valid_dir)
print(".........................................................................................")
print("Unique height and width of evaluation dataset")
check_height_width(config.eval_dir)

Unique height and width of training dataset
Number of Datasets:   80


vv_img_shape: [(512, 512)]
vh_img_shape: [(512, 512)]
dem_img_shape: [(512, 512)]
mask_img_shape: [(512, 512)]
.........................................................................................
Unique height and width of testing dataset
Number of Datasets:   10
vv_img_shape: [(512, 512)]
vh_img_shape: [(512, 512)]
dem_img_shape: [(512, 512)]
mask_img_shape: [(512, 512)]
.........................................................................................
Unique height and width of validation dataset
Number of Datasets:   10
vv_img_shape: [(512, 512)]
vh_img_shape: [(512, 512)]
dem_img_shape: [(512, 512)]
mask_img_shape: [(512, 512)]
.........................................................................................
Unique height and width of evaluation dataset


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/hdd2/mdsamiul/project/imseg_sar_csml/data/csv/eval.csv'

In [14]:
check_height_width("/mnt/hdd2/mdsamiul/project/imseg_csml/data/csv/all_data.csv")


Number of Datasets:   21
<closed DatasetReader name='/mnt/hdd2/mdsamiul/project/dataset/rice_data_training/awc00.tif' mode='r'>
<closed DatasetReader name='/mnt/hdd2/mdsamiul/project/dataset/rice_data_training/2018_2019_w5000_h5000_id_4.tif' mode='r'>
vv_img_shape: [(1614, 2101), (5000, 5000)]
vh_img_shape: [(1614, 2101), (5000, 5000)]
dem_img_shape: [(1614, 2101), (5000, 5000)]
mask_img_shape: [(1490, 1939), (5000, 5000)]


In [18]:
def return_csv_from_path(csv_path=config.csv_logger_path/ "logs/ahmed/unet"):
    csv_list = []
    # Iterate through each subdirectory
    for folder in csv_path.iterdir():
        # Check if the entry is a directory
        if folder.is_dir():
            # Iterate through files in the subdirectory
            for file in folder.iterdir():
                # Check if the entry is a file
                if file.is_file():
                    csv_list.append(file)
    # print(csv_list)
    return csv_list
                    

def _plot_from_csv(csv_path, name, x_axis_name, y_axis_name, columns_to_plot=None, upto_epoch=None):
    pathlib.Path((config.root_dir /"logs" / "plots"/"metrics_plots")).mkdir(parents=True, exist_ok=True)
    df = pd.read_csv(csv_path)
    if upto_epoch is not None:
        df = df.head(upto_epoch)
        print(df.shape)
    epochs = df['epoch']
    if columns_to_plot is not None:
        columns_to_plot = columns_to_plot
    else:
        columns_to_plot = df.columns.to_list()[1:]

    plt.figure(figsize=(12, 8))
    for column in columns_to_plot:
        plt.plot(epochs, df[column], label=column, linewidth=3.0,
            marker="o",
            markersize=5)

    plt.title(f"{y_axis_name}_over_{x_axis_name}")
    plt.xlabel(x_axis_name)
    plt.ylabel(y_axis_name)
    plt.xticks(epochs.astype(int))
    plt.legend()
    plt.savefig(config.root_dir/"logs"/"plots"/"metrics_plots"/name)
    plt.show()

def plot_metrics_vs_epochs(csv_path, name, x_axis_name= "Epochs", y_axis_name="Metrics_score",columns_to_plot=None, upto_epoch=None):
    _plot_from_csv(csv_path=csv_path, name=name,x_axis_name=x_axis_name, y_axis_name=y_axis_name, columns_to_plot=columns_to_plot, upto_epoch=upto_epoch)

def plot_metric_vs_epochs_vs_models(metric_name="val_f1-score"):
    pathlib.Path((config.root_dir /"logs"/ "plots"/"csv_for_plotting")).mkdir(parents=True, exist_ok=True)
    csv_list = return_csv_from_path()
    result_df = pd.DataFrame()
    for csv_path in csv_list:
        df = pd.read_csv(csv_path)
        result_df[os.path.basename(csv_path)] = df[metric_name]
    result_df.index.name = "epoch"
    result_df.to_csv(os.path.join(config.root_dir/"logs"/"plots"/"csv_for_plotting"/f"{metric_name}_vs_epoch.csv"), encoding='utf-8',index=True, header=True)
    _plot_from_csv(config.root_dir/"logs"/"plots"/"csv_for_plotting"/f"{metric_name}_vs_epoch.csv", x_axis_name= "Epochs", y_axis_name=metric_name, name=metric_name)

In [24]:
plot_metrics_vs_epochs("/mnt/hdd2/mdsamiul/project/imseg_csml/logs/ahmed/unet/unet_ex_Band123_ep_1000_11-Nov-23.csv",'123',columns_to_plot=["f1_score","val_f1_score"],  upto_epoch=30)

(30, 11)


In [22]:
plot_metrics_vs_epochs("/mnt/hdd2/mdsamiul/project/imseg_csml/logs/ahmed/unet/unet_ex_Band123_ep_1000_11-Nov-23.csv",'123',  upto_epoch=50)

(50, 11)


In [4]:
config.csv_logger_path/ "logs/ahmed/unet"

PosixPath('/mnt/hdd2/mdsamiul/project/imseg_csml/logs/csv_logger/logs/ahmed/unet')

In [10]:
config.csv_log_dir

PosixPath('/mnt/hdd2/mdsamiul/project/imseg_csml/logs/csv_logger/fapnet')

In [11]:
plot_metrics_vs_epochs(config.csv_logger_path/"fapnet"/"fapnet_ex_2024-03-13_e_100_p_256_s_128_ep_100.csv",name='metrics')
plot_metrics_vs_epochs(config.csv_logger_path/"fapnet"/"fapnet_ex_2024-03-13_e_100_p_256_s_128_ep_100.csv",name='metrics',columns_to_plot=["f1-score"])
plot_metric_vs_epochs_vs_models()
plot_metric_vs_epochs_vs_models(metric_name="recall")

In [30]:
def display_all(data,name):
    """
    Summary:
        save all images into single figure
    Arguments:
        data : data file holding images path
        directory (str) : path to save images
    Return:
        save images figure into directory
    """
    
    pathlib.Path((visualization_dir / "display")).mkdir(parents=True, exist_ok=True)
    pathlib.Path((visualization_dir / "display"/"train")).mkdir(parents=True, exist_ok=True)
    pathlib.Path((visualization_dir / "display"/"test")).mkdir(parents=True, exist_ok=True)
    pathlib.Path((visualization_dir / "display"/"valid")).mkdir(parents=True, exist_ok=True)

    for i in range(len(data)):
        with rasterio.open((data.feature_ids.values[i]+"_vv.tif")) as vv:
            vv_img = vv.read(1)
        with rasterio.open((data.feature_ids.values[i]+"_vh.tif")) as vh:
            vh_img = vh.read(1)
        with rasterio.open((data.feature_ids.values[i]+"_nasadem.tif")) as dem:
            dem_img = dem.read(1)
        with rasterio.open((data.masks.values[i])) as l:
            lp_img = l.read(1)
            lp_img[lp_img==2]=0
        id = data.feature_ids.values[i].split("/")[-1]
        display_list = {
                     "vv":vv_img,
                     "vh":vh_img,
                     "dem":dem_img,
                     "label":lp_img}


        plt.figure(figsize=(12, 8))
        title = list(display_list.keys())

        for i in range(len(display_list)):
            plt.subplot(1, len(display_list), i+1)
            
            # plot dem channel using earthpy
            if title[i]=="dem":
                ax = plt.gca()
                hillshade = es.hillshade(display_list[title[i]], azimuth=180)
                ep.plot_bands(
                    display_list[title[i]],
                    cbar=False,
                    cmap="terrain",
                    title=title[i],
                    ax=ax
                )
                ax.imshow(hillshade, cmap="Greys", alpha=0.5)
            
            # gray image plot vv and vh channels
            elif title[i]=="vv" or title[i]=="vh":
                plt.title(title[i])
                plt.imshow((display_list[title[i]]), cmap="gray")
                plt.axis('off')
                
            # gray label plot
            elif title[i]=="label":
                plt.title(title[i])
                plt.imshow((display_list[title[i]]), cmap="gray")
                plt.axis('off')
                
            # rgb plot
            else:
                plt.title(title[i])
                plt.imshow((display_list[title[i]]))
                plt.axis('off')

        prediction_name = "img_id_{}.png".format(id) # create file name to save
        plt.savefig(os.path.join((config.visualization_dir / 'display'/ name), prediction_name), bbox_inches='tight', dpi=800)
        plt.clf()
        plt.cla()
        plt.close()

In [32]:
print("displaying training images and masks")
display_all(data=train_df,name="train")

displaying training images and masks


In [33]:
print("displaying testing images and masks")
display_all(data=test_df,name="test")

displaying testing images and masks


In [34]:
print("displaying validation images and masks")
display_all(data=valid_df,name="valid")

displaying validation images and masks


In [35]:
eval_csv = pd.read_csv("/mnt/hdd2/mdsamiul/project/imseg_csml/data/csv/train.csv")
masks = eval_csv["feature_ids"].to_list()
ext = ["_vv.tif","_vh.tif","_nasadem.tif"]
masks = masks[0]
masks= [masks+ex for ex in ext]
print(masks)
for p in masks:
    with rasterio.open(p) as im:
        image = im.read(1)
    print("...............................")
    # print(p)
    # print(np.unique(image, return_counts=False))
    print(np.mean(image))
    print(np.std(image))
    print("...............................")

['/mnt/hdd2/mdsamiul/project/dataset/rice_data_training/2018_2019_w5000_h5000_id_4_vv.tif', '/mnt/hdd2/mdsamiul/project/dataset/rice_data_training/2018_2019_w5000_h5000_id_4_vh.tif', '/mnt/hdd2/mdsamiul/project/dataset/rice_data_training/2018_2019_w5000_h5000_id_4_nasadem.tif']


...............................
-12.204613877277495
4.0422273221629785
...............................
...............................
-18.96970667660666
4.367189151420688
...............................
...............................
812.64422736
425.41858324129265
...............................


In [21]:
def save_tiles(path, out_path, tiles_size=512, stride=512):
    os.makedirs(out_path, exist_ok=True)
    
    # Iterate over each file in the path
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        with rasterio.open(file_path) as src:
            # Get metadata and calculate number of tiles in each dimension
            meta = src.meta
            meta["height"]= tiles_size
            meta["width"]= tiles_size
            # print(meta)
            height, width = src.shape
            num_rows = math.ceil((height - tiles_size) / stride + 1)
            num_cols = math.ceil((width - tiles_size) / stride + 1)
            total_tiles = num_rows* num_cols
            print(f"shape of the image before tiles : {src.shape}")
            print(f"number of tiles={total_tiles}")
            print("..................................................")
            # Iterate over each tile
            for row in range(num_rows):
                for col in range(num_cols):
                    # Calculate window coordinates
                    row_start = row * stride
                    row_stop = min(row_start + tiles_size, height)
                    col_start = col * stride
                    col_stop = min(col_start + tiles_size, width)
                    
                    # Read the tile data
                    # window = Window(x0, y0, x1 - x0, y1 - y0)
                    window = Window.from_slices((row_stop-stride, row_stop), (col_stop-stride, col_stop))
                    tile_data = src.read(window=window)
                    # print("...........")
                    # print(tile_data.shape)
                    # Save the tile with a suffix of tile id
                    # out_filename = f"{os.path.splitext(filename)[0]}_tile_{row}_{col}.tif"
                    out_filename = f"tile_{row}_{col}_{os.path.splitext(filename)[0]}.tif"
                    out_file_path = os.path.join(out_path, out_filename)
                    with rasterio.open(out_file_path, 'w', **meta) as dst:
                        dst.write(tile_data)


In [18]:
import math
math.ceil((5000 - 512) / 256 + 1)

19

In [22]:
dataset_dir = Path("/mnt/hdd2/mdsamiul/project/dataset/test/")

In [23]:
save_tiles(dataset_dir,config.root_dir/"tiles")

shape of the image before tiles : (5000, 5000)
number of tiles=100
..................................................


shape of the image before tiles : (5000, 5000)
number of tiles=100
..................................................
shape of the image before tiles : (5000, 5000)
number of tiles=100
..................................................
shape of the image before tiles : (5000, 5000)
number of tiles=100
..................................................


In [None]:
config.dataset_dir

PosixPath('/mnt/hdd2/mdsamiul/project/dataset/rice_data_training')

In [None]:
def rename_files(datapath):
    # List all files in the directory
    files = os.listdir(datapath)
    
    for filename in files:
        # Extract the file extension
        _, ext = os.path.splitext(filename)
        
        # Check if the filename starts with DEM_ab.tif
        if filename.startswith("DEM_"):
            new_filename = filename.replace("DEM_", "").replace(".tif", "_nasadem.tif")
        
        # Check if the filename starts with VV_ab.tif
        elif filename.startswith("VV_"):
            new_filename = filename.replace("VV_", "").replace(".tif", "_vv.tif")
        
        # Check if the filename starts with VH_ab.tif
        elif filename.startswith("VH_"):
            new_filename = filename.replace("VH_", "").replace(".tif", "_vh.tif")
        
        # Check if the filename starts with GT_ab.tif
        elif filename.startswith("GT_"):
            new_filename = filename.replace("GT_", "")
        
        else:
            # If none of the conditions are met, skip this file
            raise ValueError("files_name_mismatch")
        
        # Construct the new filepath
        new_filepath = os.path.join(datapath, new_filename)
        
        # Rename the file
        os.rename(os.path.join(datapath, filename), new_filepath)
        print(f"Renamed {filename} to {new_filename}")


In [None]:
datapath = config.dataset_dir
rename_files(datapath)

In [None]:
# Run the command in the terminal
command = "python visualization.py"
result = subprocess.run(command, shell=True, capture_output=True, text=True)

# Get the terminal output
terminal_output = result.stdout

# Save the output to an RTF file
rtf_filename = "data_statistics.rtf"
with open(rtf_filename, "w") as rtf_file:
    # rtf_file.write("{\\rtf1\\ansi\n")
    rtf_file.write(terminal_output)
    # rtf_file.write("}")

print(f"Terminal output saved to {rtf_filename}")

Terminal output saved to data_statistics.rtf
