# Step-1 Notebook for processing raw flowcam data, visualising no. of samples vs flowcam phytoplankton categories and visualising common categories between Flowcam and IFCB datasets

In [1]:
import os
import glob
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import sys
from os import listdir, path
from pathlib import Path
from typing import List, Tuple
import matplotlib.pyplot as plt
import glob
import pickle
from tqdm import tqdm

In [2]:
os.chdir("./")

In [4]:
# listdir('./')

In [5]:
# for i in os.listdir("./"):
#     if os.path.isdir(i):
#         print(i)

## Defining the function process_flowcam which parses the raw flowcam files and stores them in "flocam_processed_1" directory

In [6]:
def process_flowcam(fc_dir,target_dir,resize= False):
    images_list=[]
    flb_path = fc_dir + "/*.flb"
    # make the individual category directory 
    for i in tqdm(glob.glob(flb_path),leave=True):
        dir_name=i.split('.')[1].split('/')[2].strip()
        dir_path_obj=Path(f'{target_dir}/{dir_name}')
        if not dir_path_obj.exists():
            # Create the directory (including any necessary parent directories)
            dir_path_obj.mkdir(parents=True, exist_ok=True)
            try:
                with open(i,'r') as fh:
                    data=fh.readlines()
                col_names=[]
                col_values=[]
                for j in data[2:]:
                    if j.count('|')==1:
                        a=j.split('|')[0]
                        col_names.append(a)
                    else:
                        b=j.split('|')
                        col_values.append(b)

                data_pd=pd.DataFrame(col_values,columns=col_names)
                # Print the current progress
                print(f"{dir_name} processed {data_pd.shape[:2]}")
                images_list.append(f"{dir_name} {data_pd.shape[:2]}")
            except Exception as e:
                print(f'error occured in dataframe creation {e}')

        #images parsing from the above dataframe
            images_data: List = []
            try:
                for index, row in data_pd.iterrows():
                    image_collage_path = path.join(
                        fc_dir, row['collage_file'])
                    im = cv2.imread(image_collage_path)
                    # Clipping the corresponding image from collage file
                    im = im[
                        int(row['image_y']):int(row['image_y'])+int(row['image_h']),
                        int(row['image_x']):int(row['image_x'])+int(row['image_w'])]
                    if resize:
                        # transform images to portrait orientation
                        # if width > height
                        (h, w) = im.shape[:2]
                        if w > h:
                            im = cv2.rotate(im, cv2.ROTATE_90_CLOCKWISE)
                        im = cv2.resize(im, desired_image_size)
                    # normalize the image pixel values between 0 to 255 
                    im = im / 255
                    images_data.append(im)
                images_data: np.array = np.array(images_data, dtype=object)
            except Exception as e:
                print(f"error in image_creation {dir_name}_{e}")

        #save the data
            try:
                data_pd.to_csv(f'{target_dir}/{dir_name}/{dir_name}_pd.csv',index=False)
                np.save(f'{target_dir}/{dir_name}/{dir_name}_im.npy',images_data)

            except Exception as e:
                print(f"error in storing data {e}")
        else:
            print(f"Directory {dir_path_obj} already exists. Skipping data creation.")
            pass
    return images_list

In [7]:
fc_dir="./raw_flowcam_data"
target_dir="./flowcam_processed_1"
images_list=process_flowcam(fc_dir,target_dir,resize= False)

  0%|          | 0/4 [00:00<?, ?it/s]

Euterpina_copepod processed (44, 65)


 25%|██▌       | 1/4 [00:02<00:08,  2.89s/it]

Eutintinnus_tintinnid processed (58, 65)


 50%|█████     | 2/4 [00:03<00:03,  1.54s/it]

Echinoderm_larvae processed (16, 65)


 75%|███████▌  | 3/4 [00:03<00:01,  1.05s/it]

Empty_lorica processed (46, 65)


100%|██████████| 4/4 [00:05<00:00,  1.25s/it]


### Resizing and merging all the phytoplankton data into single dataframe and numpy objects and saving to "flowcam_merged_data" directory

In [8]:
from collections import defaultdict
class_dict: defaultdict = defaultdict(dict)
dataframes: List = []
all_images: List = []
desired_image_size = (64,101)

for idx, directory in enumerate(glob.glob("./flowcam_processed_1/*")):
    resized_images=[]
    # our class_dict will save human readable name to integer key
    class_dict[idx] = os.path.basename(directory)
    # read the df and add target variable equal to idx
    df = pd.read_csv(glob.glob(path.join(directory,"*.csv"))[0])
    df["_target"]=idx
    print(df.shape)
    # resize all images to same size that corrospond to each row in df above.
    images = np.load(glob.glob(path.join(directory,"*.npy"))[0],allow_pickle=True)
    for im in images:
        (h, w) = im.shape[:2]
        if w > h:
            im = cv2.rotate(im, cv2.ROTATE_90_CLOCKWISE)
            im = cv2.resize(im, desired_image_size)
        else:
            im = cv2.resize(im, desired_image_size)
        resized_images.append(im)
        
    print(len(resized_images))
    dataframes.append(df)
    len_res = len(resized_images)
    resized_images=np.array(resized_images)
    all_images.append(resized_images)
    if df.shape[0]==len_res:
        print('unique-->',np.unique(df["_target"]))
        print('resized_shape-->',resized_images.shape)
        print('*'*20,os.path.basename(directory),'idx-->',idx,'*'*20)
    else:
        print('failed',idx)

(44, 66)
44
unique--> [0]
resized_shape--> (44, 101, 64, 3)
******************** Euterpina_copepod idx--> 0 ********************
(46, 66)
46
unique--> [1]
resized_shape--> (46, 101, 64, 3)
******************** Empty_lorica idx--> 1 ********************
(16, 66)
16
unique--> [2]
resized_shape--> (16, 101, 64, 3)
******************** Echinoderm_larvae idx--> 2 ********************
(58, 66)
58
unique--> [3]
resized_shape--> (58, 101, 64, 3)
******************** Eutintinnus_tintinnid idx--> 3 ********************


In [9]:
# merge all the dataframe and images into single dataframe and numpy object respectively
path_to_merged = './flowcam_merged_data/'
os.makedirs('./flowcam_merged_data/',exist_ok=True)
df: pd.DataFrame = pd.concat(dataframes, sort=True)
df.to_csv(path.join(path_to_merged, 'merged_df.csv'))

all_images: np.array = np.vstack(all_images)
np.save(path.join(path_to_merged, 'merged_images.npy'),all_images)
np.save(path.join(path_to_merged,'class_dict.npy'),class_dict)

In [10]:
df=pd.read_csv("./flowcam_merged_data/merged_df.csv")

In [11]:
all_images=np.load("./flowcam_merged_data/merged_images.npy")

In [12]:
class_dict=np.load("./flowcam_merged_data/class_dict.npy",allow_pickle=True)

In [13]:
df.shape,all_images.shape

((164, 67), (164, 101, 64, 3))

In [14]:
df.columns

Index(['Unnamed: 0', '_target', 'abd_area', 'abd_diameter', 'avg_blue',
       'avg_green', 'avg_red', 'cal_const', 'cal_image', 'camera', 'ch1_area',
       'ch1_peak', 'ch1_width', 'ch2_area', 'ch2_peak', 'ch2_width',
       'ch3_area', 'ch3_peak', 'ch3_width', 'circle_fit', 'circularity_hu',
       'collage_file', 'compactness', 'convex_perimeter', 'edge_gradient',
       'elapsed_time', 'elongation', 'esd_diameter', 'fd_diameter',
       'feret_max_angle', 'feret_min_angle', 'filled_area', 'fringe_size',
       'id', 'image_h', 'image_id', 'image_w', 'image_x', 'image_y',
       'intensity', 'intensity_calimage', 'length', 'perimeter', 'ppc',
       'raw_area', 'raw_convex_hull_area', 'raw_convex_perimeter',
       'raw_feret_max', 'raw_feret_mean', 'raw_feret_min', 'raw_filled_area',
       'raw_legendre_major', 'raw_legendre_minor', 'raw_perimeter',
       'raw_sphere_complement', 'raw_sphere_unknown', 'raw_sphere_volume',
       'roughness', 'sigma_intensity', 'sphere_count', 's

### feature engineering and splitting the merged data into traning, validation amd test data and saving to "flowcam_split_data" directory

In [15]:
from data_preprocessing import (drop_columns, prepare_training_data, process_attributes)
df: pd.DataFrame = df.drop(['Unnamed: 0'],axis=1)
df = process_attributes(df)
# drop features that are no longer needed for training
df = drop_columns(df)

print(f'total columns after modifying df -- {len(df.columns)}')

min_samples=14

''' a custom standard scaler is called from utils inside prepare_training_data and 
    rescaler is stored at path below using joblib.dumb'''
'''dump(rescaler, './flowcam_split_data/std_scaler.bin', compress=True)'''

path_to_store_std_scaler = './flowcam_split_data/std_scaler.bin'
os.makedirs('./flowcam_split_data/', exist_ok= True)

trainAttrX, valAttrX, testAttrX, trainImagesX, \
    valImagesX, testImagesX, y_train, y_val, y_test = \
    prepare_training_data(df, all_images, min_samples, path_to_store_std_scaler)


total columns after modifying df -- 44


In [16]:
trainAttrX.shape, valAttrX.shape, testAttrX.shape, trainImagesX.shape, \
    valImagesX.shape, testImagesX.shape, y_train.shape, y_val.shape, y_test.shape

((118, 43),
 (23, 43),
 (23, 43),
 (118, 101, 64, 3),
 (23, 101, 64, 3),
 (23, 101, 64, 3),
 (118, 4),
 (23, 4),
 (23, 4))

In [17]:
# save the class dictionary, for later use,
np.save(path.join('./flowcam_split_data', 'class_dict.npy'),
        class_dict,allow_pickle=True)

processed_training_data_path=path.join("./flowcam_split_data", "plankton_data_101x64_final.pkl")
# finally save the training data for future use
with open(processed_training_data_path, "wb") as f:
    pickle.dump((trainAttrX, valAttrX, testAttrX, trainImagesX,
                 valImagesX, testImagesX, y_train, y_val, y_test),
                f, protocol=4)