# Step-2 Notebook for processing raw IFCB data, and adding 5 features viz. height, width and average channel colors as feature data corresponding to each image. 

In [1]:
import os
import glob
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import sys
from os import listdir, path
from pathlib import Path
from typing import List, Tuple
import matplotlib.pyplot as plt
import glob
import pickle
from tqdm import tqdm

In [2]:
sys.executable

'/opt/conda/bin/python'

In [3]:
os.getcwd()

'/lustre_scratch/prabgithub/test_folder/project_code1'

In [4]:
os.chdir("./")

In [5]:
for rt,dr,fl in os.walk('./raw_ifcb_data/',topdown=True):
    if not fl:
        print(rt,'-->',len(fl))

./raw_ifcb_data/ --> 0


In [6]:
for i in glob.glob("./raw_ifcb_data/*"):
    print(os.path.basename(i))

Acantharian
Alexandrium
Acanthoica_quattrospina
Amphorelloides_tropidoneis
Amphidinium_crassipes
Askenasia
Asterompalus_flabellatus


In [7]:
ifcb_cat=[]
for i in glob.glob("./raw_ifcb_data/*"):
    if os.path.isdir(i) and (os.path.basename(i) != '.ipynb_checkpoints') :
        if len(os.listdir(i))==0:
            print('empty folder -->',i)
        else: 
            # print(i,'filled folder')
            ifcb_cat.append(os.path.basename(i))
    else: print(i)

In [8]:
ifcb_cat

['Acantharian',
 'Alexandrium',
 'Acanthoica_quattrospina',
 'Amphorelloides_tropidoneis',
 'Amphidinium_crassipes',
 'Askenasia',
 'Asterompalus_flabellatus']

### Feature extraction and stroing images data as numpy database in corresponding phytoplankton directory in "ifcb_processed_1" directory

In [9]:
processed_ifcb_images_folder='./ifcb_processed_1'

In [10]:
for m,i in enumerate(np.sort(ifcb_cat)):
    df = pd.DataFrame(columns=['height','width','B','G','R'])
    images_data=[]
    os.makedirs(os.path.join(processed_ifcb_images_folder,f'./{i}'),exist_ok=True)
    save_path=os.path.join(processed_ifcb_images_folder,f'./{i}')
    for k,j in enumerate(glob.glob(f'./raw_ifcb_data/{i}/*')):
        try:
            img=cv2.imread(j)
            img=img/255
            mean_color = cv2.mean(img)
            # mean_color is a tuple of the form (B, G, R, A)
            # If the image has no alpha channel, the A value will be 0
            average_color = mean_color[:3] 
            df.loc[k,['height','width']]=img.shape[:2]
            df.loc[k,['B','G','R']]=average_color
            images_data.append(img)
        except Exception as e:
            print(f'problem with_{k}_of_{j}_of_{i}_error_{e}')
            print('#'*40)
    try:
        images_data_object = np.empty(len(images_data), dtype=object)
        for l in range(len(images_data)):
            images_data_object[l] = images_data[l]
        np.save(os.path.join(save_path,f'{i}.npy'),images_data_object)
        # df['_target']=m
        print(len(images_data),df.shape)
        df.to_csv(os.path.join(save_path,f'{i}.csv'),index=False)
        print(f'processed_{i} with target_id_{m}')
        print('*'*40)
    except Exception as e:
            print(f'problem with_{k}_of_{j}_of_{i}_error_{e}')
            print('#'*40)
        

2 (2, 5)
processed_Acantharian with target_id_0
****************************************
59 (59, 5)
processed_Acanthoica_quattrospina with target_id_1
****************************************
21 (21, 5)
processed_Alexandrium with target_id_2
****************************************
4 (4, 5)
processed_Amphidinium_crassipes with target_id_3
****************************************
62 (62, 5)
processed_Amphorelloides_tropidoneis with target_id_4
****************************************
10 (10, 5)
processed_Askenasia with target_id_5
****************************************
2 (2, 5)
processed_Asterompalus_flabellatus with target_id_6
****************************************


### Resizing and merging all the phytoplankton data into single dataframe and numpy objects and saving to "ifcb_merged_data" directory

In [11]:
from collections import defaultdict
class_dict: defaultdict = defaultdict(dict)
dataframes: List = []
all_images: List = []
desired_image_size = (64,101)

for idx, directory in enumerate(glob.glob("./ifcb_processed_1/*")):
    resized_images=[]
    # our class_dict will save human readable name to integer key
    class_dict[idx] = os.path.basename(directory)
    # read the df and add target variable equal to idx
    df = pd.read_csv(glob.glob(path.join(directory,"*.csv"))[0])
    df["_target"]=idx
    print(df.shape)
    # resize all images to same size that corrospond to each row in df above.
    images = np.load(glob.glob(path.join(directory,"*.npy"))[0],allow_pickle=True)
    for im in images:
        (h, w) = im.shape[:2]
        if w > h:
            im = cv2.rotate(im, cv2.ROTATE_90_CLOCKWISE)
            im = cv2.resize(im, desired_image_size)
        else:
            im = cv2.resize(im, desired_image_size)
        resized_images.append(im)
        
    print(len(resized_images))
    dataframes.append(df)
    len_res = len(resized_images)
    resized_images=np.array(resized_images)
    all_images.append(resized_images)
    if df.shape[0]==len_res:
        print('unique-->',np.unique(df["_target"]))
        print('resized_shape-->',resized_images.shape)
        print('*'*20,os.path.basename(directory),'idx-->',idx,'*'*20)
    else:
        print('failed',idx)

(2, 6)
2
unique--> [0]
resized_shape--> (2, 101, 64, 3)
******************** Acantharian idx--> 0 ********************
(21, 6)
21
unique--> [1]
resized_shape--> (21, 101, 64, 3)
******************** Alexandrium idx--> 1 ********************
(59, 6)
59
unique--> [2]
resized_shape--> (59, 101, 64, 3)
******************** Acanthoica_quattrospina idx--> 2 ********************
(62, 6)
62
unique--> [3]
resized_shape--> (62, 101, 64, 3)
******************** Amphorelloides_tropidoneis idx--> 3 ********************
(4, 6)
4
unique--> [4]
resized_shape--> (4, 101, 64, 3)
******************** Amphidinium_crassipes idx--> 4 ********************
(10, 6)
10
unique--> [5]
resized_shape--> (10, 101, 64, 3)
******************** Askenasia idx--> 5 ********************
(2, 6)
2
unique--> [6]
resized_shape--> (2, 101, 64, 3)
******************** Asterompalus_flabellatus idx--> 6 ********************


In [12]:
# merge all the dataframe and images into single dataframe and numpy object respectively
path_to_merged = './ifcb_merged_data/'
os.makedirs('./ifcb_merged_data/',exist_ok=True)
df: pd.DataFrame = pd.concat(dataframes, sort=True)
df.to_csv(path.join(path_to_merged, 'merged_df.csv'))

all_images: np.array = np.vstack(all_images)
np.save(path.join(path_to_merged, 'merged_images.npy'),all_images)
np.save(path.join(path_to_merged,'class_dict.npy'),class_dict)

In [13]:
df=pd.read_csv("./ifcb_merged_data/merged_df.csv")

In [14]:
all_images=np.load("./ifcb_merged_data/merged_images.npy")

In [15]:
class_dict=np.load("./ifcb_merged_data/class_dict.npy",allow_pickle=True)

In [16]:
df.shape,all_images.shape

((160, 7), (160, 101, 64, 3))

In [17]:
df.columns

Index(['Unnamed: 0', 'B', 'G', 'R', '_target', 'height', 'width'], dtype='object')

### splitting the merged data into traning, validation amd test data and saving to "ifcb_split_data" directory

In [18]:
from data_preprocessing import prepare_training_data

df: pd.DataFrame = df.drop(['Unnamed: 0'],axis=1)
# drop features that are no longer needed for training
print(f'total columns after modifying df -- {len(df.columns)}')

min_samples=10

''' a custom standard scaler is called from utils inside prepare_training_data and 
    rescaler is stored at path below using joblib.dumb'''
'''dump(rescaler, './ifcb_split_data/std_scaler.bin', compress=True)'''

path_to_store_std_scaler = './ifcb_split_data/std_scaler.bin'
os.makedirs('./ifcb_split_data/', exist_ok= True)

trainAttrX, valAttrX, testAttrX, trainImagesX, \
    valImagesX, testImagesX, y_train, y_val, y_test = \
    prepare_training_data(df, all_images, min_samples, path_to_store_std_scaler)


total columns after modifying df -- 6


In [19]:
trainAttrX.shape, valAttrX.shape, testAttrX.shape, trainImagesX.shape, \
    valImagesX.shape, testImagesX.shape, y_train.shape, y_val.shape, y_test.shape

((109, 5),
 (22, 5),
 (21, 5),
 (109, 101, 64, 3),
 (22, 101, 64, 3),
 (21, 101, 64, 3),
 (109, 7),
 (22, 7),
 (21, 7))

In [20]:
# save the class dictionary, for later use,
np.save(path.join('./ifcb_split_data', 'class_dict.npy'),
        class_dict,allow_pickle=True)

processed_training_data_path=path.join("./ifcb_split_data", "plankton_data_101x64_final.pkl")
# finally save the training data for future use
with open(processed_training_data_path, "wb") as f:
    pickle.dump((trainAttrX, valAttrX, testAttrX, trainImagesX,
                 valImagesX, testImagesX, y_train, y_val, y_test),
                f, protocol=4)