<a href="https://colab.research.google.com/github/silvigeo/phytoPlanktonCNN/blob/main/Phytoplankton_classification_with_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Image classification of phytoplankton with CNN

---

Objectives:

Perform EDA:

- Check for underrepresented classes

Image preprocessing

- Generate more images using data augmentation for the underrepresented classes (e.g. skewing, zooming, rotating)
- Ensure all images have the same size


---

# Step 1. Import Libraries

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from google.colab import drive
import zipfile
import os
from sklearn.model_selection import train_test_split
import shutil
import pandas as pd

# Step 2. Load and preprocess the data

We first need to split the data into train and test subsets. Before doing so, we extract the archive and check how many images each class contains.

In [3]:
# Unpack the labelled image archive into /tmp.
# NOTE(review): the archive lives under /content/drive, which only exists once Google Drive
# is mounted; no mount call is visible in this part of the notebook — confirm it happens earlier.
# The `with` block closes the archive handle even if extraction raises part-way through.
with zipfile.ZipFile("/content/drive/MyDrive/Colab Notebooks/TensorFlow project - Phytoplankton image classification/phytoplankton_labeled.zip", 'r') as zip_ref:
    zip_ref.extractall("/tmp")

In [4]:


# Root folder of the labelled images (presumably created by extracting the archive into /tmp).
# Listing it shows the entries directly inside the root; os.listdir returns them in arbitrary order.
dataset_path = '/tmp/labeled_20201020'
os.listdir(dataset_path)


['Beads',
 'Gonyaulax_verior',
 'Chroococcus_small',
 'Centrales_sp',
 'Prorocentrum_cordatum',
 'Gymnodinium_like',
 'Thalassiosira_levanderi',
 'Pauliella_taeniata',
 'Uroglenopsis_sp',
 'Peridiniella_catenata_chain',
 'Dinophyceae',
 'Mesodinium_rubrum',
 'Ciliata',
 'Monoraphidium_contortum',
 'Merismopedia_sp',
 'Cryptophyceae-Teleaulax',
 'Nitzschia_paleacea',
 'Skeletonema_marinoi',
 'Oocystis_sp',
 'Katablepharis_remigera',
 'Amylax_triacantha',
 'Dolichospermum-Anabaenopsis-coiled',
 'Dinophysis_acuminata',
 'Pyramimonas_sp',
 'Gymnodiniales',
 'Peridiniella_catenata_single',
 'Cyclotella_choctawhatcheeana',
 'Chroococcales',
 'Ceratoneis_closterium',
 'Melosira_arctica',
 'Nodularia_spumigena',
 'Aphanizomenon_flosaquae',
 'Cymbomonas_tetramitiformis',
 'Cryptomonadales',
 'Dolichospermum-Anabaenopsis',
 'Heterocapsa_triquetra',
 'Chlorococcales',
 'Pennales_sp_thin',
 'Euglenophyceae',
 'Oscillatoriales',
 'Pseudopedinella_sp',
 'Aphanothece_paralleliformis',
 'Heterocapsa_r

In [6]:
def count_files_in_subdirectories(directory_path):
    """Count the files that sit directly inside each subdirectory of a tree.

    The tree rooted at ``directory_path`` is walked recursively. For every
    subdirectory found, only the entries directly inside it that are files
    (per ``os.path.isfile``) are counted; files in deeper folders are
    attributed to those deeper folders, not to their ancestors.

    Args:
        directory_path: Root directory to scan. Files located directly in the
            root itself are not counted.

    Returns:
        dict: Maps each subdirectory, identified by its path relative to
        ``directory_path``, to its file count. Immediate subdirectories are
        therefore keyed by their plain name, while nested ones are keyed by
        a relative path such as ``"parent/child"`` so that two folders sharing
        a basename no longer overwrite each other. The dict is empty when the
        root has no subdirectories or cannot be walked (``os.walk`` ignores
        listing errors by default).
    """
    result_dict = {}

    for root, dirs, _files in os.walk(directory_path):
        for dir_name in dirs:
            dir_path = os.path.join(root, dir_name)
            # Key by the path relative to the scanned root rather than the bare
            # folder name: identical for top-level folders, unique for nested ones.
            relative_key = os.path.relpath(dir_path, directory_path)
            # Summing booleans counts the entries that are files without
            # building a throwaway list.
            result_dict[relative_key] = sum(
                os.path.isfile(os.path.join(dir_path, entry))
                for entry in os.listdir(dir_path)
            )

    return result_dict

# Map every subdirectory under the dataset root to the number of files it holds;
# the bare name on the last line displays the mapping as the cell output.
sample_count_dict = count_files_in_subdirectories(dataset_path)
sample_count_dict

{'Beads': 125,
 'Gonyaulax_verior': 22,
 'Chroococcus_small': 827,
 'Centrales_sp': 480,
 'Prorocentrum_cordatum': 276,
 'Gymnodinium_like': 158,
 'Thalassiosira_levanderi': 2537,
 'Pauliella_taeniata': 119,
 'Uroglenopsis_sp': 516,
 'Peridiniella_catenata_chain': 193,
 'Dinophyceae': 1433,
 'Mesodinium_rubrum': 1132,
 'Ciliata': 243,
 'Monoraphidium_contortum': 327,
 'Merismopedia_sp': 98,
 'Cryptophyceae-Teleaulax': 6830,
 'Nitzschia_paleacea': 65,
 'Skeletonema_marinoi': 4128,
 'Oocystis_sp': 842,
 'Katablepharis_remigera': 54,
 'Amylax_triacantha': 19,
 'Dolichospermum-Anabaenopsis-coiled': 2504,
 'Dinophysis_acuminata': 217,
 'Pyramimonas_sp': 1224,
 'Gymnodiniales': 69,
 'Peridiniella_catenata_single': 899,
 'Cyclotella_choctawhatcheeana': 102,
 'Chroococcales': 142,
 'Ceratoneis_closterium': 45,
 'Melosira_arctica': 43,
 'Nodularia_spumigena': 169,
 'Aphanizomenon_flosaquae': 6989,
 'Cymbomonas_tetramitiformis': 199,
 'Cryptomonadales': 713,
 'Dolichospermum-Anabaenopsis': 12280

In [9]:
# Tabulate the counts: each (name, file count) pair from the dict becomes one row,
# with the name in 'categories' and the count in 'values'.
phyto_class_info_df = pd.DataFrame(
    list(sample_count_dict.items()),
    columns=['categories', 'values'],
)
# Preview the first rows as the cell output.
phyto_class_info_df.head()

Unnamed: 0,categories,values
0,Beads,125
1,Gonyaulax_verior,22
2,Chroococcus_small,827
3,Centrales_sp,480
4,Prorocentrum_cordatum,276
