In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the merged dataframe, which was created beforehand by merging all CSV files.
# NOTE(review): this is an absolute, machine-specific path; point DATA_PATH at
# your local copy (ideally relative to a project data directory) before running.
DATA_PATH = r'C:\Users\sprin\Downloads\SIMC_OverlapTiffsWithPP\SIMC_OverlapTiffsWithPP\SIMC.Overlap.csv\merged_data.csv'
df = pd.read_csv(DATA_PATH)

# Columns used as model inputs.
feature_cols = [
    "Area..ABD.", "Area..Filled.", "Width", "Length", "Volume..ABD.", "Volume..ESD.", 
    "Diameter..ABD.", "Diameter..ESD.", "Feret.Angle.Max", "Feret.Angle.Min", "Transparency", 
    "Sum.Intensity", "Intensity", "Sigma.Intensity", "Edge.Gradient"
]
# Column holding the label to predict.
target_col = "Class"

# Classes kept as their own label. The strings must match the values stored in
# the target column exactly, so do not "correct" their spelling here.
selected_classes = ["Calanoid_1", "Cyclopoid_1", "Bosmina_1", "Herpacticoida", "Chironomid", "Chydoridae", "Daphnia"]

# Assign "Others" to all plankton not in selected_classes.
# Vectorized: keep the label where it is one of the selected classes, otherwise
# replace it (missing labels included) with "Others". Re-running the cell is
# harmless because "Others" simply maps to "Others" again.
df[target_col] = df[target_col].where(df[target_col].isin(selected_classes), "Others")

First, we will do logistic regression. 
Logistic regression rests on a few assumptions. The first is that the classes should be (approximately) linearly separable in feature space, i.e. the log-odds of each class should be close to a linear function of the features. The zooplankton features are not cleanly separable, but we will still use logistic regression as a baseline: it is simple and interpretable, and with class weighting it can cope with imbalanced classes, so I chose it as the baseline model. 
The second is that there should be no multicollinearity among the features. To remove multicollinearity, we will use PCA as an EDA step. 
The third is that the features should be on the same scale; however, we found that the range of feature values is extremely large. Therefore, we will standardize the features. 

In [6]:
# Baseline 1: StandardScaler -> PCA (2 components) -> class-weighted logistic regression.
X = df[feature_cols]
y = df[target_col]

if X.isnull().values.any():
    print("There are missing values in the features.")
else:
    print("There are no missing values in the features.")

# Split BEFORE fitting any preprocessing. Fitting the scaler/PCA on all rows
# would let statistics of the held-out rows leak into the training features.
# The stratified split depends only on y and the seed, so the held-out rows are
# the same ones obtained when splitting the already-projected data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2453, stratify=y
)

# Standardize features: learn mean/std from the training rows only, then apply
# the same transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# PCA is likewise fitted on the training rows only.
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Full-data projections, kept under their previous names for any later
# plotting/EDA. They use the train-fitted transforms and are not used to fit
# or evaluate the model below.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)

# Logistic Regression; class_weight="balanced" reweights classes inversely to
# their frequency to counter the heavy class imbalance.
log_reg = LogisticRegression(class_weight="balanced", max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

There are no missing values in the features.
Classification Report:
                precision    recall  f1-score   support

    Bosmina_1       0.02      0.85      0.03       575
   Calanoid_1       0.51      0.23      0.32     43043
   Chironomid       0.00      0.25      0.00         8
   Chydoridae       0.00      0.00      0.00         9
  Cyclopoid_1       0.50      0.12      0.19     39596
      Daphnia       0.00      0.39      0.00       112
Herpacticoida       0.00      0.34      0.01       121
       Others       0.89      0.74      0.80    164452

     accuracy                           0.55    247916
    macro avg       0.24      0.36      0.17    247916
 weighted avg       0.76      0.55      0.62    247916

Confusion Matrix:
 [[   487      3      0      5     22      7     46      5]
 [  1740  10007   9739    794   3782   7136   3679   6166]
 [     0      4      2      0      0      0      0      2]
 [     2      2      0      0      2      0      3      0]
 [  3036   28

In [8]:
# Baseline 2: RobustScaler -> PCA (2 components) -> class-weighted logistic regression.
# Uses X and y defined in the previous modelling cell.

# Split BEFORE fitting any preprocessing. Fitting the scaler/PCA on all rows
# would let statistics of the held-out rows leak into the training features.
# Same seed and stratification as the StandardScaler run, so both runs are
# evaluated on the same held-out rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2453, stratify=y
)

# Robust scaling (median / inter-quartile range with the default settings)
# instead of mean/std standardization; fitted on the training rows only.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# PCA is likewise fitted on the training rows only.
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Full-data projections, kept under their previous names for any later
# plotting/EDA. They use the train-fitted transforms and are not used to fit
# or evaluate the model below.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)

# Logistic Regression; class_weight="balanced" reweights classes inversely to
# their frequency to counter the heavy class imbalance.
log_reg = LogisticRegression(class_weight="balanced", max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
                precision    recall  f1-score   support

    Bosmina_1       0.00      0.93      0.01       575
   Calanoid_1       0.54      0.16      0.24     43043
   Chironomid       0.00      0.25      0.00         8
   Chydoridae       0.00      0.56      0.00         9
  Cyclopoid_1       0.49      0.12      0.20     39596
      Daphnia       0.00      0.45      0.01       112
Herpacticoida       0.00      0.45      0.00       121
       Others       0.76      0.18      0.29    164452

     accuracy                           0.17    247916
    macro avg       0.23      0.39      0.09    247916
 weighted avg       0.67      0.17      0.27    247916

Confusion Matrix:
 [[   537      0      0     28      0      8      2      0]
 [  1853   6674   3923   7204   3934   8036   3136   8283]
 [     0      0      2      0      0      0      1      5]
 [     1      1      0      5      1      0      1      0]
 [  2892   4785    962   8968   4861   3910  12073   1145]