In [1]:
# Load all packages
import os
import sys
import time
from urllib.parse import urlparse

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Set the default figure resolution to 300 dpi for all figures created after this point.
mpl.rcParams["figure.dpi"] = 300
import glob
import re

import anndata
import h5py
import joblib
import napari
import numpy as np
import pandas as pd
import psutil
import pymeshfix
import pyvista as pv
import scanpy as sc
import skimage
from IPython.display import HTML, clear_output
from morphometrics.measure import measure_selected
from PIL import Image
from skimage import draw
from skimage.io import imread, imsave
from skimage.measure import label, marching_cubes, regionprops, regionprops_table
from skimage.transform import downscale_local_mean, rescale, resize
from sklearn.metrics import accuracy_score, confusion_matrix
from tqdm import tqdm

# Print the version of the scikit-image installation used by this kernel.
print(skimage.__version__)

  from tqdm.autonotebook import tqdm


0.19.3


In [2]:
# Load the annotated (labelled) training table.
# Directory that holds the training table read below.
seg_directory = "/cluster/project/treutlein/DATA/imaging/viventis/Morphodynamics_of_human_early_brain_organoid_development/3D_Brain_organoids_half_res_morphometrics/"

# Annotation directory (not referenced by the other cells shown in this notebook section).
annotation_directory_v1 = "/cluster/project/treutlein/DATA/imaging/EmbedSeg_test/data/3D_Brain_organoids_half_res_morphometrics/Annotation_QC_updated/"

# Build the file path first, then read the HDF5 table into a DataFrame.
training_table_path = f"{seg_directory}/training_data_19_06_2023.h5"
labeled_DF = pd.read_hdf(training_table_path, index="0")

In [3]:
# Build the feature matrix X and the label vector y from the annotated table.
# Features: every column except "time_point", restricted to the positional
# column range 6:-7 of what remains.
# NOTE(review): the 6 / -7 offsets presumably strip metadata columns at both
# ends of the table -- confirm against the column layout of labeled_DF.
X_frame = (
    labeled_DF.loc[:, labeled_DF.columns != "time_point"]
    .iloc[:, 6:-7]
    # .assign returns a new frame, so the column is not written onto a slice
    # (the original item assignment is the pattern behind SettingWithCopyWarning).
    # "channel" is encoded as 1 for "GFP" and 0 for any other value.
    .assign(channel=(labeled_DF["channel"] == "GFP").astype(int))
)
X = np.array(X_frame)
y = np.array(labeled_DF["Label"])

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 10 % of the samples as a test set. Stratifying on y keeps every
# label represented in both the training and the test partition; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

In [5]:
# Hyperparameter search for the random forest: 4-fold cross-validated grid
# search over tree depth and number of trees, scored by accuracy. With
# refit=True the best parameter combination is refit on the data passed to fit.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier(random_state=42, n_jobs=1)
parameters = {
    "max_depth": [10, 15, 20, 25],
    "n_estimators": [50, 75, 100],
}
grid_clf = GridSearchCV(
    estimator=rfc,
    param_grid=parameters,
    scoring="accuracy",
    cv=4,
    refit=True,
    verbose=False,
)
grid_clf.fit(X_train, y_train)

In [None]:
# Accuracy of the refit grid-search model on the held-out test split.
from sklearn.metrics import accuracy_score

y_pred = grid_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Save classifier
# Persist the fitted grid-search object with joblib, then read it back from the
# same file so that grid_clf refers to the stored model.
# NOTE: despite its name, rf_save_dir is the path of the .joblib file itself,
# not a directory.
rf_save_dir = "/cluster/home/gutgi/git_repositories/morphodynamics-of-human-brain-organoid-patterning/light_sheet_analysis/cell_morphology_analysis/models/random_forrest_all_v29_06_23.joblib"
joblib.dump(grid_clf, rf_save_dir)
# joblib.load unpickles the file's contents -- only load model files you trust.
grid_clf = joblib.load(rf_save_dir)

In [25]:
from sklearn.model_selection import StratifiedKFold

# Cross-validated confusion matrix for a random forest with fixed
# hyperparameters: the classifier is refit on each of 4 stratified, shuffled
# folds of (X, y), and the per-fold confusion matrices on the held-out parts
# are summed.
random_forrest = RandomForestClassifier(
    n_jobs=1, max_depth=25, n_estimators=75, random_state=42
)

skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# `labels` is not defined by any earlier cell of this notebook, so derive it
# here. np.unique returns the sorted distinct classes, which is the same order
# confusion_matrix uses by default.
labels = np.unique(y)

cm = np.zeros((len(labels), len(labels)), dtype=np.int64)
for train_index, test_index in skf.split(X, y):
    X_training = X[train_index]
    y_training = y[train_index]

    random_forrest.fit(X_training, y_training)

    X_testing = X[test_index]
    y_testing = y[test_index]

    y_prediction = random_forrest.predict(X_testing)
    # Passing labels= fixes the matrix shape and ordering for every fold, even
    # if a class happens to be absent from a fold's truth and predictions.
    cm += confusion_matrix(y_testing, y_prediction, labels=labels)
cm = pd.DataFrame(cm, columns=labels, index=labels)

# Drop the first class (row and column) before computing per-class rates.
# NOTE(review): presumably the first label is the non-marker / background
# class -- confirm.
markers_cm = cm.iloc[1:, 1:]

# Diagonal = correctly classified counts per class. np.diag replaces the
# original list-of-arrays fancy index, which NumPy has deprecated.
tp = np.diag(markers_cm.values)

# NOTE(review): scikit-learn's confusion matrix has true classes on rows and
# predicted classes on columns, so the column sums (axis=0) used here are
# *predicted* counts and the ratio below is per-class precision within the
# marker sub-matrix. If the true positive rate (recall) is intended, the
# denominator should be the row sums (axis=1) -- confirm before changing.
total_positives = markers_cm.sum(axis=0)
true_positive_rate = (tp / total_positives).to_frame(
    name="Cross validation true positive rate"
)