In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [13]:
import numpy as np
import tensorflow as tf
import os
import pandas as pd
from tensorflow.keras.preprocessing.image import img_to_array, load_img

In [14]:
# Load the trained model
model = tf.keras.models.load_model("/kaggle/input/test/tensorflow2/default/1/single_blood_cell_classifier.h5")

In [None]:
IMG_HEIGHT, IMG_WIDTH = 360, 363
class_labels = ['basophil', 'eosinophil', 'erythroblast', 'ig', 'lymphocyte', 'monocyte', 'neutrophil', 'platelet']

In [None]:
# Path to the main directory containing subclass folders, each containing patient folders
DATASET_DIR = "/kaggle/input/deep-learning-aml/Dataset/data"

In [19]:
# Function to count cell types in a single patient folder
def count_cell_types_in_patient_folder(patient_folder_path):
    cell_counts = np.zeros(len(class_labels))
    for image_file in os.listdir(patient_folder_path):
        image_path = os.path.join(patient_folder_path, image_file)
        try:
            img = load_img(image_path, target_size=(IMG_HEIGHT, IMG_WIDTH))
            img_array = img_to_array(img)
            img_array = np.expand_dims(img_array, axis=0)  # Add batch dimension
            prediction = model.predict(img_array, verbose=0)
            predicted_class = np.argmax(prediction)
            cell_counts[predicted_class] += 1
        except Exception as e:
            print(f"Skipping unreadable file: {image_path}. Error: {e}")
            continue
    return cell_counts

In [18]:
# Initialize an empty DataFrame for all patients
df_all_patients = pd.DataFrame(columns=["patient"] + class_labels + ["target"])

In [None]:
# Iterate through each subclass folder (which will be the target for the patients within)
for subclass_folder in os.listdir(DATASET_DIR):
    subclass_path = os.path.join(DATASET_DIR, subclass_folder)
    if os.path.isdir(subclass_path):
        # Iterate through each patient folder in the subclass folder
        for patient_folder in os.listdir(subclass_path):
            patient_path = os.path.join(subclass_path, patient_folder)
            if os.path.isdir(patient_path):
                cell_counts = count_cell_types_in_patient_folder(patient_path)
                
                # Normalize counts for this patient
                total_count = np.sum(cell_counts)
                if total_count > 0:
                    normalized_counts = cell_counts / total_count
                else:
                    normalized_counts = cell_counts  # No normalization if no cells detected

                # Create a DataFrame row for this patient with the target and append it
                patient_data = pd.DataFrame([[patient_folder] + list(normalized_counts) + [subclass_folder]], 
                                            columns=["patient"] + class_labels + ["target"])
                df_all_patients = pd.concat([df_all_patients, patient_data], ignore_index=True)
                
                # Display the updated DataFrame after each patient is added
                print(f"Updated DataFrame after adding patient '{patient_folder}':")
                print(df_all_patients)

  df_all_patients = pd.concat([df_all_patients, patient_data], ignore_index=True)


Updated DataFrame after adding patient 'RHX':
  patient  basophil  eosinophil  erythroblast        ig  lymphocyte  monocyte  \
0     RHX  0.052314         0.0      0.022133  0.573441    0.038229   0.28169   

   neutrophil  platelet         target  
0    0.014085  0.018109  RUNX1_RUNX1T1  
Updated DataFrame after adding patient 'BHG':
  patient  basophil  eosinophil  erythroblast        ig  lymphocyte  monocyte  \
0     RHX  0.052314    0.000000      0.022133  0.573441    0.038229  0.281690   
1     BHG  0.014085    0.002012      0.012072  0.334004    0.018109  0.597586   

   neutrophil  platelet         target  
0    0.014085  0.018109  RUNX1_RUNX1T1  
1    0.018109  0.004024  RUNX1_RUNX1T1  
Updated DataFrame after adding patient 'HVE':
  patient  basophil  eosinophil  erythroblast        ig  lymphocyte  monocyte  \
0     RHX  0.052314    0.000000      0.022133  0.573441    0.038229  0.281690   
1     BHG  0.014085    0.002012      0.012072  0.334004    0.018109  0.597586   
2     H

In [21]:
df_all_patients.to_csv("all_patients_cell_counts_with_target.csv", index=False)