In [18]:
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation warnings raised by the libraries imported below.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import tensorflow as tf
# Set the tf.keras backend learning phase to 1 right after importing TensorFlow.
# NOTE(review): set_learning_phase is deprecated in TF 2.x — confirm it is still
# available (and needed) for the installed version.
tf.keras.backend.set_learning_phase(1)
from PIL import Image
# NOTE(review): `K` comes from the `keras` package while the call above goes
# through `tf.keras` — confirm both refer to the same backend in this environment.
from keras import backend as K 
from keras.preprocessing.image import ImageDataGenerator 
import matplotlib.pyplot as plt
import os, sys, random, gc

# `src` is imported as a local package: exploration and pre_process helpers.
from src import exploration
from src import pre_process
from sklearn.model_selection import train_test_split

All TIFF (`.tif`) images show clot tissue stained with `Martius Scarlet Blue (MSB) stain`. The two major acute ischemic stroke (AIS) etiology subtypes and their main cellular differences (RBCs = red blood cells, WBCs = white blood cells, F = fibrin, P = platelets) are:

1. **Cardioembolic (CE):** RBCs=47.67%, WBCs=4.22%, F=29.19%, P=18.21%

2. **Large Artery Atherosclerosis (LAA):** RBCs=42.58%, WBCs=3.12%, F=31.31%, P=20.81%


<font size="1,5"> [Abbasi M, Fitzgerald S, Ayers-Ringler J, Espina V, Mueller C, Rucker S, Kadirvel R, Kallmes D, Brinjikji W. Proteomic Analysis of Cardioembolic and Large Artery Atherosclerotic Clots Using Reverse Phase Protein Array Technology Reveals Key Cellular Interactions Within Clot Microenvironments. Cureus. 2021 Feb 22;13(2):e13499. doi: 10.7759/cureus.13499. PMID: 33777584; PMCID: PMC7990677.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7990677/)</font>

Given this insight, we keep the images in color, since each cellular component is stained a different color. We do not need to preserve every pixel, however: downscaling the images makes the neural network (NN) processing more efficient.

![CE](./images/CEexample1.png)

`Data preprocess:`


`      1. Split dataset into train_x, valid_x, train_y, valid_y`

In [19]:
# Read the training metadata table, then hand it to the project's split helper.
# The four results are unpacked in the order the helper returns them.
train1_csv_path = "../Final-IRONHACK-Project/data/train1.csv"
train1 = pd.read_csv(train1_csv_path)
(train_x, valid_x, train_y, valid_y) = pre_process.train_valid_split(train1)

In [20]:
# Fraction of rows per label in the training split.
train_label_counts = train_x['label'].value_counts()
train_label_counts / len(train_x['label'])

label
CE     0.72471
LAA    0.27529
Name: count, dtype: float64

In [21]:
# Wrap the training split in the project's labeling helper and build the
# training table from it (the cell output shows a `new_file_path` column).
train2 = exploration.Labeling_images(train_x)
train3 = train2.class_again_train()
# Preview one row. A fixed random_state keeps the displayed row identical on
# every Restart & Run All instead of changing with each execution.
train3.sample(1, random_state=42)

Unnamed: 0,image_id,center_id,patient_id,image_num,label,new_file_path
511,af4dd6_0,4,af4dd6,0,CE,D:/bootcamp/original/train_folder/af4dd6_0.tif


In [22]:
# Fraction of rows per label in the validation split.
valid_label_counts = valid_x['label'].value_counts()
valid_label_counts / len(valid_x['label'])

label
CE     0.728477
LAA    0.271523
Name: count, dtype: float64

In [23]:
# Wrap the validation split in the project's labeling helper and build the
# validation table from it (the cell output shows a `new_file_path` column).
val = exploration.Labeling_images(valid_x)
val2 = val.class_again_val()
# Preview one row. A fixed random_state keeps the displayed row identical on
# every Restart & Run All instead of changing with each execution.
val2.sample(1, random_state=42)

Unnamed: 0,image_id,center_id,patient_id,image_num,label,new_file_path
717,f7fb11_0,11,f7fb11,0,LAA,D:/bootcamp/original/val_folder1/f7fb11_0.tif


In [24]:
# Write the training table to disk, leaving the index out of the file.
train_xdef_path = "../Final-IRONHACK-Project/data/train_xdef.csv"
train3.to_csv(train_xdef_path, index=False)

In [25]:
# Write the validation table to disk, leaving the index out of the file.
valid_xdef_path = "../Final-IRONHACK-Project/data/valid_xdef.csv"
val2.to_csv(valid_xdef_path, index=False)

`2. Create folders for each label:`

In [26]:
# One-off setup, kept commented out: create the top-level split folders.
# NOTE(review): this lists "val_folder", while the other cells in this notebook
# use "val_folder1" — confirm the intended folder name before re-running.
# data_path = "D:/bootcamp/original/"
# folder_names= ["train_folder", "val_folder"]
# pre_process.createfolders(data_path,folder_names)

In [27]:
# Run this only once: the images are moved permanently.
#     train_x images go to train_folder, valid_x images go to val_folder1.
# train_path = "D:/bootcamp/original/train_folder/"
# valid_path = "D:/bootcamp/original/val_folder1/"
# pre_process.move_images(train_x, train_path)
# pre_process.move_images(valid_x, valid_path)

In [28]:
# For training folder: create one sub-folder per label (CE, LAA).
# NOTE(review): the base path here is "try_train/", but the class-sorting cell
# further down targets "train_folder/CE/" and "train_folder/LAA/" — confirm
# which location is intended before re-running.
# data_path = "D:/bootcamp/original/try_train/"
# folder_names= ["CE", "LAA"]
# pre_process.createfolders(data_path,folder_names)

In [29]:
# For validation folder: create one sub-folder per label (CE, LAA).
# NOTE(review): the base path here is "try_val/", but the class-sorting cell
# further down targets "val_folder1/CE/" and "val_folder1/LAA/" — confirm
# which location is intended before re-running.
# data_path = "D:/bootcamp/original/try_val/"
# folder_names= ["CE", "LAA"]
# pre_process.createfolders(data_path,folder_names)

In [30]:
# Run this only once: the images are moved permanently.
#      For the train folder: pass train3 and the two class folders to
#      pre_process.images_class (presumably moves each image into the folder
#      matching its label — helper not visible here, confirm).
# folder_path_CE = "D:/bootcamp/original/train_folder/CE/"
# folder_path_LAA = "D:/bootcamp/original/train_folder/LAA/"
# pre_process.images_class(train3, folder_path_CE, folder_path_LAA)

In [31]:
# Run this only once: the images are moved permanently.
#      For the validation folder: pass val2 and the two class folders to
#      pre_process.images_class (presumably moves each image into the folder
#      matching its label — helper not visible here, confirm).
# folder_path_CE = "D:/bootcamp/original/val_folder1/CE/"
# folder_path_LAA = "D:/bootcamp/original/val_folder1/LAA/"
# pre_process.images_class(val2, folder_path_CE, folder_path_LAA)

In [32]:
# Number of target classes.
n_classes = 2

# Raise Pillow's pixel-count limit so that very large images can still be
# opened and plotted, regardless of how many pixels they contain.
Image.MAX_IMAGE_PIXELS = 3_000_000_000

# Set the Keras backend learning phase to 1.
K.set_learning_phase(1)

# Sample counts, currently unused:
# nb_train_samples = 2
# nb_validation_samples = 4

In [33]:
# Ask the project helper for its two image generators and unpack them in the
# order it returns them.
generator_pair = pre_process.image_generator()
train_generator, test_generator = generator_pair


Found 80 images belonging to 2 classes.
Found 20 images belonging to 2 classes.
