# Imports

In [None]:
import glob, pylab, pandas as pd
import pydicom, numpy as np
from os import listdir
from os.path import isfile, join
import matplotlib.pylab as plt
import os
import scipy
import seaborn as sns
import pickle
import math
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer 

# **Loading the dataset**

In [None]:
# Loading the dataset
!ls ../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection

> ### Read train csv file

In [None]:
train_csv_file = pd.read_csv('../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/stage_2_train.csv')

In [None]:
# Size of the training table, followed by a preview of its first rows.
print(train_csv_file.shape)
# `.head` must be called: without the parentheses, print() shows the bound
# method's repr instead of a short preview.
print(train_csv_file.head())

# **Visualization**

In [None]:
# Checking if label exists in all dataset
train_csv_file.Label.isnull().sum()

In [None]:
# Pass the labels by keyword: recent seaborn releases treat the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x=train_csv_file.Label);

In [None]:
# Call `.head()` so the cell displays the first rows rather than the method object.
train_csv_file.head()

In [None]:
# Split "ID_<image>_<subtype>" at its last underscore so that every row carries the
# image it belongs to and the hemorrhage sub-type it describes. The frame is built
# here so the cell does not depend on a variable created outside the notebook.
id_parts = train_csv_file["ID"].str.rsplit("_", n=1, expand=True)
train_df = train_csv_file.assign(**{"Image_ID": id_parts[0], "Sub-type": id_parts[1]})

# Share of positive labels (1) per sub-type, in percent. Labels are 0/1, so the
# mean of a group equals (number of ones) / (group size).
subtype_counts = train_df.groupby("Sub-type").Label.mean() * 100

# Sum of the label column per image (number of rows labelled 1 for that image).
multi_target_count = train_df.groupby("Image_ID").Label.sum()

fig, ax = plt.subplots(1,3,figsize=(20,5))

# Variables are passed by keyword (x=...) because recent seaborn releases treat the
# first positional argument as `data`.
sns.countplot(x=train_df.Label, ax=ax[0], palette="Reds")
ax[0].set_xlabel("Binary label")
ax[0].set_title("How often do we observe a positive label?");

sns.countplot(x=multi_target_count, ax=ax[1])
ax[1].set_xlabel("Number of targets per image")
ax[1].set_ylabel("Frequency")
ax[1].set_title("Multi-Hot occurences")

sns.barplot(x=subtype_counts.index, y=subtype_counts.values, ax=ax[2], palette="Set2")
# Rotate the sub-type names on the third panel explicitly instead of relying on
# pyplot's "current axes".
ax[2].tick_params(axis="x", rotation=45)
ax[2].set_title("How much binary imbalance do we have?")
ax[2].set_ylabel("% of positive occurences (1)");

**Observation:**
1. Number of positive examples is relatively low
2. Probability of having more than one hemorrhage is lower
3. Epidural class (label) has the least number of examples which will cause a slight problem 

## What does pixel spacing mean? 


*All pixel-spacing-related attributes are encoded as the physical distance between the centers of each two-dimensional pixel, specified by two numeric values. The first value is the row spacing in mm, that is, the spacing between the centers of adjacent rows (vertical spacing). The second value is the column spacing in mm, that is, the spacing between the centers of adjacent columns (horizontal spacing).*

Consequently it's related to the physical distance.

> # **Creating pd table**

In [None]:
# Index labels of the rows of train_csv_file that a later cell removes with
# `drop(index=duplicates_to_remove)`.
# NOTE(review): presumably these are the rows of images that occur twice in the CSV — confirm.
duplicates_to_remove = [56340, 56341, 56342, 56343, 56344, 56345, 56346, 56347, 56348, 56349, 56350, 56351, 1171824, 1171825, 1171826, 1171827,
 1171828, 1171829, 1171830, 1171831, 1171832, 1171833, 1171834, 1171835, 3705306, 3705307, 3705308, 3705309, 3705310,
 3705311, 3705312, 3705313, 3705314, 3705315, 3705316, 3705317, 3842472, 3842473, 3842474, 3842475, 3842476, 3842477,
 3842478, 3842479, 3842480, 3842481, 3842482, 3842483]


In [None]:
train_csv_file.shape
# Output :  (4516842, 2)

In [None]:
# Remove the listed rows and renumber the remaining ones from 0 in a single chain.
train_csv_file = (
    train_csv_file
    .drop(index=duplicates_to_remove)
    .reset_index(drop=True)
)

In [None]:
train_csv_file.shape
# Output : (4516794, 2)


In [None]:
# IDs = train_csv_file['ID'].str.rsplit("_", n=1, expand=True)[0]

# print(IDs[0:10])

## **Creating one row in the dataframe for each image**

In [None]:
# Build one row per image (ID + one 0/1 column per label) for multi-label training.
#
# The CSV is read in groups of 6 consecutive rows per image; the labels of a group
# are assigned to the columns below by position, exactly as listed.
label_columns = ['epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'any']

# Pull both columns out once; indexing plain arrays is far cheaper than calling
# `.iloc[k][...]` several times per image.
csv_ids = train_csv_file['ID'].to_numpy()
csv_labels = train_csv_file['Label'].to_numpy()

# Rows are collected in a list and turned into a DataFrame once at the end:
# appending to a DataFrame inside the loop copies the whole frame each time, and
# DataFrame.append no longer exists in pandas 2.x.
records = []
for img_idx in range(500000,750000):
    if img_idx%10000==0:
        print(img_idx)
    k = img_idx*6
    id_parts = csv_ids[k].split('_')
    # Only IDs of the form "ID_<image>_<subtype>" are used.
    if len(id_parts) == 3:
        record = {'ID': 'ID_' + id_parts[1]}
        for offset, column in enumerate(label_columns):
            record[column] = csv_labels[k + offset]
        records.append(record)

df_train_multilbl = pd.DataFrame(records, columns=['ID'] + label_columns)

print(df_train_multilbl.shape)
df_train_multilbl.head(10)

In [None]:
filename = 'pickled_ds3'
df_train_multilbl.to_pickle(filename)

In [None]:
df1 = pd.read_pickle('../input/dataframe/pickled_ds')
df2 = pd.read_pickle('../input/dataset/pickled_ds2')
df3 = pd.read_pickle('../input/dataset/pickled_ds3')

In [None]:
df_train_multilbl = pd.concat([df1, df2,df3])

In [None]:
print(df_train_multilbl.shape)
# `.head` must be called to obtain the first rows; printing the attribute shows
# the bound method instead.
print(df_train_multilbl.head())

### **Converting np.array into a df**

In [None]:
# # Order :  any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
# dataset = pd.DataFrame({'Image_ID': result[:, 0],'Any':result[:, 1],'Epidural':result[:, 2],'Intraparenchymal':result[:, 3],
#                         'Intraventricular':result[:, 4],'Subarachnoid':result[:, 5],'Subdural':result[:, 6],'Label': result[:, 7]})

# print(dataset.head)

## **Selecting subset from dataset**

In [None]:
 df_train_multilbl.loc[(df_train_multilbl["epidural"] == 1)].shape

In [None]:
# Keep at most the first 3136 positive rows of every hemorrhage sub-type and at
# most the first 9000 rows whose `any` label is 0.
epidural = df_train_multilbl[df_train_multilbl["epidural"] == 1].head(3136)
intraparenchymal = df_train_multilbl[df_train_multilbl["intraparenchymal"] == 1].head(3136)
intraventricular = df_train_multilbl[df_train_multilbl["intraventricular"] == 1].head(3136)
subarachnoid = df_train_multilbl[df_train_multilbl["subarachnoid"] == 1].head(3136)
subdural = df_train_multilbl[df_train_multilbl["subdural"] == 1].head(3136)
no_label = df_train_multilbl[df_train_multilbl["any"] == 0].head(9000)


In [None]:
print(epidural.shape)
print(intraparenchymal.shape)
print(intraventricular.shape)
print(subarachnoid.shape)
print(subdural.shape)
print(no_label.shape)

### **Concatenating 5 subtypes & any = 0**

In [None]:
# Stack the five per-sub-type selections and the negative examples into one frame.
selected_ds=pd.concat([epidural, intraparenchymal,intraventricular,subarachnoid,subdural,no_label])
# A bare expression in the middle of a cell displays nothing, so print the shape.
print(selected_ds.shape)
# `.head()` has to be called to get the preview rows.
print(selected_ds.head())

### **Removing Duplicates from selected_ds**

In [None]:
# An image that is positive for several sub-types is picked by several of the
# per-sub-type selections. Keep its first occurrence: `keep=False` would delete
# every copy and remove those multi-label images from the dataset altogether.
selected_ds = selected_ds.drop_duplicates(subset="ID", keep="first")
selected_ds.shape
# Figures recorded with the earlier `keep=False` call:
# before dropping [23000 rows x 7 columns]>
# after dropping (17455, 7)

### **Saving all needed Images_ID**

In [None]:
images_id = selected_ds['ID']
print(len(images_id))

In [None]:
# # Save IDS nparray
# with open('images_id.npy', 'wb') as f:
#     np.save(f, images_id)

In [None]:
import random

# Fixed seed so that "Restart & Run All" reproduces the same split every time.
SPLIT_SEED = 42

# selected_ds is the df of all data; the split works on row positions 0..n-1.
l = list(range(selected_ds.shape[0]))
# A dedicated Random instance keeps the global random state untouched.
l_shuffled = random.Random(SPLIT_SEED).sample(l, len(l))
length = len(l_shuffled)

# 80% train / 10% validation / 10% test boundaries (rounded up).
train_end = math.ceil(0.8*length)
valid_end = math.ceil(0.9*length)

# indices of training
ind_train = l_shuffled[0:train_end]
# indices of validation
ind_valid = l_shuffled[train_end:valid_end]
# indices of testing
ind_test = l_shuffled[valid_end:]

In [None]:
print(len(ind_train))
print(len(ind_valid))
print(len(ind_test))

## **Preprocessing Functions**

## Rescale to HU:
The Hounsfield unit (HU) is a relative quantitative measurement of radio density used by radiologists in the interpretation of computed tomography (CT) images. The absorption/attenuation coefficient of radiation within a tissue is used during CT reconstruction to produce a grayscale image.

For rescaling : We will just need to multiply the values by the slope (dicom.RescaleSlope) and add the intercept (dicom.RescaleIntercept).

In [None]:
def image_to_hu(dicom):
    """Return the pixel data of `dicom` as a float64 array in Hounsfield units (HU).

    The raw values are multiplied by `dicom.RescaleSlope` (skipped when the slope
    is 1), shifted by `dicom.RescaleIntercept`, and every value below -1024 is
    raised to -1024. `dicom.pixel_array` itself is not modified.
    """
    hu = dicom.pixel_array.astype(np.float64)
    rescale_slope = dicom.RescaleSlope
    rescale_intercept = dicom.RescaleIntercept

    # A slope of exactly 1 leaves the values unchanged, so the product is skipped.
    if rescale_slope != 1:
        hu = (rescale_slope * hu.astype(np.float64)).astype(np.float64)

    hu += np.float64(rescale_intercept)

    # -1024 HU is used as the value of air; anything lower is clamped to it.
    below_air = hu < -1024
    hu[below_air] = -1024
    return hu

## **Windowing**

Although we can now interpret the values in the image, we still cannot really see anything in it. What we can do is look at a certain value range that we know is interesting. Typically this is done by a process called windowing, where the image is clipped to a certain value range, defined as:

$$\text{WindowCenter} \pm \frac{\text{WindowWidth}}{2}$$

A range in Hounsfield Units that might be interesting to look at is a width of 80 and a center of 40, described as useful for analyses of brains (source: https://radiopaedia.org/articles/windowing-ct), as well as a width of 130 and a center of 50.

In [None]:
def image_windowed(image, custom_center=50, custom_width=130, out_side_val=False):
    '''
    Clip `image` to the window custom_center +/- custom_width/2 and return it.

    Note: `image` is modified in place; the returned array is the same object.

    With out_side_val=True, values further than a full custom_width away from the
    center are set to center -/+ custom_width instead of the window edge, so they
    stay distinguishable from values that lie just outside the window.
    '''
    # see: https://www.kaggle.com/allunia/rsna-ih-detection-eda-baseline
    half_width = custom_width/2
    lower = custom_center - half_width
    upper = custom_center + half_width

    if not out_side_val:
        image[image < lower] = lower
        image[image > upper] = upper
        return image

    # Second, wider pair of bounds used for the far-outside values.
    far_lower = custom_center - custom_width
    far_upper = custom_center + custom_width

    image[(image < lower) & (image > far_lower)] = lower
    image[(image > upper) & (image < far_upper)] = upper
    image[image < far_lower] = far_lower
    image[image > far_upper] = far_upper
    return image

## Resample images : 

Slice Thickness: 2.500000
Pixel Spacing (row, col): (0.722656, 0.722656) 

This means we have 2.5 mm slices, and each voxel represents 0.7 mm.

Because a CT slice is typically reconstructed at 512 x 512 voxels, each slice represents approximately 370 mm of data in length and width.

Using the metadata from the DICOM we can figure out the size of each voxel as the slice thickness. It would be useful to ensure that each slice is resampled in 1x1x1 mm pixels and slices.

In [None]:
def image_resample(image, dicom_header, new_spacing=[1,1]):
    """Resample a 2-D image so that its pixel spacing is close to `new_spacing`.

    Parameters
    ----------
    image : 2-D numpy array to resample.
    dicom_header : object whose `PixelSpacing` attribute holds two values
        convertible to float (the current spacing along each image axis).
    new_spacing : the two target spacings, in the same unit as `PixelSpacing`.

    Returns
    -------
    The zoomed image. Its shape is `image.shape * spacing / new_spacing` rounded
    to whole pixels, so the spacing actually obtained may differ slightly from
    `new_spacing`.
    """
    # Code from https://www.raddq.com/dicom-processing-segmentation-visualization-in-python/
    # Adapted to work for pixels.

    # Imported here so the function neither depends on a bare `import scipy`
    # exposing the `ndimage` submodule nor on the deprecated
    # `scipy.ndimage.interpolation` namespace.
    from scipy import ndimage

    spacing = np.array([float(value) for value in dicom_header.PixelSpacing])
    old_shape = np.array(image.shape)

    resize_factor = spacing / np.asarray(new_spacing, dtype=np.float64)
    # The output must have a whole number of pixels, so round the ideal shape and
    # derive the zoom factor that produces exactly that shape.
    new_shape = np.round(old_shape * resize_factor)
    real_resize_factor = new_shape / old_shape

    return ndimage.zoom(image, real_resize_factor)

## Cropping images:

The `numpy.nonzero()` function computes the indices of the elements that are non-zero.

It returns a tuple of arrays, one for each dimension of arr, containing the indices of the non-zero elements in that dimension.

In [None]:
def image_crop(image):
    """Crop a 2-D image to the bounding box of its non-zero pixels.

    Returns a view of `image` covering every row and column that contains at
    least one non-zero pixel. An image without any non-zero pixel has no bounding
    box and is returned unchanged.
    """
    # Based on this stack overflow post: https://stackoverflow.com/questions/26310873/how-do-i-crop-an-image-on-a-white-background-with-python
    # coords[0] holds the row indices and coords[1] the column indices of all
    # non-zero pixels.
    coords = np.array(np.nonzero(image != 0))

    # min()/max() of an empty index list would raise, so handle the blank image first.
    if coords.shape[1] == 0:
        return image

    top_left = np.min(coords, axis=1)
    # Slice ends are exclusive: add 1 so the last non-zero row and column are kept.
    bottom_right = np.max(coords, axis=1) + 1

    out = image[top_left[0]:bottom_right[0],
                top_left[1]:bottom_right[1]]

    return out

## **Padding**

Bring images back to equal spacing
Padding the images puts the brain in the center and keeps the resampled voxel dimensions. A further thing to test might be to resize the images to fill out the whole space.

In [None]:
def image_pad(image, new_height = 512, new_width = 512):
    """Center a 2-D image on a zero-filled canvas of new_height x new_width.

    An axis of `image` that is longer than the canvas is center-cropped to the
    canvas size first, so the result always has shape (new_height, new_width).
    Returns a new float array; `image` is not modified.
    """
    # based on https://stackoverflow.com/questions/26310873/how-do-i-crop-an-image-on-a-white-background-with-python
    height, width = image.shape

    # An oversized image cannot be pasted onto the canvas; keep its central part.
    if height > new_height:
        crop_top = (height - new_height) // 2
        image = image[crop_top:crop_top + new_height, :]
        height = new_height
    if width > new_width:
        crop_left = (width - new_width) // 2
        image = image[:, crop_left:crop_left + new_width]
        width = new_width

    # make canvas
    im_bg = np.zeros((new_height, new_width))

    # Offsets that center the image; with an odd margin the extra pixel goes to
    # the bottom/right side.
    pad_left = (new_width - width) // 2
    pad_top = (new_height - height) // 2

    im_bg[pad_top:pad_top + height,
          pad_left:pad_left + width] = image

    return im_bg

## Resize to same dimensions:

In [None]:
# image here is ds.pixel_array.
from skimage.transform import resize
def resize_image(img, IMG_PX_SIZE=32):
    """Return `img` resized to IMG_PX_SIZE x IMG_PX_SIZE with skimage's `resize`."""
    target_shape = (IMG_PX_SIZE, IMG_PX_SIZE)
    return resize(img, target_shape)

# **Dealing with DICOM**

In [None]:
# One DICOM path per selected image: <train_images_dir><image id>.dcm
train_images_dir = '../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/stage_2_train/'
img_paths = [train_images_dir+image_id+'.dcm' for image_id in images_id]

print(len(img_paths))

In [None]:
# Read every selected DICOM image and preprocess it:
# Hounsfield units -> windowing -> resampling -> resize to 64x64.
# Cropping and padding are not applied (the notes at the end of the notebook
# report better results without them).
img_list = []
for path in img_paths:
    ds = pydicom.dcmread(path)
    # image_to_hu reads ds.pixel_array itself, so no separate read is needed here.
    im = image_to_hu(ds)
    # Windowing (modifies `im` in place and returns it)
    im = image_windowed(im)
    # Resampling, using the pixel spacing stored in the DICOM header
    im = image_resample(im, ds)
    # Resizing
    im = resize_image(im,64)
    img_list.append(im)

img_array = np.asarray(img_list)
print('done reading images')
print(img_array.shape)

## **Converting Images to 3 Channels**

In [None]:
# Copy every grayscale image into 3 identical channels on a new last axis.
img_array = np.stack((img_array,) * 3, axis=-1)
img_array.shape

## **Creating train/validation/test images and labels**

In [None]:
# Only the five sub-type columns remain as training targets.
selected_ds = selected_ds.drop(columns=['ID', 'any'])

**Convert to numpy float32**

In [None]:
img_array = np.asarray(img_array).astype(np.float32)
selected_ds = np.asarray(selected_ds).astype(np.float32)


In [None]:
selected_ds.shape

In [None]:
# Once img_array is available: pick the images and labels of each split by row
# position, in the order train, validation, test.
train_images, valid_images, test_images = (
    [img_array[pos] for pos in positions]
    for positions in (ind_train, ind_valid, ind_test)
)

train_labels, valid_labels, test_labels = (
    [selected_ds[pos] for pos in positions]
    for positions in (ind_train, ind_valid, ind_test)
)

In [None]:
# Convert every split to a float32 array. Each split is converted from its OWN
# list: converting the validation and test variables from the training lists
# would make them copies of the training data and turn every later evaluation
# into an evaluation on the training set.
train_labels = np.asarray(train_labels).astype(np.float32)
train_images = np.asarray(train_images).astype(np.float32)
valid_labels = np.asarray(valid_labels).astype(np.float32)
valid_images = np.asarray(valid_images).astype(np.float32)
test_labels = np.asarray(test_labels).astype(np.float32)
test_images = np.asarray(test_images).astype(np.float32)

In [None]:
# Show the first (up to) five preprocessed images, one figure each.
for i in range(0, min(5, len(img_list))):
    # Open the figure BEFORE drawing: imshow draws on the current figure, so
    # creating the figure afterwards leaves the last figure empty.
    plt.figure(i+1)
    plt.imshow(img_list[i],cmap=plt.cm.bone)

plt.show()

# **Building a Neural Network**

In [None]:
from keras.applications import ResNet50,ResNet101
from tensorflow import Tensor
from keras import layers
from keras.models import Sequential
from keras.optimizers import Adam
from tensorflow.keras.layers import Input, Conv2D, ReLU, BatchNormalization,\
                                    Add, AveragePooling2D, Flatten, Dense
from tensorflow.keras.models import Model

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization


1. **Turn internet on to download the ResNet weights (or any other pretrained weights), because it's off by default on Kaggle**


> *In the context of transfer learning, standard architectures designed for ImageNet with corresponding pretrained weights are fine-tuned on medical tasks ranging from interpreting chest x-rays and identifying eye diseases, to early detection of Alzheimer’s disease.*
> *Most published deep learning models for healthcare data analysis are pretrained on ImageNet, Cifar10, etc. Pretraining most times does not necessarily need to be done on dataset of similar domain but just to give a model a general context about objects. This has been proven to fasten convergence of deep models than training from scratch.*

In [None]:
resnet = ResNet101(
    weights='imagenet',
    include_top=False,
    input_shape=(64,64,3),
)

In [None]:
# Initializations
BATCH_SIZE = 32
EPOCHS = 5
learning_rate=1e-3


In [None]:
# Resnet pretrained - AvgPool - Flatten - FC - Sigmoid
def build_model():
    """Build and compile the transfer-learning classifier.

    Architecture: the module-level `resnet` base, global average pooling,
    dropout (0.5) and a dense layer with 5 sigmoid outputs. The model is compiled
    with binary cross-entropy, Adam at the module-level `learning_rate`, and the
    accuracy metric.

    Returns
    -------
    The compiled Keras `Sequential` model.
    """
    model = Sequential()
    model.add(resnet)
    model.add(layers.GlobalAveragePooling2D())
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(5, activation='sigmoid'))
    # `learning_rate=` is the supported keyword; the old `lr=` alias is deprecated
    # and rejected by current Keras releases.
    model.compile(loss='binary_crossentropy',optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])

    return model

In [None]:
# fit the keras model on the dataset
model = build_model()
model.fit(train_images, train_labels, epochs=EPOCHS, batch_size=BATCH_SIZE)

In [None]:
# evaluate the keras model
_, accuracy = model.evaluate(valid_images, valid_labels)
print('Accuracy: %.2f' % (accuracy*100))

## **Second NN Model**

In [None]:

# Second model: a small CNN trained from scratch on 64x64x3 inputs.
# Five convolution blocks (each followed by batch normalization), two dense
# blocks, and a dense output layer with 5 sigmoid units.
model2 = Sequential()
model2.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(64,64,3)))
model2.add(BatchNormalization())

model2.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
model2.add(BatchNormalization())
model2.add(MaxPooling2D(pool_size=(2, 2)))

model2.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model2.add(BatchNormalization())
model2.add(Dropout(0.25))

model2.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
model2.add(BatchNormalization())
model2.add(Dropout(0.25))


model2.add(Conv2D(256, kernel_size=(3, 3), activation='relu'))
model2.add(BatchNormalization())
model2.add(MaxPooling2D(pool_size=(2, 2)))
model2.add(Dropout(0.25))


model2.add(Flatten())

model2.add(Dense(512, activation='relu'))
model2.add(BatchNormalization())
model2.add(Dropout(0.25))

model2.add(Dense(128, activation='relu'))
model2.add(BatchNormalization())
model2.add(Dropout(0.5))


# One sigmoid unit per label column, matching the binary cross-entropy loss.
model2.add(Dense(5, activation='sigmoid'))

# `learning_rate=` is the supported keyword; the old `lr=` alias is deprecated
# and rejected by current Keras releases.
model2.compile(loss='binary_crossentropy',optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])


In [None]:
model2.fit(train_images, train_labels, epochs=EPOCHS, batch_size=BATCH_SIZE)

In [None]:
# evaluate the keras model
_, accuracy = model2.evaluate(valid_images, valid_labels)
print('Accuracy: %.2f' % (accuracy*100))

## NOTES : 
### pre-trained Model:
1. using batch size = 32 and epoch = 5 ,image is 32x32 accuracy:15% and loss : 0.6  TOO MUCH 

* Binary cross-entropy is for multi-label classifications, whereas categorical cross entropy is for multi-class classification where each example belongs to a single class.

2. increase size = 64 * 64 and removing padding and cropping  and use binary cross entropy instead of categorical:
    loss decreased to : 0.2 and accuracy increased but 5% 
3. Shuffled data splitting and added more epidural examples (3000) any=0 (8000) Loss : 0.3262 Training Accuracy:35% Dev Acc:23%
4. Added more data (136 images for each type of hemorrhage) acc = 27 % (first epoch) --> no change
5. Using resnet101 instead of resnet50 acc increased to 37% , dev acc :31%


### Implemented Model: 
1. batch size =32, epochs= 5 accuracy:30% and loss: 0.3495  dev acc :61%
2. add 1 more layer accuracy:30%, dev acc: 42%
3. decreased dense layer dropout to 0.25 instead 0.5 and removed dropout from conv layer accuracy:32%, dev acc:42%



TODO :
1. Reading more data as epidural hemorrhage is just 900 **--DONE--**
2. Don't save numpy array **--DONE--**
3. Shuffle data after preprocessing ( cancel sorting) **--DONE--**
4. Check Cropping and Padding **--( better without )--**
5. Change imagenet to a different pretrained 
6. Create model

In [None]:
# make probability predictions with the model
predictions = model.predict(test_images)
predictions=(predictions>0.5).astype(int)



In [None]:
predictions2 = model2.predict(test_images)
predictions2=(predictions2>0.5).astype(int)



In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score


In [None]:
#model 1 test accuracy
accuracy = accuracy_score(test_labels, predictions)
# `%` binds tighter than `*`: without the parentheses the string is formatted
# first and then repeated 100 times. Parenthesized, this prints a percentage.
print('Accuracy: %f' % (accuracy*100))

# Previously recorded accuracy (as a fraction): 0.532738


In [None]:
#model 2 test accuracy
accuracy2 = accuracy_score(test_labels, predictions2)
print('Accuracy: %f' % accuracy2)

#Accuracy: 0.477249
