In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import tensorflow as tf
import tensorflow_io as tfio
import os
from tqdm import tqdm
import cv2
from PIL import Image
from joblib import Parallel, delayed
import gc
from PIL import Image as im
import random
from scipy import ndimage


In [2]:
import sys
!{sys.executable} -m pip install dicomsdl
#!pip install /kaggle/input/rsnapacks/dicomsdl-0.109.1-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
#import pydicom as dicom
import dicomsdl as dicom
import tensorflow as tf
import tensorflow_io as tfio
from PIL import Image as im
import os
from tqdm import tqdm
import cv2
from PIL import Image
from joblib import Parallel, delayed
import gc
import random

Collecting dicomsdl
  Downloading dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dicomsdl
Successfully installed dicomsdl-0.109.1
[0m

In [3]:
datasetPath = '/kaggle/input/rsna-breast-cancer-detection/train.csv'
imgPath = '/kaggle/input/rsna-breast-cancer-detection/train_images/'

def crop(sideName, imgName):
    """
    This function is used to crop the breast images. It takes two arguments.
    
    Input:-
    :sideName = Laterality of breast if it is right or left
    :imgName = Image pixel data of the DCM images
    
    Output:-
    :return = Output after cropping the image.
    
    """
    if sideName == 'L':
        colind=[]
        for r,row in enumerate(imgName):
            for c,col in enumerate(row):
                if col==0:
                    colind.append(c)
                    break
        crop_size = max(colind)
        imgName = imgName[0:512,0:crop_size]
        imgName = cv2.resize(imgName,(128,128))
        
    if sideName == 'R':
        colind=[]
        for r,row in enumerate(imgName):
            for c,col in enumerate(row):
                if col!=0:
                    colind.append(c)
                    break
        crop_size = min(colind)
        imgName = imgName[0:512,crop_size:512]
        imgName = cv2.resize(imgName,(128,128))
    
    return imgName    

    gc.collect()
    
def crop_reverse(sideName, imgName):
    
    """
    This function is used to crop the breast images but in the reverse order.
    Because the laterality is defined wrongly for some images. It takes two arguments.
    
    Input:-
    :sideName = Laterality of breast if it is right or left
    :imgName = Image pixel data of the DCM images
    
    Output:-
    :return = Output after cropping the image.
    
    """
    
    if sideName == 'R':
        colind=[]
        for r,row in enumerate(imgName):
            for c,col in enumerate(row):
                if col==0:
                    colind.append(c)
                    break
        crop_size = max(colind)
        imgName = imgName[0:512,0:crop_size]
        imgName = cv2.resize(imgName,(128,128))
        
    if sideName == 'L':
        colind=[]
        for r,row in enumerate(imgName):
            for c,col in enumerate(row):
                if col!=0:
                    colind.append(c)
                    break
        crop_size = min(colind)
        imgName = imgName[0:512,crop_size:512]
        imgName = cv2.resize(imgName,(128,128))
    
    return imgName    

    gc.collect()
    

def img_process(i,filename,sides):
    
    """
    This function is used to process the images which will be used for the training/test dataset. It takes three arguments.
    
    Input:-
    :i = Index of the image in the dataframe
    :filename = Path of the image
    :sides = List of all images' laterality
    
    Output:-
    :return = Output after cropping the image.
    
    """
    
    
    #ds = dicom.dcmread(filename)
    dsraw = dicom.open(filename)
    ds = dsraw.pixelData()
    
    ds = (ds - ds.min()) / (ds.max() - ds.min())
    if dsraw.PhotometricInterpretation == "MONOCHROME1":  
        ds = 1 - ds
    ds = (ds * 255).astype(np.uint8)

    
    #ds = cv2.normalize(ds, None, 0, 1.0, cv2.NORM_MINMAX, dtype=cv2.CV_32F)
    ds = cv2.resize(ds,(512,512))
    
    #ds = np.where(ds >= 0.999, 0,ds)
    
    try:
        ds = np.array(crop(sides[i], ds))   
    except:
        ds = np.array(crop_reverse(sides[i], ds))
    

    #train_data.loc[i,'img_data'] = [img_fin]
    #train_data.to_csv('/kaggle/working/training_img_data.csv') 
    return ds
    gc.collect()

def dcmToPix(datasetPath, imgPath):
    
    """
    This function is used to process all the images which will be used for the training/test dataset. It takes two arguments.
    
    Input:-
    :datasetPath = Path of the cancer dataset
    :imgPath = Path of the image dataset
   
    Output:-
    :return = Array of all the processed images
    
    """
    
    
    dataset = pd.read_csv(datasetPath)
    
    patient_ids = dataset['patient_id']
    image_ids = dataset['image_id']
    sides  = dataset['laterality']

    imgData = []

    for pi, ii, leng in zip(patient_ids, image_ids, range(len(patient_ids))):
        imgData.append(imgPath + str(pi) + '/' + str(ii) + '.dcm')

    dataset['img_data'] = " "
    
    result = Parallel(n_jobs=128)(\
    delayed(img_process)(i, fname, sides) for i, fname in zip(range(len(imgData)),tqdm(imgData))\
    )
    
    dataset['img_data'] = result
    dataset.to_pickle('imgData.pkl' )
    
    return result
    

In [4]:
with (open('/kaggle/input/output/imgData.pkl', "rb")) as openfile:
     imgData = pickle.load(openfile)

In [5]:
#datacancer = pd.read_csv(datasetPath)
#imgDataFrame = {'cancer':datacancer['cancer'][:50], 'img_data':imgData}
#imgData2 = pd.DataFrame(imgDataFrame)

#imgData=imgData2

In [6]:
imgdata_pos = imgData[imgData['cancer'] == 1]
imgdata_neg = imgData[imgData['cancer'] == 0]

imgdata_pos = imgdata_pos.sample(frac = 1)
imgdata_neg = imgdata_neg.sample(frac = 1)

imgdata_neg = imgdata_neg.sample(frac= 0.7)

frames = 5*[imgdata_pos]
frames.append(imgdata_neg)
imgdata_shuff = pd.concat(frames)
imgdata_shuff = imgdata_shuff.sample(frac=1)

In [7]:
print(imgdata_shuff['cancer'].value_counts())

0    37484
1     5790
Name: cancer, dtype: int64


In [8]:
from sklearn.decomposition import PCA


def random_rotate(imgData):
    #thresh = 0.15
    
    imgData = imgData.reshape(128,128)
    clahe = cv2.createCLAHE(clipLimit=5, tileGridSize=(3,3))
    imgData = clahe.apply(imgData)
    imgData = ndimage.rotate(imgData, random.randint(-30, 30), reshape=False)

    #imgData = np.clip(imgData,thresh,1)
    imgData = imgData[15:110,15:110]
    pca = PCA(25)
    imgData = pca.fit_transform(imgData)
    imgData = pca.inverse_transform(imgData)
    #imgData = cv2.resize(imgData,(528,528))
    
    
    #imgData = im.fromarray(imgData)
    #imgData = np.asarray(imgData.rotate(random.randint(-20, 20)))
    return imgData

In [9]:
imgDataList=[]
for j in tqdm(imgdata_shuff['img_data']):
    imgDataList.append(random_rotate(j))
imgdata_shuff['img_data'] = imgDataList

  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
100%|██████████| 43274/43274 [07:28<00:00, 96.55it/s] 


In [10]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(imgdata_shuff, test_size=0.3, random_state=42, shuffle=True)



train_target = np.array(train['cancer'])
train_features=[]
for i in train['img_data']:
    i=np.array(i)
    train_features.append(i)
train_features=np.array(train_features)



#featureTransform = train_features.reshape(len(train_features), 6400)

#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
#featureTransform = scaler.fit_transform(featureTransform)
norm_features= []
for i in range(len(train_features)):
        norm_features.append(scaler.fit_transform(train_features[i]))
train_features=np.array(norm_features)




#backTransform = featureTransform.reshape(len(train_features),80,80)
#train_features = backTransform.reshape(len(train_features),80,80,1)
train_features = train_features.reshape(len(train_features),95,95,1)
#train_features = np.repeat(train_features[..., np.newaxis], 3, -1)




In [11]:
from keras.models import Sequential
from keras.layers import LeakyReLU
from keras.layers import Dense, Conv2D, Flatten,MaxPooling2D, Dropout, BatchNormalization, GlobalMaxPooling2D
from tensorflow.keras.applications import *
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
import tensorflow as tf
from tensorflow import keras
#conv_base = EfficientNetB6(weights='imagenet', include_top=False, input_shape=(95,95,3),drop_connect_rate=0.2)
model = Sequential()


model.add(Conv2D(8, 3, activation = "relu", input_shape = (95,95,1)))
#model.add(MaxPooling2D())
model.add(LeakyReLU(alpha=0.2))
model.add(Dropout(0.2))

model.add(Conv2D(16, 3, activation = "relu",kernel_regularizer=keras.regularizers.l1_l2(l1=0.1, l2=0.01)))
#model.add(MaxPooling2D())
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(0.1))

model.add(Conv2D(32, 3, activation = "relu",kernel_regularizer=keras.regularizers.l1_l2(l1=0.1, l2=0.01)))
model.add(MaxPooling2D())
model.add(LeakyReLU(alpha=0.1))
model.add(Dropout(0.2))

#model.add(Conv2D(512, 3, activation = "relu"))
#model.add(MaxPooling2D())
#model.add(LeakyReLU(alpha=0.2))
#model.add(Dropout(0.1))

#model.add(Conv2D(1024, 3, activation = "relu"))
#model.add(MaxPooling2D())
#model.add(LeakyReLU(alpha=0.2))
#model.add(Dropout(0.1))


model.add(Flatten())

#model.add(Dense(1024, activation = 'relu')) 
#model.add(LeakyReLU(alpha=0.2))
#model.add(Dropout(0.2))

#model.add(Dense(512, activation = 'relu')) 
#model.add(LeakyReLU(alpha=0.2))
#model.add(Dropout(0.1))

#model.add(Dense(512, activation = 'relu')) 
#model.add(LeakyReLU(alpha=0.2))
#model.add(Dropout(0.1))

model.add(Dense(1024, activation = 'relu',kernel_regularizer=keras.regularizers.l1_l2(l1=0.1, l2=0.01))) 
model.add(LeakyReLU(alpha=0.2))
model.add(Dropout(0.2))

model.add(Dense(256, activation = 'relu',kernel_regularizer=keras.regularizers.l1_l2(l1=0.1, l2=0.01))) 
model.add(LeakyReLU(alpha=0.2))
model.add(Dropout(0.1))


model.add(Dense(1, activation = 'sigmoid'))


#conv_base.trainable = True

#model.compile(optimizer =tf.keras.optimizers.Adam(learning_rate=0.0001),\
#          loss= 'binary_crossentropy', metrics=['accuracy'])

model.compile(\
    loss="binary_crossentropy",\
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),\
    metrics=["acc"],\
)





2023-02-10 01:43:11.614346: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-10 01:43:11.721532: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-10 01:43:11.722461: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-10 01:43:11.725041: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 93, 93, 8)         80        
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 93, 93, 8)         0         
_________________________________________________________________
dropout (Dropout)            (None, 93, 93, 8)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 91, 91, 16)        1168      
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 91, 91, 16)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 91, 91, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 89, 89, 32)        4

In [13]:
#from tensorflow import keras
#model = keras.models.load_model("/kaggle/input/pre-trained-model-of-breast-cancer/trained_model_breast_cancer3.h5")


In [14]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

#datagen = ImageDataGenerator(\
#    rescale=1/255,\
#    validation_split=0.10,\
#    rotation_range=40,\
#    width_shift_range=0.2,\
#    height_shift_range=0.2,\
#    shear_range=0.2,\
#    zoom_range=0.2,\
#    horizontal_flip=True,\
#    fill_mode='nearest'\
#)


#datagen.fit(train_features)
weights = {0:1, 1:10}
model.fit(train_features, train_target,class_weight = weights,batch_size=64,validation_split=0.3,\
           epochs=150)


2023-02-10 01:43:16.021786: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 765428300 exceeds 10% of free system memory.
2023-02-10 01:43:16.943385: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 765428300 exceeds 10% of free system memory.
2023-02-10 01:43:17.585320: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/150


2023-02-10 01:43:18.489709: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 253755392 exceeds 10% of free system memory.
2023-02-10 01:43:18.645379: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 253755392 exceeds 10% of free system memory.
2023-02-10 01:43:18.799435: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 253755392 exceeds 10% of free system memory.
2023-02-10 01:43:20.975174: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 7

<keras.callbacks.History at 0x7f6dcc9dce10>

In [15]:
model.save("/kaggle/working/trained_model_breast_cancer3.h5")


In [16]:
#from tensorflow import keras
#savedModel = keras.models.load_model("/kaggle/input/pre-trained-model-of-breast-cancer/trained_model_breast_cancer3.h5")



In [17]:
test_features=[]
for i in test['img_data']:
    i=np.array(i)
    test_features.append(i)
test_features=np.array(test_features)
 

    
#featureTransform = test_features.reshape(len(test_features), 6400)
scaler = MinMaxScaler()
#featureTransform =scaler.fit_transform(featureTransform)
norm_features= []
for i in range(len(test_features)):
    norm_features.append(scaler.fit_transform(test_features[i]))
test_features=np.array(norm_features)


#backTransform = featureTransform.reshape(len(test_features),80,80)
test_features = test_features.reshape(len(test_features),95,95,1)





test_target = np.array(test['cancer'])

In [18]:

pred = model.predict(test_features)
bin_pred = []
for i in pred:
    if i>=0.5:
        bin_pred.append(1)
    else:
        bin_pred.append(0)
bin_pred = np.array(bin_pred)


In [19]:
import sklearn
acc = sklearn.metrics.accuracy_score(test_target, bin_pred)
print(acc)

countzero=0
countone=0
countzerot=0
countonet=0
for i, j in zip(test_target, bin_pred):
    if i==0 and j==0:
        countzero+=1
    if i==1 and j==1:
        countone+=1
    if i==1:
        countonet+=1
    if i==0:
        countzerot+=1
print(countzero/countzerot,countone/countonet)

0.12778248478779944
0.0 1.0


In [20]:

def recall_m(y_true, y_pred):
    true_positives = np.sum(np.round(np.clip(y_true * y_pred, 0, 1)))
    possible_positives = np.sum(np.round(np.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives )
    return recall

def precision_m(y_true, y_pred):
    true_positives = np.sum(np.round(np.clip(y_true * y_pred, 0, 1)))
    predicted_positives = np.sum(np.round(np.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives )
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall))


print(f1_m(test_target,bin_pred))

0.22660838683239995
