In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import tensorflow as tf
import tensorflow_io as tfio
import os
from tqdm import tqdm
import cv2
from PIL import Image
from joblib import Parallel, delayed
import gc
from PIL import Image as im
import random

In [2]:
import sys
!{sys.executable} -m pip install dicomsdl
#!pip install /kaggle/input/rsnapacks/dicomsdl-0.109.1-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
#import pydicom as dicom
import dicomsdl as dicom
import tensorflow as tf
import tensorflow_io as tfio
from PIL import Image as im
import os
from tqdm import tqdm
import cv2
from PIL import Image
from joblib import Parallel, delayed
import gc
import random

Collecting dicomsdl
  Downloading dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dicomsdl
Successfully installed dicomsdl-0.109.1
[0m

In [3]:
datasetPath = '/kaggle/input/rsna-breast-cancer-detection/train.csv'
imgPath = '/kaggle/input/rsna-breast-cancer-detection/train_images/'

def crop(sideName, imgName):
    """
    This function is used to crop the breast images. It takes two arguments.
    
    Input:-
    :sideName = Laterality of breast if it is right or left
    :imgName = Image pixel data of the DCM images
    
    Output:-
    :return = Output after cropping the image.
    
    """
    if sideName == 'L':
        colind=[]
        for r,row in enumerate(imgName):
            for c,col in enumerate(row):
                if col==0:
                    colind.append(c)
                    break
        crop_size = max(colind)
        imgName = imgName[0:512,0:crop_size]
        imgName = cv2.resize(imgName,(128,128))
        
    if sideName == 'R':
        colind=[]
        for r,row in enumerate(imgName):
            for c,col in enumerate(row):
                if col!=0:
                    colind.append(c)
                    break
        crop_size = min(colind)
        imgName = imgName[0:512,crop_size:512]
        imgName = cv2.resize(imgName,(128,128))
    
    return imgName    

    gc.collect()
    
def crop_reverse(sideName, imgName):
    
    """
    This function is used to crop the breast images but in the reverse order.
    Because the laterality is defined wrongly for some images. It takes two arguments.
    
    Input:-
    :sideName = Laterality of breast if it is right or left
    :imgName = Image pixel data of the DCM images
    
    Output:-
    :return = Output after cropping the image.
    
    """
    
    if sideName == 'R':
        colind=[]
        for r,row in enumerate(imgName):
            for c,col in enumerate(row):
                if col==0:
                    colind.append(c)
                    break
        crop_size = max(colind)
        imgName = imgName[0:512,0:crop_size]
        imgName = cv2.resize(imgName,(128,128))
        
    if sideName == 'L':
        colind=[]
        for r,row in enumerate(imgName):
            for c,col in enumerate(row):
                if col!=0:
                    colind.append(c)
                    break
        crop_size = min(colind)
        imgName = imgName[0:512,crop_size:512]
        imgName = cv2.resize(imgName,(128,128))
    
    return imgName    

    gc.collect()
    

def img_process(i,filename,sides):
    
    """
    This function is used to process the images which will be used for the training/test dataset. It takes three arguments.
    
    Input:-
    :i = Index of the image in the dataframe
    :filename = Path of the image
    :sides = List of all images' laterality
    
    Output:-
    :return = Output after cropping the image.
    
    """
    
    
    #ds = dicom.dcmread(filename)
    dsraw = dicom.open(filename)
    ds = dsraw.pixelData()
    
    ds = (ds - ds.min()) / (ds.max() - ds.min())
    if dsraw.PhotometricInterpretation == "MONOCHROME1":  
        ds = 1 - ds
    ds = (ds * 255).astype(np.uint8)

    
    #ds = cv2.normalize(ds, None, 0, 1.0, cv2.NORM_MINMAX, dtype=cv2.CV_32F)
    ds = cv2.resize(ds,(512,512))
    
    #ds = np.where(ds >= 0.999, 0,ds)
    
    try:
        ds = np.array(crop(sides[i], ds))   
    except:
        ds = np.array(crop_reverse(sides[i], ds))
    

    #train_data.loc[i,'img_data'] = [img_fin]
    #train_data.to_csv('/kaggle/working/training_img_data.csv') 
    return ds
    gc.collect()

def dcmToPix(datasetPath, imgPath):
    
    """
    This function is used to process all the images which will be used for the training/test dataset. It takes two arguments.
    
    Input:-
    :datasetPath = Path of the cancer dataset
    :imgPath = Path of the image dataset
   
    Output:-
    :return = Array of all the processed images
    
    """
    
    
    dataset = pd.read_csv(datasetPath)
    
    patient_ids = dataset['patient_id']
    image_ids = dataset['image_id']
    sides  = dataset['laterality']

    imgData = []

    for pi, ii, leng in zip(patient_ids, image_ids, range(len(patient_ids))):
        imgData.append(imgPath + str(pi) + '/' + str(ii) + '.dcm')

    dataset['img_data'] = " "
    
    result = Parallel(n_jobs=128)(\
    delayed(img_process)(i, fname, sides) for i, fname in zip(range(len(imgData)),tqdm(imgData))\
    )
    
    dataset['img_data'] = result
    dataset.to_pickle('imgData.pkl' )
    
    return result
    

In [4]:
with (open('/kaggle/input/output/imgData.pkl', "rb")) as openfile:
     imgData = pickle.load(openfile)

In [5]:
#datacancer = pd.read_csv(datasetPath)
#imgDataFrame = {'cancer':datacancer['cancer'][:50], 'img_data':imgData}
#imgData2 = pd.DataFrame(imgDataFrame)

#imgData=imgData2

In [6]:
imgdata_pos = imgData[imgData['cancer'] == 1]
imgdata_neg = imgData[imgData['cancer'] == 0]

imgdata_pos = imgdata_pos.sample(frac = 1)
imgdata_neg = imgdata_neg.sample(frac = 1)

imgdata_neg = imgdata_neg.sample(frac= 0.6)

frames = 9*[imgdata_pos]
frames.append(imgdata_neg)
imgdata_shuff = pd.concat(frames)
imgdata_shuff = imgdata_shuff.sample(frac=1)

In [7]:
print(imgdata_shuff['cancer'].value_counts())

0    32129
1    10422
Name: cancer, dtype: int64


In [8]:
def random_rotate(imgData):
    #thresh = 0.15
    
    imgData = imgData.reshape(128,128)
    
    #imgData = np.clip(imgData,thresh,1)
    imgData = imgData[25:105,25:105]
   
    imgData = im.fromarray(imgData)
    imgData = np.asarray(imgData.rotate(random.randint(-40, 40)))
    return imgData

In [9]:
imgDataList=[]
for j in tqdm(imgdata_shuff['img_data']):
    imgDataList.append(random_rotate(j))
imgdata_shuff['img_data'] = imgDataList

100%|██████████| 42551/42551 [00:07<00:00, 5947.32it/s]


In [10]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(imgdata_shuff, test_size=0.3, random_state=42, shuffle=True)



train_target = np.array(train['cancer'])
train_features=[]
for i in train['img_data']:
    i=np.array(i)
    train_features.append(cv2.cvtColor(i,cv2.COLOR_GRAY2RGB))
train_features=np.array(train_features)



#featureTransform = train_features.reshape(len(train_features), 6400)

#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#featureTransform = scaler.fit_transform(featureTransform)
#norm_features= []
#for i in range(len(train_features)):
#        norm_features.append(scaler.fit_transform(train_features[i]))
#train_features=np.array(norm_features)




#backTransform = featureTransform.reshape(len(train_features),80,80)
#train_features = backTransform.reshape(len(train_features),80,80,1)
train_features = train_features.reshape(len(train_features),80,80,3)





In [11]:
from keras.models import Sequential
from keras.layers import LeakyReLU
from keras.layers import Dense, Conv2D, Flatten,MaxPooling2D, Dropout, GlobalMaxPooling2D
#model = Sequential()
#model.add(Conv2D(64, 3, activation = "relu", input_shape = (80,80,1)))
#model.add(MaxPooling2D())
#model.add(LeakyReLU(alpha=0.1))
#model.add(Dropout(0.3))
#model.add(Conv2D(32, 3, activation = "relu"))
#model.add(MaxPooling2D())
#model.add(LeakyReLU(alpha=0.1))
#model.add(Conv2D(16, 3, activation = "relu"))
#model.add(MaxPooling2D())
#model.add(Dropout(0.3))
#model.add(Flatten())
#model.add(Dense(128, activation = 'relu')) 
#model.add(LeakyReLU(alpha=0.1))
#model.add(Dropout(0.3))
#model.add(Dense(1, activation = 'sigmoid'))




from keras import backend as K
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives )
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives )
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall))





#loss = tf.keras.losses.BinaryFocalCrossentropy(\
#    apply_class_balancing=True, gamma=5, from_logits=True,\
#    reduction=tf.keras.losses.Reduction.NONE)


#model.compile(optimizer =tf.keras.optimizers.Adam(learning_rate=0.0001),\
#              loss= 'binary_crossentropy', metrics=['accuracy'])



In [12]:
#from tensorflow import keras
#model = keras.models.load_model("/kaggle/input/pre-trained-model-of-breast-cancer/trained_model_breast_cancer3.h5")


In [13]:
from tensorflow.keras.applications import EfficientNetB2
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import LeakyReLU
from keras.layers import Dense, Conv2D, Flatten,MaxPooling2D, Dropout, GlobalMaxPooling2D


efficient_net = EfficientNetB2(\
    weights='imagenet',\
    input_shape=(80,80,3),\
    include_top=False,\
    pooling='max'\
)

model = Sequential()
model.add(efficient_net)
model.add(Dense(units = 120, activation='relu'))
#model.add(Dense(units = 120, activation = 'relu'))
model.add(Dense(units = 1, activation='sigmoid'))

model.compile(optimizer =tf.keras.optimizers.Adam(learning_rate=0.0001),\
              loss= 'binary_crossentropy', metrics=['accuracy'])

weights = {0:1, 1:3}
model.fit(train_features, train_target,  class_weight=weights,validation_split=0.3,batch_size = 64,epochs=50)




2023-01-26 15:44:10.167401: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb2_notop.h5


2023-01-26 15:44:16.193790: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f044bd25550>

In [14]:
model.save("/kaggle/working/trained_model_breast_cancer3.h5")



In [15]:
#from tensorflow import keras
#savedModel = keras.models.load_model("/kaggle/input/pre-trained-model-of-breast-cancer/trained_model_breast_cancer3.h5")



In [16]:
test_features=[]
for i in test['img_data']:
    i=np.array(i)
    test_features.append(cv2.cvtColor(i,cv2.COLOR_GRAY2RGB))
test_features=np.array(test_features)
 

    
#featureTransform = test_features.reshape(len(test_features), 6400)
#scaler = StandardScaler()
#featureTransform =scaler.fit_transform(featureTransform)
#norm_features= []
#for i in range(len(test_features)):
#    norm_features.append(scaler.fit_transform(test_features[i][0]))
#test_features=np.array(norm_features)


#backTransform = featureTransform.reshape(len(test_features),80,80)
test_features = test_features.reshape(len(test_features),80,80,3)



test_target = np.array(test['cancer'])

In [17]:
pred = model.predict(test_features)
bin_pred = []
for i in pred:
    if i>=0.5:
        bin_pred.append(1)
    else:
        bin_pred.append(0)
bin_pred = np.array(bin_pred)


In [18]:
import sklearn
acc = sklearn.metrics.accuracy_score(test_target, bin_pred)
print(acc)

countzero=0
countone=0
countzerot=0
countonet=0
for i, j in zip(test_target, bin_pred):
    if i==0 and j==0:
        countzero+=1
    if i==1 and j==1:
        countone+=1
    if i==1:
        countonet+=1
    if i==0:
        countzerot+=1
print(countzero/countzerot,countone/countonet)

0.43388688704370987
0.35098100280286515 0.6887966804979253


In [19]:

def recall_m(y_true, y_pred):
    true_positives = np.sum(np.round(np.clip(y_true * y_pred, 0, 1)))
    possible_positives = np.sum(np.round(np.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives )
    return recall

def precision_m(y_true, y_pred):
    true_positives = np.sum(np.round(np.clip(y_true * y_pred, 0, 1)))
    predicted_positives = np.sum(np.round(np.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives )
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall))


print(f1_m(test_target,bin_pred))

0.3739062635363424


In [20]:
datasetPath = '/kaggle/input/rsna-breast-cancer-detection/test.csv'
imgPath = '/kaggle/input/rsna-breast-cancer-detection/test_images/'

testImgData = dcmToPix(datasetPath, imgPath)

 75%|███████▌  | 3/4 [00:00<00:00,  4.56it/s]


In [21]:
datacancer = pd.read_csv(datasetPath)
imgDataFrame = {'img_data':testImgData}
imgData2 = pd.DataFrame(imgDataFrame)
testImgData=imgData2
imgdata_shuff = testImgData.sample(frac=1)

In [22]:
imgDataList=[]
for j in tqdm(imgdata_shuff['img_data']):
    imgDataList.append(random_rotate(j))
imgdata_shuff['img_data'] = imgDataList

100%|██████████| 4/4 [00:00<00:00, 1877.91it/s]


In [23]:
test=imgdata_shuff


test_features=[]
for i in test['img_data']:
    i=np.array(i)
    test_features.append(cv2.cvtColor(i,cv2.COLOR_GRAY2RGB))
test_features=np.array(test_features)
 
#featureTransform = test_features.reshape(len(test_features), 6400)
#scaler = StandardScaler()
#featureTransform =scaler.fit_transform(featureTransform)
#norm_features= []
#for i in range(len(test_features)):
#    norm_features.append(scaler.fit_transform(test_features[i][0]))
#test_features=np.array(norm_features)


#backTransform = featureTransform.reshape(len(test_features),80,80)
test_features = test_features.reshape(len(test_features),80,80,3)




In [24]:
pred = model.predict(test_features)
bin_pred = []
for i in pred:
    if i>=0.5:
        bin_pred.append(1)
    else:
        bin_pred.append(0)
bin_pred = np.array(bin_pred)
print(bin_pred)

[1 0 0 1]


In [25]:
testData = pd.read_csv('/kaggle/input/rsna-breast-cancer-detection/test.csv')

submissionFrame={'prediction_id':testData['prediction_id'],'cancer':bin_pred}
submission = pd.DataFrame(submissionFrame)
print(submission)

  prediction_id  cancer
0       10008_L       1
1       10008_L       0
2       10008_R       0
3       10008_R       1
