In [2]:
pip install pydicom

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydicom
  Downloading pydicom-2.3.1-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-2.3.1


In [3]:
# DATA MANIPULATION
import numpy as np  
import pandas as pd 

# VISUALIZATION
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# MISC
import os
import cv2
import warnings
warnings.filterwarnings('ignore')
# import dicom to read the images
import pydicom as dicom

In [4]:
# Load the per-patient class labels from the Kaggle input directory.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so the
# /kaggle/input path was not available in the environment of the last run.
df1 = pd.read_csv('/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_detailed_class_info.csv')

FileNotFoundError: ignored

In [None]:
# Class distribution before duplicate rows are dropped.
df1['class'].value_counts()

In [None]:
# Remove fully duplicated rows; rebinding keeps the cell free of in-place mutation.
df1 = df1.drop_duplicates()

In [None]:
# Column dtypes and non-null counts of the label table.
df1.info()

In [None]:
# Class distribution of the label table at this point in the notebook.
df1['class'].value_counts()

In [None]:
# Bar chart of the number of rows per class in df1.
sns.countplot(x='class', data=df1)

In [None]:
# Distinct class labels, in order of first appearance; reused by the sampling loops below.
classes = df1['class'].unique()
print(classes)

In [None]:
# Balance the dataset: draw the same number of rows (5000) from every class.
# The parts are collected in a list and concatenated once, because
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
import random
np.random.seed(10)  # DataFrame.sample falls back to NumPy's global RNG, so this fixes the draw
balanced_parts = []
for cl in classes:
    df2 = df1[df1['class']==cl].sample(5000)
    print(df2.shape)
    balanced_parts.append(df2)
# verify_integrity raises if the same index label ends up in two parts.
df = pd.concat(balanced_parts, verify_integrity=True)
print(df.shape)
df = df.sort_index()
df.head()
    

In [None]:
# Class distribution of the balanced frame.
df['class'].value_counts()

In [None]:
# Bar chart of the number of rows per class in the balanced frame.
sns.countplot(x='class',data=df)

In [None]:
# Rows of df1 that were not drawn into the balanced frame `df` form the pool
# from which the test set is sampled.
in_balanced = df1.isin(df)
df_t = df1[~in_balanced].dropna()
df_t['class'].value_counts()

In [None]:
# Build the test set: 1000 rows of each class drawn from the held-out pool df_t.
# The parts are concatenated once; DataFrame.append was removed in pandas 2.0.
import random
np.random.seed(10)  # DataFrame.sample falls back to NumPy's global RNG, so this fixes the draw
test_parts = []
for cl in classes:
    df2 = df_t[df_t['class']==cl].sample(1000)
    print(df2.shape)
    test_parts.append(df2)
# verify_integrity raises if the same index label ends up in two parts.
df_test = pd.concat(test_parts, verify_integrity=True)
print(df_test.shape)
df_test = df_test.sort_index()
df_test.head()

In [None]:
# Patient ids act as the "features" until the images are loaded; the class is the label.
X, Y = df['patientId'], df['class']
x_test, y_test = df_test['patientId'], df_test['class']

In [None]:
# Cut the balanced data into four equally sized, class-stratified quarters
# (x1..x4 / y1..y4) so that only one quarter of the images is in memory at a time.
# The model is trained on one quarter, saved, reloaded and trained on the next.
from sklearn.model_selection import train_test_split

def _stratified_halves(features, labels):
    """Split features/labels into two class-stratified halves with a fixed seed."""
    return train_test_split(features, labels, stratify = labels, test_size = 0.5, random_state =42)

x_train, x_val, y_train, y_val = _stratified_halves(X, Y)
x1, x2, y1, y2 = _stratified_halves(x_train, y_train)
x3, x4, y3, y4 = _stratified_halves(x_val, y_val)

In [None]:
# Sanity check: every quarter should report matching id/label lengths.
for ids_part, labels_part in ((x1, y1), (x2, y2), (x3, y3), (x4, y4)):
    print(ids_part.shape,labels_part.shape)

In [None]:
# Drop the intermediate frames and half-splits; only the quarters and the test set are needed from here on.
del X, Y, df1, df2, df_t, df, x_train, x_val, y_train, y_val

In [None]:
# Force a garbage-collection pass after deleting the intermediates above.
import gc
gc.collect()

In [None]:
# The x? datasets currently hold patientIds; replace them with the corresponding image pixel data.
# The functions below do this.
def populate_X_images(x):
    """Read the training DICOM image of every patient id in `x`.

    Parameters
    ----------
    x : iterable
        Patient ids; each is turned into the file name "<id>.dcm" inside
        the 'stage_2_train_images' folder.

    Returns
    -------
    pandas.DataFrame
        One column, 'image', with one row per id (in iteration order)
        holding that file's ``pixel_array``.

    Notes
    -----
    Changes the process working directory to the dataset folder, as before.
    """
    os.chdir('/kaggle/input/rsna-pneumonia-detection-challenge/')
    file_path='stage_2_train_images'
    pixel_arrays = [
        dicom.dcmread(os.path.join(file_path, str(Id) + ".dcm")).pixel_array
        for Id in x
    ]
    # Build the column once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every row. An explicit object array keeps each
    # image as a single cell value.
    image_column = np.empty(len(pixel_arrays), dtype=object)
    for pos, pixels in enumerate(pixel_arrays):
        image_column[pos] = pixels
    return pd.DataFrame({'image': image_column})

def populate_X_test_images():
    """Read every DICOM file found in the 'stage_2_test_images' folder.

    Returns
    -------
    pandas.DataFrame
        One column, 'image', with one row per file (in ``os.listdir`` order)
        holding that file's ``pixel_array``.

    Notes
    -----
    Changes the process working directory to the dataset folder, as before.
    """
    os.chdir('/kaggle/input/rsna-pneumonia-detection-challenge/')
    file_path='stage_2_test_images'
    pixel_arrays = [
        dicom.dcmread(os.path.join(file_path, fname)).pixel_array
        for fname in os.listdir(file_path)
    ]
    # Build the column once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every row. An explicit object array keeps each
    # image as a single cell value.
    image_column = np.empty(len(pixel_arrays), dtype=object)
    for pos, pixels in enumerate(pixel_arrays):
        image_column[pos] = pixels
    return pd.DataFrame({'image': image_column})

# Define a new function to resize and reshape according to our needs
import cv2
def resize_reshape_x(x, shape: tuple):
    """Resize every image in ``x['image']`` and replicate it into three channels.

    Each image is passed to ``cv2.resize`` with `shape`, then stacked three
    times along a new last axis. Returns a NumPy array built from the list
    of converted images.
    """
    def _to_three_channels(raw):
        scaled = cv2.resize(raw, shape)
        return np.stack([scaled, scaled, scaled], axis=-1)

    return np.array([_to_three_channels(raw) for raw in x['image']])

# Visualizing the images
def plot_sample(x,y):
    """Show three randomly picked images from `x` in one row.

    Each panel is titled with the entry of `y` at the same position
    (``y.iloc[i]``).
    """
    plt.figure(figsize=(20,20))
    n_panels = 3
    picks = random.sample(range(0,len(x)),3)
    for panel, idx in enumerate(picks, start=1):
        plt.subplot(1,n_panels,panel)
        plt.imshow(x[idx])
        plt.title("{}".format(y.iloc[idx]))
# pre-process data
def pre_process(x_tr, x_val, y_tr, y_val):
    """Scale the image arrays and one-hot encode the labels.

    Both image inputs are divided by 255 and cast to float16; both label
    inputs go through ``pd.get_dummies``. Returns the four results in the
    order (x_tr, x_val, y_tr, y_val).
    """
    scaled_tr, scaled_val = ((imgs / 255).astype('float16') for imgs in (x_tr, x_val))
    onehot_tr, onehot_val = (pd.get_dummies(labels) for labels in (y_tr, y_val))
    return scaled_tr, scaled_val, onehot_tr, onehot_val

def pre_process_test(x_test, y_test):
    """Scale test images by 1/255 (as float16) and one-hot encode the test labels."""
    scaled = x_test / 255
    return scaled.astype('float16'), pd.get_dummies(y_test)

# Loading the images, replacing the patientIds. Also resize them to required,shape
def load_image(x):
    """Load the training images for the patient ids in `x`, resized to 224x224.

    Reads the images with ``populate_X_images``, prints a progress line, and
    returns the result of ``resize_reshape_x`` for the target size (224, 224).
    """
    target_size = (224,224)
    raw_images = populate_X_images(x)
    print('Populated Images\n')
    return resize_reshape_x(raw_images, target_size)

In [None]:
def create_model():
    """Build and compile the Xception-based 3-class image classifier.

    An ImageNet-initialised Xception backbone (no top) on a 224x224x3 input
    named 'inputx' feeds a head of max-pooling, flatten, and three
    Dense(relu) -> BatchNormalization -> Dropout(0.5) stages of width
    512, 256 and 128, followed by a 3-way softmax layer.

    Returns
    -------
    tensorflow.keras.Model
        Compiled with Adam (learning rate 1e-5), categorical cross-entropy
        loss and the accuracy metric.
    """
    import tensorflow as tf
    from tensorflow.keras.applications import Xception
    from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, Flatten, MaxPooling2D
    from tensorflow.keras.models import Model

    input_x = tf.keras.Input(shape=(224,224,3),name='inputx')
    base_x_model = Xception(input_tensor=input_x,input_shape=(224,224,3),
                            weights='imagenet',
                            include_top=False)
    # Freeze the backbone except for its last 11 layers, which stay trainable.
    for layer in base_x_model.layers[0:-11]:
        layer.trainable = False

    x = base_x_model.output
    x = MaxPooling2D()(x)
    x = Flatten()(x)
    x = Dropout(rate=0.5)(x)
    # Layers are created in the same order as the former unrolled stack, so
    # auto-generated layer names and weight ordering are unchanged.
    for units in (512, 256, 128):
        x = Dense(units, activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(rate=0.5)(x)
    preds = Dense(3, activation='softmax')(x) # final layer with softmax activation

    merged_model = Model([base_x_model.input],
                           outputs=preds)

    # Small learning rate for fine-tuning the pretrained backbone.
    opt = tf.keras.optimizers.Adam(learning_rate=0.00001)
    merged_model.compile(optimizer=opt,
                         loss='categorical_crossentropy',
                         metrics=['accuracy'])

    return merged_model

In [None]:
# function to plot Accuracy and loss plots
def plot_hist(tr_hist):
    """Plot accuracy and loss curves from a training history object.

    Reads 'accuracy', 'val_accuracy', 'loss' and 'val_loss' from
    ``tr_hist.history`` and draws the accuracy pair and the loss pair in two
    separate figures.
    """
    metrics = tr_hist.history
    accuracy, val_accuracy = metrics['accuracy'], metrics['val_accuracy']
    loss, val_loss = metrics['loss'], metrics['val_loss']
    epochs = range(len(accuracy)) # one x position per recorded epoch

    for series, label in ((accuracy, 'training accuracy'),
                          (val_accuracy, 'validation accuracy')):
        plt.plot(epochs, series, label = label)
    plt.title ('Training and validation accuracy')
    plt.legend(loc = 'lower right')
    plt.figure()

    for series, label in ((loss, 'training loss'),
                          (val_loss, 'validation loss')):
        plt.plot(epochs, series, label = label)
    plt.legend(loc = 'upper right')
    plt.title ('Training and validation loss')

def plot_hist_simple(tr_hist):
    """Plot every recorded training metric in a single chart.

    Accepts either an object exposing a ``history`` dict (the same kind of
    argument ``plot_hist`` takes) or the plain metrics dict itself, as
    stored by ``save_model_and_hist``.
    """
    history = getattr(tr_hist, 'history', None)
    # Fall back to the argument itself when it carries no `history` dict.
    records = history if isinstance(history, dict) else tr_hist
    pd.DataFrame(records).plot()

In [None]:
# Saving model and training history and its weights for future use
import pickle
def save_model_and_hist(model, hist, model_file, model_wght_file, hist_file):
    """Persist a model, its weights and its training history.

    Calls ``model.save(model_file)`` and ``model.save_weights(model_wght_file)``,
    then pickles ``hist.history`` into `hist_file`.
    """
    model.save(model_file)
    model.save_weights(model_wght_file)
    with open(hist_file, 'wb') as sink:
        sink.write(pickle.dumps(hist.history))

# Load hist and plot
def load_and_plot_hist(model_name: str, i: int):
    """Unpickle the history saved for iteration `i` of `model_name` and plot it.

    The file is read from /kaggle/working/<model_name>_hist<i>.
    """
    hist_fname = "".join(["/kaggle/working/", model_name, "_hist", str(i)])
    with open(hist_fname, 'rb') as source:
        metrics = pickle.load(source)
    pd.DataFrame(metrics).plot()

# Load and return model
from tensorflow.keras.models import load_model
def load_and_return_model(modelname: str, j: int):
    """Load and return the model saved as /kaggle/working/<modelname><j>.h5."""
    return load_model('/kaggle/working/'+ modelname + str(j) + ".h5")

In [None]:
# Force a garbage-collection pass before the training cells start.
import gc
gc.collect()

In [None]:
import tensorflow.keras.callbacks as cb
def run_save_one_iteration(x_img, y, model, bsz, epo, modelname: str, j: int):
    """Train `model` on one image subset and save the model, weights and history.

    The subset is split 80/20 (stratified on `y`, fixed seed) into training
    and validation parts, run through ``pre_process``, and fitted with early
    stopping on 'val_loss' (patience 10, best weights restored).

    Parameters
    ----------
    x_img : image array for this subset.
    y : class labels aligned with `x_img`.
    model : compiled Keras model whose input is named 'inputx'.
    bsz : batch size passed to ``model.fit``.
    epo : maximum number of epochs.
    modelname, j : used to build the output paths
        /kaggle/working/<modelname><j>.h5,
        /kaggle/working/<modelname>_weights<j>.h5 and
        /kaggle/working/<modelname>_hist<j>.
    """
    x_train, x_val, y_train, y_val = train_test_split(x_img, y, stratify = y, test_size = 0.2, random_state =42)
    x_train, x_val, y_train, y_val = pre_process(x_train, x_val, y_train, y_val)
    print(x_train.shape,y_train.shape,x_val.shape,y_val.shape)
    callback = cb.EarlyStopping(monitor='val_loss', patience = 10,restore_best_weights=True)
    tr_history = model.fit(x=[x_train],
                           y=y_train,
                           batch_size=bsz,
                           epochs=epo,
                           validation_data=(({'inputx': x_val}) , y_val),
                           callbacks=[callback]  # fit documents a list of callbacks
                           )
    model_fname = '/kaggle/working/' + modelname + str(j) + ".h5"
    model_wname = '/kaggle/working/' + modelname + "_weights" + str(j) + ".h5"
    model_hname= '/kaggle/working/' + modelname + "_hist" + str(j)
    save_model_and_hist(model, tr_history, model_fname, model_wname, model_hname)
    # A pasted 'densenet' re-save snippet used to follow here. It referenced the
    # undefined names model_densenet / tr_history1 and raised NameError after
    # every successful save, so it has been removed.

In [None]:
merged_model = create_model()
# Warm-start from the weights of a previous complete run only when they exist.
# That file is written by the 4th training iteration further down, so on a
# fresh kernel it is not there yet and training starts from the ImageNet backbone.
resume_weights = '/kaggle/working/XcepRes_weights4.h5'
if os.path.exists(resume_weights):
    merged_model.load_weights(resume_weights)
else:
    print('No previous weights at', resume_weights, '- starting from ImageNet initialisation')

In [None]:
# Iteration 1: load the first quarter of the images, then train and save.
x_img = load_image(x1)
gc.collect()
run_save_one_iteration(x_img, y1, merged_model, bsz=18, epo=50, modelname='XcepRes', j=1)

In [None]:
# Plot iteration 1's history, release the images and the in-memory model,
# then reload the saved model for the next iteration.
load_and_plot_hist(model_name='XcepRes', i=1)
del x_img, merged_model
gc.collect()
model = load_and_return_model(modelname='XcepRes', j=1)

In [None]:
# Iteration 2: second quarter of the images; train, save, and plot the history.
x_img = load_image(x2)
gc.collect()
run_save_one_iteration(x_img, y2, model, bsz=18, epo=50, modelname='XcepRes', j=2)
load_and_plot_hist(model_name='XcepRes', i=2)

In [None]:
# Release iteration 2's images and model, then reload the saved model.
del x_img, model
gc.collect()
model = load_and_return_model(modelname='XcepRes', j=2)

In [None]:
# Iteration 3: third quarter of the images; train, save, and plot the history.
gc.collect()
x_img = load_image(x3)
gc.collect()
run_save_one_iteration(x_img, y3, model, bsz=18, epo=50, modelname='XcepRes', j=3)
load_and_plot_hist(model_name='XcepRes', i=3)

In [None]:
# Release iteration 3's images and model, then reload the saved model.
del x_img, model
gc.collect()
model = load_and_return_model(modelname='XcepRes', j=3)

In [None]:
# Iteration 4: last quarter of the images; train, save, and plot the history.
gc.collect()
x_img = load_image(x4)
gc.collect()
run_save_one_iteration(x_img, y4, model, bsz=18, epo=50, modelname='XcepRes', j=4)
load_and_plot_hist(model_name='XcepRes', i=4)

In [None]:
# Release iteration 4's images and model, then reload the final saved model for evaluation.
del x_img, model
gc.collect()
model = load_and_return_model(modelname='XcepRes', j=4)

In [None]:
# Evaluate on the held-out test set: load its images, pre-process, predict.
gc.collect()
# load_image reads from 'stage_2_train_images' (via populate_X_images).
x_img = load_image(x_test)
gc.collect()
# NOTE(review): `classes` is reassigned from y_test.columns in a later cell.
classes = y_test.unique()
# NOTE(review): x_test / y_test are rebound here from ids / labels to the scaled
# image array / one-hot frame, so this cell cannot be re-run without re-creating
# them. Later cells rely on the rebound y_test (its columns and rows).
x_test, y_test = pre_process_test(x_img, y_test)
gc.collect()
y_pred = model.predict([x_test])

In [None]:
# Predicted class index per sample = position of the largest softmax output.
y_pred_label = [scores.argmax() for scores in y_pred]

In [None]:
# Class names in one-hot column order, i.e. the order the integer labels below refer to.
classes = list(y_test.columns)
print(classes)

In [None]:
# Turn the one-hot test labels back into integer class indices (column positions).
# Vectorised replacement for the former iterrows loop, whose `row[i]` relied on
# positional integer indexing of a label-indexed Series (deprecated in pandas 2.x).
num_classes = 3
onehot = y_test.to_numpy()[:, :num_classes]
# np.nonzero scans row by row, so the column indices come out in row order,
# the same sequence the loop produced.
y_test_label = np.nonzero(onehot == 1)[1].tolist()
In [None]:
# Rows are true classes, columns are predicted classes.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test_label, y_pred=y_pred_label)

In [None]:
# Print the class names followed by the confusion matrix.
print(classes)
print(cm)

In [None]:
# Overall test accuracy = correctly classified samples (the diagonal of the
# confusion matrix) divided by all samples. It is computed from `cm` so it
# always matches the current run instead of hand-copied numbers.
# Previously recorded runs: (710+439+843)/3000 = 66.4, (697+428+834)/3000,
# (695+454+865)/3000.
accuracy = np.trace(cm) / cm.sum()
print("Test Accuracy: %.2f" % (accuracy*100))

In [None]:
# List the files written to the working directory.
import os
os.listdir('/kaggle/working/')

In [None]:
# Bundle every file whose name starts with "Xcep" in the working directory into
# file1.zip (shell command via the IPython `!` escape).
os.chdir('/kaggle/working/')
!zip -r file1.zip /kaggle/working/Xcep*

In [None]:
from IPython.display import FileLink
# The link has to be the cell's last expression to be rendered.
# A stale scratch snippet that followed it was removed: it called
# save_model_and_hist(model, tr_history1, ...) with tr_history1 = [] and j = 1,
# which overwrote the iteration-1 'XcepRes1' model and weights files and then
# failed, because a list has no `.history`. run_save_one_iteration already
# saves the model, weights and history of every iteration.
FileLink(r'file1.zip')